- Central config with logging on startup - Hot reload support for GPU worker (docker-compose.gpu-dev.yml) - Quick deploy script (scripts/gpu-dev-deploy.sh) - GPU/CPU frame compatibility tests - CI/CD pipeline for GPU worker (.gitea/workflows/gpu-worker.yml) - Standardize GPU_PERSIST default to 0 across all modules Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
221 lines
6.7 KiB
Python
221 lines
6.7 KiB
Python
"""
|
|
GPU-Accelerated Blending Primitives Library
|
|
|
|
Uses CuPy for CUDA-accelerated image blending and compositing.
|
|
Keeps frames on GPU when STREAMING_GPU_PERSIST=1 for maximum performance.
|
|
"""
|
|
import os
|
|
import numpy as np
|
|
|
|
# CuPy is an optional dependency. When the import fails, ``cp`` is aliased to
# NumPy so that later references to ``cp`` still resolve on CPU-only hosts.
try:
    import cupy as cp
except ImportError:
    cp = np
    GPU_AVAILABLE = False
    print("[blending_gpu] CuPy not available, using CPU fallback")
else:
    GPU_AVAILABLE = True
    print("[blending_gpu] CuPy GPU acceleration enabled")

# GPU persistence mode: results are left on the GPU between operations.
# Only the exact value "1" enables it; an unset variable counts as "0".
GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "0") == "1"
if GPU_PERSIST and GPU_AVAILABLE:
    print("[blending_gpu] GPU persistence enabled - frames stay on GPU")
|
|
|
|
|
|
def _to_gpu(img):
    """Return ``img`` as a CuPy array when CuPy is available.

    Without CuPy, or when ``img`` is already a CuPy array, the input is
    returned unchanged.
    """
    if not GPU_AVAILABLE:
        return img
    if isinstance(img, cp.ndarray):
        # Already a CuPy array; nothing to convert.
        return img
    return cp.asarray(img)
|
|
|
|
|
|
def _to_cpu(img):
    """Convert a CuPy array back to NumPy unless GPU persistence is enabled.

    The input is returned unchanged when ``GPU_PERSIST`` is set, when CuPy is
    unavailable, or when ``img`` is not a CuPy array.
    """
    if GPU_PERSIST or not GPU_AVAILABLE:
        return img
    if isinstance(img, cp.ndarray):
        return cp.asnumpy(img)
    return img
|
|
|
|
|
|
def _get_xp(img):
    """Return the array module matching ``img``: ``cp`` for CuPy arrays, else ``np``."""
    on_gpu = GPU_AVAILABLE and isinstance(img, cp.ndarray)
    return cp if on_gpu else np
|
|
|
|
|
|
def prim_blend_images(a, b, alpha):
    """Linearly mix two images as ``a * (1 - alpha) + b * alpha``.

    ``alpha`` is converted to ``float`` and clamped to ``[0.0, 1.0]``.
    The mix is computed in float32 on the CuPy path and in Python ``float``
    precision on the NumPy path, then cast to ``uint8``.
    """
    weight = max(0.0, min(1.0, float(alpha)))
    keep = 1 - weight

    if not GPU_AVAILABLE:
        mixed = a.astype(float) * keep + b.astype(float) * weight
        return mixed.astype(np.uint8)

    first = _to_gpu(a).astype(cp.float32)
    second = _to_gpu(b).astype(cp.float32)
    mixed = (first * keep + second * weight).astype(cp.uint8)
    return _to_cpu(mixed)
|
|
|
|
|
|
def prim_blend_mode(a, b, mode):
    """Combine two images with a named Photoshop-style blend mode.

    Both inputs are divided by 255 before blending; the blended result is
    multiplied by 255 and cast to ``uint8``. Recognised modes are
    "multiply", "screen", "overlay", "soft-light", "hard-light",
    "color-dodge", "color-burn", "difference", "exclusion", "add",
    "subtract", "darken" and "lighten". Any other value returns ``b``
    (normal blending).
    """
    if GPU_AVAILABLE:
        base = _to_gpu(a).astype(cp.float32) / 255
        top = _to_gpu(b).astype(cp.float32) / 255
        xp = cp
    else:
        base = a.astype(float) / 255
        top = b.astype(float) / 255
        xp = np

    blended = _blend_normalized(xp, base, top, mode)
    return _to_cpu((blended * 255).astype(xp.uint8))


def _blend_normalized(xp, base, top, mode):
    """Apply blend ``mode`` to two arrays already divided by 255, using module ``xp``."""
    if mode == "multiply":
        return base * top
    if mode == "screen":
        return 1 - (1 - base) * (1 - top)
    if mode == "overlay":
        # Branch is chosen by the base layer.
        return xp.where(base < 0.5,
                        2 * base * top,
                        1 - 2 * (1 - base) * (1 - top))
    if mode == "soft-light":
        return xp.where(top < 0.5,
                        base - (1 - 2 * top) * base * (1 - base),
                        base + (2 * top - 1) * (xp.sqrt(base) - base))
    if mode == "hard-light":
        # Same formulas as "overlay", but the branch is chosen by the top layer.
        return xp.where(top < 0.5,
                        2 * base * top,
                        1 - 2 * (1 - base) * (1 - top))
    if mode == "color-dodge":
        # The 0.001 offset keeps the divisor non-zero.
        return xp.clip(base / (1 - top + 0.001), 0, 1)
    if mode == "color-burn":
        return 1 - xp.clip((1 - base) / (top + 0.001), 0, 1)
    if mode == "difference":
        return xp.abs(base - top)
    if mode == "exclusion":
        return base + top - 2 * base * top
    if mode == "add":
        return xp.clip(base + top, 0, 1)
    if mode == "subtract":
        return xp.clip(base - top, 0, 1)
    if mode == "darken":
        return xp.minimum(base, top)
    if mode == "lighten":
        return xp.maximum(base, top)
    # Unknown mode: fall back to normal blending (top layer only).
    return top
|
|
|
|
|
|
def prim_mask(img, mask_img):
    """Apply a grayscale mask to an image (white=opaque, black=transparent).

    The mask is divided by 255 and multiplied into ``img``; the product is
    cast to ``uint8``. When ``mask_img`` has three dimensions only its first
    channel is used.

    A trailing channel axis is added to the mask only when ``img`` has more
    than two dimensions, so a 2-D (grayscale) ``img`` is masked element-wise
    rather than being broadcast against an ``(H, W, 1)`` mask.

    Returns the masked image, passed through ``_to_cpu``.
    """
    if GPU_AVAILABLE:
        xp, ftype = cp, cp.float32
        img_arr = _to_gpu(img)
        mask_arr = _to_gpu(mask_img)
    else:
        xp, ftype = np, float
        img_arr, mask_arr = img, mask_img

    if mask_arr.ndim == 3:
        # Multi-channel mask: use the first channel only.
        mask_arr = mask_arr[:, :, 0]
    weights = mask_arr.astype(ftype) / 255

    if img_arr.ndim > 2:
        # Broadcast the per-pixel weight across the image's channel axis.
        weights = weights[:, :, xp.newaxis]

    masked = (img_arr.astype(ftype) * weights).astype(xp.uint8)
    return _to_cpu(masked)
|
|
|
|
|
|
def prim_alpha_composite(base, overlay, alpha_channel):
    """Composite ``overlay`` onto ``base`` using a per-pixel alpha channel.

    Computes ``base * (1 - alpha) + overlay * alpha`` where ``alpha`` is
    ``alpha_channel`` divided by 255, then casts to ``uint8``. When
    ``alpha_channel`` has three dimensions only its first channel is used.

    A trailing channel axis is added to the alpha only when ``base`` or
    ``overlay`` has more than two dimensions, so two 2-D (grayscale) images
    are composited element-wise rather than being broadcast against an
    ``(H, W, 1)`` alpha.

    Returns the composited image, passed through ``_to_cpu``.
    """
    if GPU_AVAILABLE:
        xp, ftype = cp, cp.float32
        base_arr = _to_gpu(base)
        overlay_arr = _to_gpu(overlay)
        alpha_arr = _to_gpu(alpha_channel)
    else:
        xp, ftype = np, float
        base_arr, overlay_arr, alpha_arr = base, overlay, alpha_channel

    if alpha_arr.ndim == 3:
        # Multi-channel alpha: use the first channel only.
        alpha_arr = alpha_arr[:, :, 0]
    alpha = alpha_arr.astype(ftype) / 255

    if base_arr.ndim > 2 or overlay_arr.ndim > 2:
        # Broadcast the per-pixel alpha across the channel axis.
        alpha = alpha[:, :, xp.newaxis]

    result = base_arr.astype(ftype) * (1 - alpha) + overlay_arr.astype(ftype) * alpha
    return _to_cpu(result.astype(xp.uint8))
|
|
|
|
|
|
def prim_overlay(base, overlay, x, y, alpha=1.0):
    """Overlay an image onto a copy of ``base`` at position (x, y).

    ``x`` and ``y`` are converted to ``int`` and give the column/row of the
    overlay's top-left corner in ``base``; they may be negative or extend
    past the edges, in which case the overlay is clipped to ``base``.
    The overlapping region is mixed as ``dst * (1 - alpha) + src * alpha``
    and cast to ``uint8``.

    A scalar ``alpha`` is clamped to ``[0.0, 1.0]`` (as ``prim_blend_images``
    does) so the mix cannot leave the 0..255 range before the ``uint8`` cast.
    A non-scalar ``alpha`` is used as given.

    Returns the composited copy, passed through ``_to_cpu``; ``base`` itself
    is not modified.
    """
    if np.isscalar(alpha):
        alpha = max(0.0, min(1.0, float(alpha)))

    if GPU_AVAILABLE:
        xp, ftype = cp, cp.float32
        base_arr = _to_gpu(base)
        overlay_arr = _to_gpu(overlay)
    else:
        xp, ftype = np, float
        base_arr, overlay_arr = base, overlay

    result = base_arr.copy()

    x, y = int(x), int(y)
    oh, ow = overlay_arr.shape[:2]
    bh, bw = base_arr.shape[:2]

    # Source window inside the overlay after clipping to the base bounds.
    sx1, sy1 = max(0, -x), max(0, -y)
    sx2, sy2 = min(ow, bw - x), min(oh, bh - y)
    if sx2 <= sx1 or sy2 <= sy1:
        # No overlap: return the unmodified copy.
        return _to_cpu(result)

    # Matching destination window inside the base.
    dx1, dy1 = max(0, x), max(0, y)
    rows = slice(dy1, dy1 + (sy2 - sy1))
    cols = slice(dx1, dx1 + (sx2 - sx1))

    src = overlay_arr[sy1:sy2, sx1:sx2]
    dst = result[rows, cols]
    blended = dst.astype(ftype) * (1 - alpha) + src.astype(ftype) * alpha
    result[rows, cols] = blended.astype(xp.uint8)

    return _to_cpu(result)
|
|
|
|
|
|
# Registry mapping primitive names to their implementations.
PRIMITIVES = {
    # Whole-image blending
    "blend-images": prim_blend_images,
    "blend-mode": prim_blend_mode,
    # Mask / alpha-channel compositing
    "mask": prim_mask,
    "alpha-composite": prim_alpha_composite,
    # Positioned overlay
    "overlay": prim_overlay,
}
|