Files
celery/sexp_effects/primitive_libs/geometry_gpu.py
giles fe6730ce72
Some checks are pending
GPU Worker CI/CD / test (push) Waiting to run
GPU Worker CI/CD / deploy (push) Blocked by required conditions
Add dev infrastructure improvements
- Central config with logging on startup
- Hot reload support for GPU worker (docker-compose.gpu-dev.yml)
- Quick deploy script (scripts/gpu-dev-deploy.sh)
- GPU/CPU frame compatibility tests
- CI/CD pipeline for GPU worker (.gitea/workflows/gpu-worker.yml)
- Standardize GPU_PERSIST default to 0 across all modules

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 21:56:40 +00:00

410 lines
12 KiB
Python

"""
GPU-Accelerated Geometry Primitives Library
Uses CuPy for CUDA-accelerated image transforms.
Falls back to CPU if GPU unavailable.
Performance Mode:
- Set STREAMING_GPU_PERSIST=1 to keep frames on GPU between operations
- This dramatically improves performance by avoiding CPU<->GPU transfers
- Frames only transfer to CPU at final output
"""
import os
import numpy as np
# Optional CuPy backend. When CuPy (or cupyx.scipy.ndimage) cannot be
# imported, ``cp`` is aliased to NumPy so that ``cp.ndarray`` remains a valid
# name, and GPU_AVAILABLE is cleared so every primitive takes its CPU branch.
# NOTE(review): only ImportError is handled here; a host where CuPy imports
# but no CUDA device is usable would still report GPU_AVAILABLE = True.
try:
    import cupy as cp
    from cupyx.scipy import ndimage as cpndimage
except ImportError:
    GPU_AVAILABLE = False
    cp = np
    print("[geometry_gpu] CuPy not available, using CPU fallback")
else:
    GPU_AVAILABLE = True
    print("[geometry_gpu] CuPy GPU acceleration enabled")
# Opt-in persistence switch: enabled only when the environment variable
# STREAMING_GPU_PERSIST is exactly "1" (default "0"). While enabled, _to_cpu()
# hands results back as GPU arrays instead of copying them to host memory.
GPU_PERSIST = "1" == os.environ.get("STREAMING_GPU_PERSIST", "0")
if GPU_PERSIST and GPU_AVAILABLE:
    print("[geometry_gpu] GPU persistence enabled - frames stay on GPU")
def _to_gpu(img):
    """Return *img* as a CuPy array when GPU acceleration is active.

    The input is returned untouched when no GPU backend is available or when
    it already is a ``cp.ndarray``; otherwise it is converted with
    ``cp.asarray``.
    """
    if not GPU_AVAILABLE:
        return img
    if isinstance(img, cp.ndarray):
        return img
    return cp.asarray(img)
def _to_cpu(img):
    """Copy a GPU image back to host memory unless persistence is enabled.

    With GPU_PERSIST set, or without a GPU backend, *img* is returned as-is.
    Otherwise a ``cp.ndarray`` is converted with ``cp.asnumpy`` and any other
    object passes through unchanged.
    """
    if GPU_PERSIST or not GPU_AVAILABLE:
        return img
    return cp.asnumpy(img) if isinstance(img, cp.ndarray) else img
def _ensure_output_format(img):
    """Return *img* in the output form selected by GPU_PERSIST.

    Thin alias for ``_to_cpu``: the result stays on the GPU when persistence
    is enabled and is copied to the host otherwise.
    """
    converted = _to_cpu(img)
    return converted
def prim_rotate(img, angle, cx=None, cy=None):
    """Rotate image by angle degrees around center (cx, cy).

    Args:
        img: Image array; the first two axes are treated as (rows, cols) and
            an optional third axis as channels.
        angle: Rotation angle in degrees, using the same sense as
            ``cv2.getRotationMatrix2D``.
        cx, cy: Pivot in pixel coordinates (x = column, y = row). A missing
            coordinate defaults to ``w / 2`` / ``h / 2``.

    Returns:
        The rotated image with the same shape as the input; areas with no
        source pixel are filled with 0. The result is passed through
        ``_to_cpu`` on the GPU path.

    On the GPU path an explicit pivot used to be computed and then ignored
    (``ndimage.rotate`` always turns about the array centre). An explicit
    pivot is now honoured via ``affine_transform``; the no-pivot call keeps
    using ``ndimage.rotate`` exactly as before.
    """
    if not GPU_AVAILABLE:
        # Fallback to OpenCV
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)

    if cx is None and cy is None:
        # No explicit pivot: rotate about the array centre.
        rotated = cpndimage.rotate(img_gpu, angle, reshape=False, order=1)
        return _to_cpu(rotated)

    h, w = img_gpu.shape[:2]
    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    theta = np.radians(angle)
    cos_a = float(np.cos(theta))
    sin_a = float(np.sin(theta))

    # affine_transform samples output[o] from input[matrix @ o + offset], with
    # axes ordered (row, col[, channel]). The 2x2 rotation part below is the
    # same one ndimage.rotate builds, and equals the inverse of the matrix
    # returned by cv2.getRotationMatrix2D expressed in (row, col) order.
    ndim = img_gpu.ndim
    matrix = np.eye(ndim)
    matrix[0, 0] = cos_a
    matrix[0, 1] = sin_a
    matrix[1, 0] = -sin_a
    matrix[1, 1] = cos_a

    # Choose the offset so the pivot maps onto itself; any channel axis keeps
    # an identity row and a zero offset, so channels are never mixed.
    pivot = np.zeros(ndim)
    pivot[0] = cy
    pivot[1] = cx
    offset = pivot - matrix @ pivot

    rotated = cpndimage.affine_transform(
        img_gpu, cp.asarray(matrix), offset=offset.tolist(), order=1
    )
    return _to_cpu(rotated)
def prim_scale(img, sx, sy, cx=None, cy=None):
    """Scale image by (sx, sy) around center (cx, cy).

    Args:
        img: Image array; the first two axes are treated as (rows, cols) and
            an optional third axis as channels.
        sx, sy: Horizontal and vertical scale factors (must be non-zero when
            an explicit pivot is given on the GPU path).
        cx, cy: Fixed point of the scaling in pixel coordinates. A missing
            coordinate defaults to ``w / 2`` / ``h / 2``.

    Returns:
        An image of the original size; uncovered areas are 0. The result is
        passed through ``_to_cpu`` on the GPU path.

    On the GPU path an explicit pivot used to be computed and then ignored
    (the zoomed frame was always centre-cropped or centre-padded). An explicit
    pivot is now honoured via ``affine_transform``; the no-pivot call keeps
    the original zoom + centre crop/pad behaviour.
    """
    if not GPU_AVAILABLE:
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = np.float32([
            [sx, 0, cx * (1 - sx)],
            [0, sy, cy * (1 - sy)]
        ])
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)
    h, w = img_gpu.shape[:2]

    if cx is not None or cy is not None:
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        # affine_transform samples output[o] from input[matrix @ o + offset].
        # Inverting x' = sx * x + cx * (1 - sx) gives
        # x = x' / sx + cx * (1 - 1 / sx), and likewise for y.
        extra_axes = img_gpu.ndim - 2
        inverse = [1.0 / sy, 1.0 / sx] + [1.0] * extra_axes  # channels untouched
        pivot = [cy, cx] + [0.0] * extra_axes
        offset = [p * (1.0 - k) for p, k in zip(pivot, inverse)]
        scaled = cpndimage.affine_transform(
            img_gpu, cp.asarray(np.diag(inverse)), offset=offset, order=1
        )
        return _to_cpu(scaled)

    # Use cupyx.scipy.ndimage.zoom
    if img_gpu.ndim == 3:
        zoom_factors = (sy, sx, 1)  # Don't zoom color channels
    else:
        zoom_factors = (sy, sx)
    zoomed = cpndimage.zoom(img_gpu, zoom_factors, order=1)

    # Crop/pad to original size, keeping the zoomed frame centred
    zh, zw = zoomed.shape[:2]
    result = cp.zeros_like(img_gpu)

    # Calculate offsets: src_* crop an oversized frame, dst_* pad an undersized one
    src_y = max(0, (zh - h) // 2)
    src_x = max(0, (zw - w) // 2)
    dst_y = max(0, (h - zh) // 2)
    dst_x = max(0, (w - zw) // 2)
    copy_h = min(h - dst_y, zh - src_y)
    copy_w = min(w - dst_x, zw - src_x)
    result[dst_y:dst_y+copy_h, dst_x:dst_x+copy_w] = zoomed[src_y:src_y+copy_h, src_x:src_x+copy_w]
    return _to_cpu(result)
def prim_translate(img, dx, dy):
    """Shift the image content by *dx* pixels along x and *dy* along y.

    The GPU path uses ``cupyx.scipy.ndimage.shift`` with linear
    interpolation; the CPU path uses ``cv2.warpAffine`` with a pure
    translation matrix. The output has the same size as the input.
    """
    if GPU_AVAILABLE:
        frame = _to_gpu(img)
        # A colour image gets a zero shift on its channel axis.
        offsets = (dy, dx, 0) if frame.ndim == 3 else (dy, dx)
        moved = cpndimage.shift(frame, offsets, order=1)
        return _to_cpu(moved)

    import cv2
    rows, cols = img.shape[:2]
    affine = np.float32([[1, 0, dx], [0, 1, dy]])
    return cv2.warpAffine(img, affine, (cols, rows))
def prim_flip_h(img):
    """Mirror the image left-to-right (reverse axis 1)."""
    if not GPU_AVAILABLE:
        return np.flip(img, axis=1)
    mirrored = cp.flip(_to_gpu(img), axis=1)
    return _to_cpu(mirrored)
def prim_flip_v(img):
    """Mirror the image top-to-bottom (reverse axis 0)."""
    if not GPU_AVAILABLE:
        return np.flip(img, axis=0)
    mirrored = cp.flip(_to_gpu(img), axis=0)
    return _to_cpu(mirrored)
def prim_flip(img, direction="horizontal"):
    """Flip the image along the axis named by *direction*.

    Accepted values are "horizontal"/"h", "vertical"/"v" and
    "both"/"hv"/"vh". Any other value returns *img* unchanged.
    """
    if direction in ("horizontal", "h"):
        return prim_flip_h(img)
    if direction in ("vertical", "v"):
        return prim_flip_v(img)
    if direction not in ("both", "hv", "vh"):
        return img

    # Both axes: flip vertically first, then horizontally.
    if not GPU_AVAILABLE:
        return np.flip(np.flip(img, axis=0), axis=1)
    upside_down = cp.flip(_to_gpu(img), axis=0)
    return _to_cpu(cp.flip(upside_down, axis=1))
# CUDA kernel for ripple effect
#
# Only defined when GPU_AVAILABLE is true. Each thread handles one output
# pixel (x, y) and returns early when it lies outside width x height.
#
# Kernel arguments, in order:
#   src, dst                  - unsigned char buffers indexed as
#                               (y * width + x) * channels + c, i.e. dense
#                               row-major with interleaved channels
#   width, height, channels   - int image dimensions
#   amplitude, frequency,
#   decay, speed, time        - float ripple parameters
#   cx, cy                    - float ripple centre in pixels
#
# Per pixel: the distance to (cx, cy) drives a sine wave
# (dist * frequency * 0.1 - time * speed) scaled by amplitude and damped by
# exp(-dist * decay * 0.01). The pixel is sampled from a position displaced
# along the direction to the centre; the sample position is clamped to the
# image bounds and read with bilinear interpolation, and the result is clamped
# to [0, 255] before the cast to unsigned char.
if GPU_AVAILABLE:
    _ripple_kernel = cp.RawKernel(r'''
extern "C" __global__
void ripple(const unsigned char* src, unsigned char* dst,
int width, int height, int channels,
float amplitude, float frequency, float decay,
float speed, float time, float cx, float cy) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x >= width || y >= height) return;
// Distance from center
float dx = x - cx;
float dy = y - cy;
float dist = sqrtf(dx * dx + dy * dy);
// Ripple displacement
float wave = sinf(dist * frequency * 0.1f - time * speed) * amplitude;
float falloff = expf(-dist * decay * 0.01f);
float displacement = wave * falloff;
// Direction from center
float len = dist + 0.0001f; // Avoid division by zero
float dir_x = dx / len;
float dir_y = dy / len;
// Source coordinates
float src_x = x - dir_x * displacement;
float src_y = y - dir_y * displacement;
// Clamp to bounds
src_x = fmaxf(0.0f, fminf(width - 1.0f, src_x));
src_y = fmaxf(0.0f, fminf(height - 1.0f, src_y));
// Bilinear interpolation
int x0 = (int)src_x;
int y0 = (int)src_y;
int x1 = min(x0 + 1, width - 1);
int y1 = min(y0 + 1, height - 1);
float fx = src_x - x0;
float fy = src_y - y0;
for (int c = 0; c < channels; c++) {
float v00 = src[(y0 * width + x0) * channels + c];
float v10 = src[(y0 * width + x1) * channels + c];
float v01 = src[(y1 * width + x0) * channels + c];
float v11 = src[(y1 * width + x1) * channels + c];
float v0 = v00 * (1 - fx) + v10 * fx;
float v1 = v01 * (1 - fx) + v11 * fx;
float val = v0 * (1 - fy) + v1 * fy;
dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
}
}
''', 'ripple')
def prim_ripple(img, amplitude=10.0, frequency=8.0, decay=2.0, speed=5.0,
                time=0.0, center_x=None, center_y=None):
    """Apply ripple distortion effect.

    Every output pixel is sampled from a position displaced along the line to
    the ripple centre by ``sin(dist * frequency * 0.1 - time * speed) *
    amplitude * exp(-dist * decay * 0.01)``.

    Args:
        img: Image array, (rows, cols) or (rows, cols, channels).
        amplitude: Peak displacement in pixels.
        frequency: Spatial frequency factor of the wave.
        decay: Falloff factor with distance from the centre.
        speed: Phase speed multiplier applied to *time*.
        time: Animation time.
        center_x, center_y: Ripple centre in pixels; default to the image
            centre (``w / 2``, ``h / 2``).

    Returns:
        The distorted image with the same shape as the input. The GPU path
        works on a uint8 copy and passes the result through ``_to_cpu``.

    Sample positions outside the frame use the nearest edge pixel on both
    paths: the CUDA kernel clamps coordinates, and the OpenCV fallback uses
    BORDER_REPLICATE (it previously used the default constant border and
    produced black fringes that the GPU path did not).
    """
    h, w = img.shape[:2]
    if center_x is None:
        center_x = w / 2
    if center_y is None:
        center_y = h / 2

    if not GPU_AVAILABLE:
        # CPU fallback using coordinate mapping
        import cv2
        y_coords, x_coords = np.mgrid[0:h, 0:w].astype(np.float32)
        dx = x_coords - center_x
        dy = y_coords - center_y
        dist = np.sqrt(dx**2 + dy**2)
        wave = np.sin(dist * frequency * 0.1 - time * speed) * amplitude
        falloff = np.exp(-dist * decay * 0.01)
        displacement = wave * falloff
        length = dist + 0.0001  # Avoid division by zero at the centre
        dir_x = dx / length
        dir_y = dy / length
        map_x = (x_coords - dir_x * displacement).astype(np.float32)
        map_y = (y_coords - dir_y * displacement).astype(np.float32)
        return cv2.remap(img, map_x, map_y, cv2.INTER_LINEAR,
                         borderMode=cv2.BORDER_REPLICATE)

    # GPU implementation
    img_gpu = _to_gpu(img.astype(np.uint8))
    grayscale = img_gpu.ndim == 2
    if grayscale:
        img_gpu = img_gpu[:, :, cp.newaxis]
    # The raw kernel indexes a dense row-major buffer and ignores strides, so
    # a strided view (slice, transpose, ...) must be made contiguous first.
    img_gpu = cp.ascontiguousarray(img_gpu)
    channels = img_gpu.shape[2]
    dst = cp.zeros_like(img_gpu)

    block = (16, 16)
    # Ceil-divide so the grid covers the whole image
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
    _ripple_kernel(grid, block, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(amplitude), np.float32(frequency), np.float32(decay),
        np.float32(speed), np.float32(time),
        np.float32(center_x), np.float32(center_y)
    ))

    result = _to_cpu(dst)
    if grayscale:
        # Drop the channel axis that was added for the kernel
        result = result[:, :, 0]
    return result
# CUDA kernel for fast rotation with bilinear interpolation
#
# Only defined when GPU_AVAILABLE is true. Each thread handles one output
# pixel (x, y) and returns early when it lies outside width x height.
#
# Kernel arguments, in order:
#   src, dst                  - unsigned char buffers indexed as
#                               (y * width + x) * channels + c, i.e. dense
#                               row-major with interleaved channels
#   width, height, channels   - int image dimensions
#   cos_a, sin_a              - float rotation coefficients from the caller
#   cx, cy                    - float pivot in pixels
#
# Per pixel the source position is
#   src_x =  cos_a * (x - cx) + sin_a * (y - cy) + cx
#   src_y = -sin_a * (x - cx) + cos_a * (y - cy) + cy
# so the rotation sense is determined entirely by the sign of sin_a supplied
# by the caller. Output pixels whose source position falls outside
# [0, width - 1) x [0, height - 1) are written as 0; all others are read with
# bilinear interpolation and clamped to [0, 255].
if GPU_AVAILABLE:
    _rotate_kernel = cp.RawKernel(r'''
extern "C" __global__
void rotate_img(const unsigned char* src, unsigned char* dst,
int width, int height, int channels,
float cos_a, float sin_a, float cx, float cy) {
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x >= width || y >= height) return;
// Translate to center, rotate, translate back
float dx = x - cx;
float dy = y - cy;
float src_x = cos_a * dx + sin_a * dy + cx;
float src_y = -sin_a * dx + cos_a * dy + cy;
// Check bounds
if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
for (int c = 0; c < channels; c++) {
dst[(y * width + x) * channels + c] = 0;
}
return;
}
// Bilinear interpolation
int x0 = (int)src_x;
int y0 = (int)src_y;
int x1 = x0 + 1;
int y1 = y0 + 1;
float fx = src_x - x0;
float fy = src_y - y0;
for (int c = 0; c < channels; c++) {
float v00 = src[(y0 * width + x0) * channels + c];
float v10 = src[(y0 * width + x1) * channels + c];
float v01 = src[(y1 * width + x0) * channels + c];
float v11 = src[(y1 * width + x1) * channels + c];
float v0 = v00 * (1 - fx) + v10 * fx;
float v1 = v01 * (1 - fx) + v11 * fx;
float val = v0 * (1 - fy) + v1 * fy;
dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
}
}
''', 'rotate_img')
def prim_rotate_gpu(img, angle, cx=None, cy=None):
    """Fast GPU rotation using custom CUDA kernel.

    Args:
        img: Image array, (rows, cols) or (rows, cols, channels). The GPU
            path works on a uint8 copy.
        angle: Rotation angle in degrees, using the same sense as
            ``cv2.getRotationMatrix2D`` (and therefore as ``prim_rotate``).
        cx, cy: Pivot in pixel coordinates; default to ``w / 2``, ``h / 2``.

    Returns:
        The rotated image with the same shape as the input; pixels with no
        source are 0. Falls back to ``prim_rotate`` without a GPU, otherwise
        the result is passed through ``_to_cpu``.
    """
    if not GPU_AVAILABLE:
        return prim_rotate(img, angle, cx, cy)

    h, w = img.shape[:2]
    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    img_gpu = _to_gpu(img.astype(np.uint8))
    grayscale = img_gpu.ndim == 2
    if grayscale:
        img_gpu = img_gpu[:, :, cp.newaxis]
    # The raw kernel indexes a dense row-major buffer and ignores strides, so
    # a strided view (slice, transpose, ...) must be made contiguous first.
    img_gpu = cp.ascontiguousarray(img_gpu)
    channels = img_gpu.shape[2]
    dst = cp.zeros_like(img_gpu)

    # Convert angle to radians
    rad = np.radians(angle)
    cos_a = np.cos(rad)
    # The kernel maps destination -> source with
    #   src_x = cos_a * dx + sin_a * dy,  src_y = -sin_a * dx + cos_a * dy.
    # cv2.warpAffine with getRotationMatrix2D uses the inverse map
    #   src_x = cos * dx - sin * dy,      src_y = sin * dx + cos * dy,
    # so sin is negated here; otherwise this function would rotate the
    # opposite way to its own CPU fallback.
    sin_a = -np.sin(rad)

    block = (16, 16)
    # Ceil-divide so the grid covers the whole image
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
    _rotate_kernel(grid, block, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(cos_a), np.float32(sin_a),
        np.float32(cx), np.float32(cy)
    ))

    result = _to_cpu(dst)
    if grayscale:
        # Drop the channel axis that was added for the kernel
        result = result[:, :, 0]
    return result
# CPU primitives serve as the baseline for everything this module does not
# accelerate on the GPU.
def _get_cpu_primitives():
    """Return the PRIMITIVES mapping exported by the CPU geometry module.

    The import is performed inside the function, so the CPU module is only
    loaded when this is called.
    """
    from sexp_effects.primitive_libs import geometry as cpu_geometry
    return cpu_geometry.PRIMITIVES
# Exported primitive table: begin with a copy of the CPU primitives, then
# replace selected entries with the implementations defined in this module.
PRIMITIVES = _get_cpu_primitives().copy()
PRIMITIVES['translate'] = prim_translate
# The custom CUDA kernel is only selected when a GPU backend is present.
PRIMITIVES['rotate-img'] = prim_rotate_gpu if GPU_AVAILABLE else prim_rotate
PRIMITIVES['scale-img'] = prim_scale
PRIMITIVES['flip-h'] = prim_flip_h
PRIMITIVES['flip-v'] = prim_flip_v
PRIMITIVES['flip'] = prim_flip
# Note: ripple-displace uses CPU version (different API - returns coords, not image)