- Add IPFSHLSOutput class that uploads segments to IPFS as they're created - Update streaming task to use IPFS HLS output for distributed streaming - Add /ipfs-stream endpoint to get IPFS playlist URL - Update /stream endpoint to redirect to IPFS when available - Add GPU persistence mode (STREAMING_GPU_PERSIST=1) to keep frames on GPU - Add hardware video decoding (NVDEC) support for faster video processing - Add GPU-accelerated primitive libraries: blending_gpu, color_ops_gpu, geometry_gpu - Add streaming_gpu module with GPUFrame class for tracking CPU/GPU data location - Add Dockerfile.gpu for building GPU-enabled worker image Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
410 lines
12 KiB
Python
410 lines
12 KiB
Python
"""
|
|
GPU-Accelerated Geometry Primitives Library
|
|
|
|
Uses CuPy for CUDA-accelerated image transforms.
|
|
Falls back to CPU if GPU unavailable.
|
|
|
|
Performance Mode:
|
|
- STREAMING_GPU_PERSIST=1 (the default when the variable is unset) keeps frames on GPU between operations; set any other value to disable
|
|
- This dramatically improves performance by avoiding CPU<->GPU transfers
|
|
- Frames only transfer to CPU at final output
|
|
"""
|
|
import os
|
|
import numpy as np
|
|
|
|
# Optional CuPy backend. When the import fails, ``cp`` is aliased to NumPy so
# that references to ``cp`` elsewhere in the module still resolve, and
# GPU_AVAILABLE records which backend is active.
try:
    import cupy as cp
    from cupyx.scipy import ndimage as cpndimage
except ImportError:
    cp = np
    GPU_AVAILABLE = False
    print("[geometry_gpu] CuPy not available, using CPU fallback")
else:
    GPU_AVAILABLE = True
    print("[geometry_gpu] CuPy GPU acceleration enabled")

# GPU persistence mode - results stay on the GPU between operations.
# Enabled when STREAMING_GPU_PERSIST is "1", which is also the value assumed
# when the variable is unset; any other value disables it.
GPU_PERSIST = os.getenv("STREAMING_GPU_PERSIST", "1") == "1"
if GPU_PERSIST and GPU_AVAILABLE:
    print("[geometry_gpu] GPU persistence enabled - frames stay on GPU")
|
|
|
|
|
|
def _to_gpu(img):
    """Return ``img`` as a CuPy array when the GPU backend is active.

    The input is handed back untouched when CuPy is unavailable or when it
    is already a CuPy array.
    """
    if not GPU_AVAILABLE or isinstance(img, cp.ndarray):
        return img
    return cp.asarray(img)
|
|
|
|
|
|
def _to_cpu(img):
    """Copy a CuPy array back to host memory unless GPU persistence is on.

    With GPU_PERSIST enabled (or without CuPy) the argument is returned
    unchanged, so callers may receive a CuPy array from this function.
    """
    needs_download = (
        GPU_AVAILABLE and not GPU_PERSIST and isinstance(img, cp.ndarray)
    )
    return cp.asnumpy(img) if needs_download else img
|
|
|
|
|
|
def _ensure_output_format(img):
    """Return ``img`` in the form dictated by GPU_PERSIST.

    Thin alias for ``_to_cpu``: the array is moved to the host only when
    persistence is disabled.
    """
    converted = _to_cpu(img)
    return converted
|
|
|
|
|
|
def prim_rotate(img, angle, cx=None, cy=None):
    """Rotate ``img`` by ``angle`` degrees about the pivot ``(cx, cy)``.

    The output has the same height and width as the input.

    Args:
        img: Image array; the first two axes are treated as (rows, cols).
        angle: Rotation angle in degrees.
        cx: Pivot x coordinate (column index). ``None`` uses the default
            pivot of the active backend.
        cy: Pivot y coordinate (row index). ``None`` uses the default
            pivot of the active backend.

    Returns:
        The rotated image. On the GPU path the result goes through
        ``_to_cpu``, so it stays a CuPy array while GPU_PERSIST is enabled.
    """
    if not GPU_AVAILABLE:
        # Fallback to OpenCV
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)

    if cx is None and cy is None:
        # No explicit pivot: ndimage.rotate turns the image about its own
        # centre, which is what this branch has always done.
        rotated = cpndimage.rotate(img_gpu, angle, reshape=False, order=1)
        return _to_cpu(rotated)

    h, w = img_gpu.shape[:2]
    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    # ndimage.rotate cannot take a pivot, so build the output->input affine
    # map directly. The 2x2 rotation block uses the same layout as
    # scipy.ndimage.rotate ([[cos, sin], [-sin, cos]] in (row, col) order) so
    # both GPU branches turn the same way; any trailing (channel) axis is
    # left as identity.
    theta = np.radians(angle)
    cos_a = float(np.cos(theta))
    sin_a = float(np.sin(theta))
    matrix = np.identity(img_gpu.ndim)
    matrix[0, 0] = cos_a
    matrix[0, 1] = sin_a
    matrix[1, 0] = -sin_a
    matrix[1, 1] = cos_a

    pivot = np.zeros(img_gpu.ndim)
    pivot[0] = cy
    pivot[1] = cx
    # Choose the offset so the pivot maps onto itself.
    offset = pivot - matrix @ pivot

    rotated = cpndimage.affine_transform(
        img_gpu,
        cp.asarray(matrix),
        offset=tuple(float(v) for v in offset),
        order=1,
    )
    return _to_cpu(rotated)
|
|
|
|
|
|
def _paste_span(dst_len, src_len, origin):
    """Return ``(dst_start, src_start, length)`` for pasting along one axis.

    ``origin`` is the destination index at which source index 0 lands. It
    may be negative or lie past ``dst_len``; ``length`` is 0 when the two
    ranges do not overlap.
    """
    dst_start = max(0, origin)
    src_start = max(0, -origin)
    length = max(0, min(dst_len - dst_start, src_len - src_start))
    return dst_start, src_start, length


def prim_scale(img, sx, sy, cx=None, cy=None):
    """Scale ``img`` by ``(sx, sy)`` about the anchor ``(cx, cy)``.

    The output has the same shape as the input: the scaled image is cropped
    where it overflows and zero-padded where it falls short.

    Args:
        img: Image array; the first two axes are treated as (rows, cols).
        sx: Horizontal scale factor.
        sy: Vertical scale factor.
        cx: Anchor x coordinate that stays fixed; ``None`` means the
            horizontal centre.
        cy: Anchor y coordinate that stays fixed; ``None`` means the
            vertical centre.

    Returns:
        The scaled image. On the GPU path the result goes through
        ``_to_cpu``, so it stays a CuPy array while GPU_PERSIST is enabled.
    """
    if not GPU_AVAILABLE:
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = np.float32([
            [sx, 0, cx * (1 - sx)],
            [0, sy, cy * (1 - sy)]
        ])
        return cv2.warpAffine(img, M, (w, h))

    img_gpu = _to_gpu(img)
    h, w = img_gpu.shape[:2]

    # Use cupyx.scipy.ndimage.zoom
    if img_gpu.ndim == 3:
        zoom_factors = (sy, sx, 1)  # Don't zoom color channels
    else:
        zoom_factors = (sy, sx)

    zoomed = cpndimage.zoom(img_gpu, zoom_factors, order=1)
    zh, zw = zoomed.shape[:2]
    result = cp.zeros_like(img_gpu)

    # Where the zoomed image's first row/column lands in the output when the
    # anchor is the image centre (same rounding as a plain centre crop/pad).
    origin_y = (h - zh) // 2 if zh <= h else -((zh - h) // 2)
    origin_x = (w - zw) // 2 if zw <= w else -((zw - w) // 2)

    # An explicit anchor moves that origin by (anchor - centre) * (1 - scale),
    # matching the translation term of the affine matrix in the cv2 fallback.
    if cy is not None:
        origin_y += int(round((cy - h / 2) * (1 - sy)))
    if cx is not None:
        origin_x += int(round((cx - w / 2) * (1 - sx)))

    dst_y, src_y, copy_h = _paste_span(h, zh, origin_y)
    dst_x, src_x, copy_w = _paste_span(w, zw, origin_x)

    # Nothing to copy when the scaled image lies entirely outside the frame.
    if copy_h and copy_w:
        result[dst_y:dst_y + copy_h, dst_x:dst_x + copy_w] = (
            zoomed[src_y:src_y + copy_h, src_x:src_x + copy_w]
        )

    return _to_cpu(result)
|
|
|
|
|
|
def prim_translate(img, dx, dy):
    """Shift ``img`` by ``dx`` pixels along x and ``dy`` pixels along y.

    The output keeps the input's size. The GPU path uses
    ``cupyx.scipy.ndimage.shift`` with linear interpolation; the CPU path
    uses ``cv2.warpAffine``.
    """
    if GPU_AVAILABLE:
        gpu_img = _to_gpu(img)
        # A zero third component leaves the colour-channel axis in place.
        offsets = (dy, dx, 0) if gpu_img.ndim == 3 else (dy, dx)
        return _to_cpu(cpndimage.shift(gpu_img, offsets, order=1))

    import cv2
    rows, cols = img.shape[:2]
    affine = np.float32([[1, 0, dx], [0, 1, dy]])
    return cv2.warpAffine(img, affine, (cols, rows))
|
|
|
|
|
|
def prim_flip_h(img):
    """Mirror ``img`` left-to-right (reverse axis 1)."""
    if not GPU_AVAILABLE:
        return np.flip(img, axis=1)
    mirrored = cp.flip(_to_gpu(img), axis=1)
    return _to_cpu(mirrored)
|
|
|
|
|
|
def prim_flip_v(img):
    """Mirror ``img`` top-to-bottom (reverse axis 0)."""
    if not GPU_AVAILABLE:
        return np.flip(img, axis=0)
    mirrored = cp.flip(_to_gpu(img), axis=0)
    return _to_cpu(mirrored)
|
|
|
|
|
|
def prim_flip(img, direction="horizontal"):
    """Flip ``img`` along the requested direction.

    Accepted values are "horizontal"/"h", "vertical"/"v" and
    "both"/"hv"/"vh". Any other value returns ``img`` unchanged.
    """
    if direction in ("horizontal", "h"):
        return prim_flip_h(img)
    if direction in ("vertical", "v"):
        return prim_flip_v(img)
    if direction not in ("both", "hv", "vh"):
        return img

    # Both axes: reverse rows first, then columns.
    if not GPU_AVAILABLE:
        rows_reversed = np.flip(img, axis=0)
        return np.flip(rows_reversed, axis=1)

    rows_reversed = cp.flip(_to_gpu(img), axis=0)
    return _to_cpu(cp.flip(rows_reversed, axis=1))
|
|
|
|
|
|
# CUDA kernel for ripple effect
#
# Compiled only when CuPy is available. Each thread handles one output pixel
# (x, y) and returns early when it lies outside width x height. Pixels are
# read and written as unsigned char at index (y * width + x) * channels + c.
# For every pixel the kernel computes a radial displacement
#   sin(dist * frequency * 0.1 - time * speed) * amplitude
#       * exp(-dist * decay * 0.01)
# where dist is the distance from (cx, cy), samples the source at the
# displaced position (clamped to the image bounds) with bilinear
# interpolation, and stores the value clamped to [0, 255].
if GPU_AVAILABLE:
    _ripple_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void ripple(const unsigned char* src, unsigned char* dst,
                int width, int height, int channels,
                float amplitude, float frequency, float decay,
                float speed, float time, float cx, float cy) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x >= width || y >= height) return;

        // Distance from center
        float dx = x - cx;
        float dy = y - cy;
        float dist = sqrtf(dx * dx + dy * dy);

        // Ripple displacement
        float wave = sinf(dist * frequency * 0.1f - time * speed) * amplitude;
        float falloff = expf(-dist * decay * 0.01f);
        float displacement = wave * falloff;

        // Direction from center
        float len = dist + 0.0001f;  // Avoid division by zero
        float dir_x = dx / len;
        float dir_y = dy / len;

        // Source coordinates
        float src_x = x - dir_x * displacement;
        float src_y = y - dir_y * displacement;

        // Clamp to bounds
        src_x = fmaxf(0.0f, fminf(width - 1.0f, src_x));
        src_y = fmaxf(0.0f, fminf(height - 1.0f, src_y));

        // Bilinear interpolation
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        int x1 = min(x0 + 1, width - 1);
        int y1 = min(y0 + 1, height - 1);

        float fx = src_x - x0;
        float fy = src_y - y0;

        for (int c = 0; c < channels; c++) {
            float v00 = src[(y0 * width + x0) * channels + c];
            float v10 = src[(y0 * width + x1) * channels + c];
            float v01 = src[(y1 * width + x0) * channels + c];
            float v11 = src[(y1 * width + x1) * channels + c];

            float v0 = v00 * (1 - fx) + v10 * fx;
            float v1 = v01 * (1 - fx) + v11 * fx;
            float val = v0 * (1 - fy) + v1 * fy;

            dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
        }
    }
    ''', 'ripple')
|
|
|
|
|
|
def prim_ripple(img, amplitude=10.0, frequency=8.0, decay=2.0, speed=5.0,
                time=0.0, center_x=None, center_y=None):
    """Apply a radial ripple distortion to ``img``.

    Each output pixel samples the source at a position displaced along the
    line to the ripple centre by
    ``sin(dist * frequency * 0.1 - time * speed) * amplitude
    * exp(-dist * decay * 0.01)``.

    Args:
        img: Image array, 2-D or 3-D; the first two axes are (rows, cols).
        amplitude: Peak displacement in pixels.
        frequency: Spatial frequency of the wave.
        decay: Controls how quickly the ripple fades with distance.
        speed: Multiplier applied to ``time`` for the wave phase.
        time: Animation time.
        center_x: Ripple centre x; ``None`` means ``w / 2``.
        center_y: Ripple centre y; ``None`` means ``h / 2``.

    Returns:
        The distorted image. The GPU path casts the input to uint8 and
        passes the result through ``_to_cpu``, so it stays a CuPy array
        while GPU_PERSIST is enabled.
    """
    h, w = img.shape[:2]
    channels = img.shape[2] if img.ndim == 3 else 1

    if center_x is None:
        center_x = w / 2
    if center_y is None:
        center_y = h / 2

    if not GPU_AVAILABLE:
        # CPU fallback using coordinate mapping
        import cv2
        y_coords, x_coords = np.mgrid[0:h, 0:w].astype(np.float32)

        dx = x_coords - center_x
        dy = y_coords - center_y
        dist = np.sqrt(dx**2 + dy**2)

        wave = np.sin(dist * frequency * 0.1 - time * speed) * amplitude
        falloff = np.exp(-dist * decay * 0.01)
        displacement = wave * falloff

        # Small epsilon avoids dividing by zero at the exact centre.
        length = dist + 0.0001
        dir_x = dx / length
        dir_y = dy / length

        map_x = (x_coords - dir_x * displacement).astype(np.float32)
        map_y = (y_coords - dir_y * displacement).astype(np.float32)

        return cv2.remap(img, map_x, map_y, cv2.INTER_LINEAR)

    # GPU implementation
    # The raw kernel indexes memory as a dense (row, col, channel) layout and
    # knows nothing about strides, so force a C-contiguous buffer (the input
    # may be a strided view, e.g. the result of a flip kept on the GPU).
    img_gpu = cp.ascontiguousarray(_to_gpu(img.astype(np.uint8)))
    if img_gpu.ndim == 2:
        img_gpu = img_gpu[:, :, cp.newaxis]
        channels = 1

    dst = cp.zeros_like(img_gpu)

    # One thread per pixel, 16x16 threads per block, grid rounded up.
    block = (16, 16)
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])

    _ripple_kernel(grid, block, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(amplitude), np.float32(frequency), np.float32(decay),
        np.float32(speed), np.float32(time),
        np.float32(center_x), np.float32(center_y)
    ))

    result = _to_cpu(dst)
    if channels == 1:
        # Drop the channel axis for single-channel images.
        result = result[:, :, 0]
    return result
|
|
|
|
|
|
# CUDA kernel for fast rotation with bilinear interpolation
#
# Compiled only when CuPy is available. Each thread handles one output pixel
# (x, y); pixels are unsigned char at index (y * width + x) * channels + c.
# The kernel maps each output pixel back to a source position by rotating
# about (cx, cy), writes 0 where that position falls outside the image, and
# otherwise stores the bilinearly interpolated value clamped to [0, 255].
#
# The output->source mapping uses the same sense as the inverse of
# cv2.getRotationMatrix2D (used by the CPU fallback), so a given angle turns
# the image the same way with or without a GPU.
if GPU_AVAILABLE:
    _rotate_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void rotate_img(const unsigned char* src, unsigned char* dst,
                    int width, int height, int channels,
                    float cos_a, float sin_a, float cx, float cy) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (x >= width || y >= height) return;

        // Translate to center, apply inverse rotation, translate back
        float dx = x - cx;
        float dy = y - cy;

        float src_x = cos_a * dx - sin_a * dy + cx;
        float src_y = sin_a * dx + cos_a * dy + cy;

        // Check bounds (the last row/column themselves are valid samples)
        if (src_x < 0 || src_x > width - 1 || src_y < 0 || src_y > height - 1) {
            for (int c = 0; c < channels; c++) {
                dst[(y * width + x) * channels + c] = 0;
            }
            return;
        }

        // Bilinear interpolation; clamp the far neighbours so sampling
        // exactly on the last row/column never reads past the image.
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        int x1 = min(x0 + 1, width - 1);
        int y1 = min(y0 + 1, height - 1);

        float fx = src_x - x0;
        float fy = src_y - y0;

        for (int c = 0; c < channels; c++) {
            float v00 = src[(y0 * width + x0) * channels + c];
            float v10 = src[(y0 * width + x1) * channels + c];
            float v01 = src[(y1 * width + x0) * channels + c];
            float v11 = src[(y1 * width + x1) * channels + c];

            float v0 = v00 * (1 - fx) + v10 * fx;
            float v1 = v01 * (1 - fx) + v11 * fx;
            float val = v0 * (1 - fy) + v1 * fy;

            dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
        }
    }
    ''', 'rotate_img')
|
|
|
|
|
|
def prim_rotate_gpu(img, angle, cx=None, cy=None):
    """Rotate ``img`` with the custom CUDA kernel.

    Falls back to ``prim_rotate`` when CuPy is unavailable.

    Args:
        img: Image array, 2-D or 3-D; the first two axes are (rows, cols).
            The GPU path casts it to uint8.
        angle: Rotation angle in degrees.
        cx: Pivot x coordinate; ``None`` means ``w / 2``.
        cy: Pivot y coordinate; ``None`` means ``h / 2``.

    Returns:
        The rotated image, same height and width as the input. The result
        goes through ``_to_cpu``, so it stays a CuPy array while GPU_PERSIST
        is enabled.
    """
    if not GPU_AVAILABLE:
        return prim_rotate(img, angle, cx, cy)

    h, w = img.shape[:2]
    channels = img.shape[2] if img.ndim == 3 else 1

    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2

    # The raw kernel indexes memory as a dense (row, col, channel) layout and
    # knows nothing about strides, so force a C-contiguous buffer (the input
    # may be a strided view, e.g. the result of a flip kept on the GPU).
    img_gpu = cp.ascontiguousarray(_to_gpu(img.astype(np.uint8)))
    if img_gpu.ndim == 2:
        img_gpu = img_gpu[:, :, cp.newaxis]
        channels = 1

    dst = cp.zeros_like(img_gpu)

    # Convert angle to radians
    rad = np.radians(angle)
    cos_a = np.cos(rad)
    sin_a = np.sin(rad)

    # One thread per pixel, 16x16 threads per block, grid rounded up.
    block = (16, 16)
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])

    _rotate_kernel(grid, block, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(cos_a), np.float32(sin_a),
        np.float32(cx), np.float32(cy)
    ))

    result = _to_cpu(dst)
    if channels == 1:
        # Drop the channel axis for single-channel images.
        result = result[:, :, 0]
    return result
|
|
|
|
|
|
# CPU primitives serve as fallbacks for everything not GPU-accelerated here.
def _get_cpu_primitives():
    """Return the ``PRIMITIVES`` mapping of the CPU geometry module.

    The import happens inside the function, so the CPU module is only
    loaded when this is called.
    """
    from sexp_effects.primitive_libs import geometry as cpu_geometry
    return cpu_geometry.PRIMITIVES
|
|
|
|
|
|
# Exported table: begin with a copy of the CPU primitives, then replace the
# entries that have a GPU-accelerated implementation in this module.
PRIMITIVES = _get_cpu_primitives().copy()

PRIMITIVES['translate'] = prim_translate
if GPU_AVAILABLE:
    PRIMITIVES['rotate-img'] = prim_rotate_gpu
else:
    PRIMITIVES['rotate-img'] = prim_rotate
PRIMITIVES['scale-img'] = prim_scale
PRIMITIVES['flip-h'] = prim_flip_h
PRIMITIVES['flip-v'] = prim_flip_v
PRIMITIVES['flip'] = prim_flip
# Note: ripple-displace uses CPU version (different API - returns coords, not image)
|