Integrate fast CUDA kernels for GPU effects pipeline
Some checks are pending
GPU Worker CI/CD / test (push) Waiting to run
GPU Worker CI/CD / deploy (push) Blocked by required conditions

Replace slow scipy.ndimage operations with custom CUDA kernels:
- gpu_rotate: AFFINE_WARP_KERNEL (< 1ms vs 20ms for scipy)
- gpu_blend: BLEND_KERNEL for fast alpha blending
- gpu_brightness/contrast: BRIGHTNESS_CONTRAST_KERNEL
- Add gpu_zoom, gpu_hue_shift, gpu_invert, gpu_ripple

Preserve GPU arrays through pipeline:
- Updated _maybe_to_numpy() to keep CuPy arrays for GPU primitives
- Primitives detect CuPy arrays via __cuda_array_interface__
- No unnecessary CPU round-trips between operations

New jit_compiler.py contains all CUDA kernels plus a FastGPUOps
class that uses a ping-pong buffer strategy for efficient in-place ops.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-04 02:53:46 +00:00
parent 75f9d8fb11
commit ad1d7893f8
4 changed files with 794 additions and 39 deletions

View File

@@ -105,10 +105,27 @@ class StreamInterpreter:
self.errors.append(msg)
print(f"ERROR: {msg}", file=sys.stderr)
def _maybe_to_numpy(self, val):
"""Convert GPU frames/CuPy arrays to numpy for CPU primitives."""
def _maybe_to_numpy(self, val, for_gpu_primitive: bool = False):
"""Convert GPU frames/CuPy arrays to numpy for CPU primitives.
If for_gpu_primitive=True, preserve GPU data (CuPy arrays stay on GPU).
"""
if val is None:
return val
# For GPU primitives, keep data on GPU
if for_gpu_primitive:
# Handle GPUFrame - return the GPU array
if hasattr(val, 'gpu') and hasattr(val, 'is_on_gpu'):
if val.is_on_gpu:
return val.gpu
return val.cpu
# CuPy arrays pass through unchanged
if hasattr(val, '__cuda_array_interface__'):
return val
return val
# For CPU primitives, convert to numpy
# Handle GPUFrame objects (have .cpu property)
if hasattr(val, 'cpu'):
return val.cpu
@@ -778,6 +795,8 @@ class StreamInterpreter:
if op in self.primitives:
prim_func = self.primitives[op]
# Check if this is a GPU primitive (preserves GPU arrays)
is_gpu_prim = op.startswith('gpu:') or 'gpu' in op.lower()
evaluated_args = []
kwargs = {}
i = 0
@@ -785,10 +804,10 @@ class StreamInterpreter:
if isinstance(args[i], Keyword):
k = args[i].name
v = self._eval(args[i + 1], env) if i + 1 < len(args) else None
kwargs[k] = self._maybe_to_numpy(v)
kwargs[k] = self._maybe_to_numpy(v, for_gpu_primitive=is_gpu_prim)
i += 2
else:
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env)))
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env), for_gpu_primitive=is_gpu_prim))
i += 1
try:
if kwargs:
@@ -812,6 +831,8 @@ class StreamInterpreter:
prim_name = op.replace('-', '_')
if prim_name in self.primitives:
prim_func = self.primitives[prim_name]
# Check if this is a GPU primitive (preserves GPU arrays)
is_gpu_prim = 'gpu' in prim_name.lower()
evaluated_args = []
kwargs = {}
i = 0
@@ -819,10 +840,10 @@ class StreamInterpreter:
if isinstance(args[i], Keyword):
k = args[i].name.replace('-', '_')
v = self._eval(args[i + 1], env) if i + 1 < len(args) else None
kwargs[k] = v
kwargs[k] = self._maybe_to_numpy(v, for_gpu_primitive=is_gpu_prim)
i += 2
else:
evaluated_args.append(self._eval(args[i], env))
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env), for_gpu_primitive=is_gpu_prim))
i += 1
try: