Integrate fast CUDA kernels for GPU effects pipeline
Replace slow scipy.ndimage operations with custom CUDA kernels:
- gpu_rotate: AFFINE_WARP_KERNEL (< 1ms vs 20ms for scipy)
- gpu_blend: BLEND_KERNEL for fast alpha blending
- gpu_brightness/contrast: BRIGHTNESS_CONTRAST_KERNEL
- Add gpu_zoom, gpu_hue_shift, gpu_invert, gpu_ripple

Preserve GPU arrays through pipeline:
- Updated _maybe_to_numpy() to keep CuPy arrays for GPU primitives
- Primitives detect CuPy arrays via __cuda_array_interface__
- No unnecessary CPU round-trips between operations

New jit_compiler.py contains all CUDA kernels with FastGPUOps class using ping-pong buffer strategy for efficient in-place ops.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -105,10 +105,27 @@ class StreamInterpreter:
|
||||
self.errors.append(msg)
|
||||
print(f"ERROR: {msg}", file=sys.stderr)
|
||||
|
||||
def _maybe_to_numpy(self, val):
|
||||
"""Convert GPU frames/CuPy arrays to numpy for CPU primitives."""
|
||||
def _maybe_to_numpy(self, val, for_gpu_primitive: bool = False):
|
||||
"""Convert GPU frames/CuPy arrays to numpy for CPU primitives.
|
||||
|
||||
If for_gpu_primitive=True, preserve GPU data (CuPy arrays stay on GPU).
|
||||
"""
|
||||
if val is None:
|
||||
return val
|
||||
|
||||
# For GPU primitives, keep data on GPU
|
||||
if for_gpu_primitive:
|
||||
# Handle GPUFrame - return the GPU array
|
||||
if hasattr(val, 'gpu') and hasattr(val, 'is_on_gpu'):
|
||||
if val.is_on_gpu:
|
||||
return val.gpu
|
||||
return val.cpu
|
||||
# CuPy arrays pass through unchanged
|
||||
if hasattr(val, '__cuda_array_interface__'):
|
||||
return val
|
||||
return val
|
||||
|
||||
# For CPU primitives, convert to numpy
|
||||
# Handle GPUFrame objects (have .cpu property)
|
||||
if hasattr(val, 'cpu'):
|
||||
return val.cpu
|
||||
@@ -778,6 +795,8 @@ class StreamInterpreter:
|
||||
|
||||
if op in self.primitives:
|
||||
prim_func = self.primitives[op]
|
||||
# Check if this is a GPU primitive (preserves GPU arrays)
|
||||
is_gpu_prim = op.startswith('gpu:') or 'gpu' in op.lower()
|
||||
evaluated_args = []
|
||||
kwargs = {}
|
||||
i = 0
|
||||
@@ -785,10 +804,10 @@ class StreamInterpreter:
|
||||
if isinstance(args[i], Keyword):
|
||||
k = args[i].name
|
||||
v = self._eval(args[i + 1], env) if i + 1 < len(args) else None
|
||||
kwargs[k] = self._maybe_to_numpy(v)
|
||||
kwargs[k] = self._maybe_to_numpy(v, for_gpu_primitive=is_gpu_prim)
|
||||
i += 2
|
||||
else:
|
||||
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env)))
|
||||
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env), for_gpu_primitive=is_gpu_prim))
|
||||
i += 1
|
||||
try:
|
||||
if kwargs:
|
||||
@@ -812,6 +831,8 @@ class StreamInterpreter:
|
||||
prim_name = op.replace('-', '_')
|
||||
if prim_name in self.primitives:
|
||||
prim_func = self.primitives[prim_name]
|
||||
# Check if this is a GPU primitive (preserves GPU arrays)
|
||||
is_gpu_prim = 'gpu' in prim_name.lower()
|
||||
evaluated_args = []
|
||||
kwargs = {}
|
||||
i = 0
|
||||
@@ -819,10 +840,10 @@ class StreamInterpreter:
|
||||
if isinstance(args[i], Keyword):
|
||||
k = args[i].name.replace('-', '_')
|
||||
v = self._eval(args[i + 1], env) if i + 1 < len(args) else None
|
||||
kwargs[k] = v
|
||||
kwargs[k] = self._maybe_to_numpy(v, for_gpu_primitive=is_gpu_prim)
|
||||
i += 2
|
||||
else:
|
||||
evaluated_args.append(self._eval(args[i], env))
|
||||
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env), for_gpu_primitive=is_gpu_prim))
|
||||
i += 1
|
||||
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user