Integrate fast CUDA kernels for GPU effects pipeline
Some checks are pending
GPU Worker CI/CD / test (push) Waiting to run
GPU Worker CI/CD / deploy (push) Blocked by required conditions

Replace slow scipy.ndimage operations with custom CUDA kernels:
- gpu_rotate: AFFINE_WARP_KERNEL (< 1ms vs 20ms for scipy)
- gpu_blend: BLEND_KERNEL for fast alpha blending
- gpu_brightness/contrast: BRIGHTNESS_CONTRAST_KERNEL
- Add gpu_zoom, gpu_hue_shift, gpu_invert, gpu_ripple

Preserve GPU arrays through pipeline:
- Updated _maybe_to_numpy() to keep CuPy arrays for GPU primitives
- Primitives detect CuPy arrays via __cuda_array_interface__
- No unnecessary CPU round-trips between operations

New jit_compiler.py contains all CUDA kernels plus a FastGPUOps
class that uses a ping-pong buffer strategy for efficient in-place ops.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-04 02:53:46 +00:00
parent 75f9d8fb11
commit ad1d7893f8
4 changed files with 794 additions and 39 deletions

View File

@@ -105,10 +105,27 @@ class StreamInterpreter:
self.errors.append(msg)
print(f"ERROR: {msg}", file=sys.stderr)
def _maybe_to_numpy(self, val):
"""Convert GPU frames/CuPy arrays to numpy for CPU primitives."""
def _maybe_to_numpy(self, val, for_gpu_primitive: bool = False):
"""Convert GPU frames/CuPy arrays to numpy for CPU primitives.
If for_gpu_primitive=True, preserve GPU data (CuPy arrays stay on GPU).
"""
if val is None:
return val
# For GPU primitives, keep data on GPU
if for_gpu_primitive:
# Handle GPUFrame - return the GPU array
if hasattr(val, 'gpu') and hasattr(val, 'is_on_gpu'):
if val.is_on_gpu:
return val.gpu
return val.cpu
# CuPy arrays pass through unchanged
if hasattr(val, '__cuda_array_interface__'):
return val
return val
# For CPU primitives, convert to numpy
# Handle GPUFrame objects (have .cpu property)
if hasattr(val, 'cpu'):
return val.cpu
@@ -778,6 +795,8 @@ class StreamInterpreter:
if op in self.primitives:
prim_func = self.primitives[op]
# Check if this is a GPU primitive (preserves GPU arrays)
is_gpu_prim = op.startswith('gpu:') or 'gpu' in op.lower()
evaluated_args = []
kwargs = {}
i = 0
@@ -785,10 +804,10 @@ class StreamInterpreter:
if isinstance(args[i], Keyword):
k = args[i].name
v = self._eval(args[i + 1], env) if i + 1 < len(args) else None
kwargs[k] = self._maybe_to_numpy(v)
kwargs[k] = self._maybe_to_numpy(v, for_gpu_primitive=is_gpu_prim)
i += 2
else:
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env)))
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env), for_gpu_primitive=is_gpu_prim))
i += 1
try:
if kwargs:
@@ -812,6 +831,8 @@ class StreamInterpreter:
prim_name = op.replace('-', '_')
if prim_name in self.primitives:
prim_func = self.primitives[prim_name]
# Check if this is a GPU primitive (preserves GPU arrays)
is_gpu_prim = 'gpu' in prim_name.lower()
evaluated_args = []
kwargs = {}
i = 0
@@ -819,10 +840,10 @@ class StreamInterpreter:
if isinstance(args[i], Keyword):
k = args[i].name.replace('-', '_')
v = self._eval(args[i + 1], env) if i + 1 < len(args) else None
kwargs[k] = v
kwargs[k] = self._maybe_to_numpy(v, for_gpu_primitive=is_gpu_prim)
i += 2
else:
evaluated_args.append(self._eval(args[i], env))
evaluated_args.append(self._maybe_to_numpy(self._eval(args[i], env), for_gpu_primitive=is_gpu_prim))
i += 1
try: