Add autonomous-pipeline primitive for zero-Python hot path

2026-02-04 10:02:40 +00:00
parent 6e20d19a23
commit e4349ba501
2 changed files with 119 additions and 0 deletions
--- a/sexp_effects/primitive_libs/streaming_gpu.py
+++ b/sexp_effects/primitive_libs/streaming_gpu.py
@@ -845,9 +845,11 @@ PRIMITIVES = _get_cpu_primitives().copy()
 # Try to import fused kernel compiler
 _FUSED_KERNELS_AVAILABLE = False
 _compile_frame_pipeline = None
+_compile_autonomous_pipeline = None
 try:
    if GPU_AVAILABLE:
        from streaming.sexp_to_cuda import compile_frame_pipeline as _compile_frame_pipeline
+        from streaming.sexp_to_cuda import compile_autonomous_pipeline as _compile_autonomous_pipeline
        _FUSED_KERNELS_AVAILABLE = True
        print("[streaming_gpu] Fused CUDA kernel compiler loaded", file=sys.stderr)
 except ImportError as e:
@@ -953,6 +955,87 @@ def prim_fused_pipeline(img, effects_list, **dynamic_params):
    return pipeline(gpu_img, **dynamic_params)


+# Compiled autonomous pipelines keyed by "auto_{w}x{h}_{md5 of ops + expressions}" (separate from fused)
+_AUTONOMOUS_PIPELINE_CACHE = {}
+
+
+def prim_autonomous_pipeline(img, effects_list, dynamic_expressions, frame_num, fps=30.0):
+    """
+    Apply a fully autonomous CUDA kernel pipeline.
+
+    This computes ALL parameters on GPU - including time-based expressions
+    like sin(t), t*30, etc. Zero Python in the hot path!
+
+    Args:
+        img: Input image (GPU array or numpy array). The compiled path
+            raises ValueError if it has no shape attribute.
+        effects_list: List of effect dicts
+        dynamic_expressions: Dict mapping param names to CUDA expressions:
+            {'rotate_angle': 't * 30.0f',
+             'ripple_phase': 't * 2.0f',
+             'brightness_factor': '0.8f + 0.4f * sinf(t * 2.0f)'}
+        frame_num: Current frame number
+        fps: Frames per second (default 30)
+
+    Returns:
+        Processed image as GPU array
+
+    Note: Expressions use CUDA syntax - use sinf() not sin(), etc. Without the
+    fused compiler they are evaluated in Python with t = frame_num / fps and
+    passed to prim_fused_pipeline; one that fails to evaluate becomes 0.
+    """
+    # Normalize effects and expressions
+    effects_list = [_normalize_effect_dict(e) for e in effects_list]
+    dynamic_expressions = {(k.name if hasattr(k, 'name') else str(k)): v
+                           for k, v in dynamic_expressions.items()}
+
+    if not _FUSED_KERNELS_AVAILABLE or _compile_autonomous_pipeline is None:
+        # Fallback to regular fused pipeline with Python-computed params
+        import math
+        import re
+        # Expose math under plain and CUDA float names (sin/sinf, floor/floorf, ...)
+        env = {n: f for n, f in vars(math).items() if not n.startswith('_')}
+        env.update({n + 'f': f for n, f in list(env.items()) if callable(f)})
+        env.update(fminf=min, fmaxf=max, math=math, frame_num=frame_num, t=float(frame_num) / float(fps))
+        # Drop only the 'f' suffix of float literals (30.0f); a blanket
+        # replace('f', '') would also mangle frame_num, floorf, fabsf, ...
+        literal_f = re.compile(r'(?<![\w.])((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)[fF]\b')
+        dynamic_params = {}
+        for key, expr in dynamic_expressions.items():
+            try:
+                # NOTE(review): eval runs arbitrary code - expressions must be trusted
+                dynamic_params[key] = eval(literal_f.sub(r'\1', expr), env)
+            except Exception as exc:
+                print(f"[streaming_gpu] Fallback eval failed for {key}={expr!r}: {exc}", file=sys.stderr)
+                dynamic_params[key] = 0
+        return prim_fused_pipeline(img, effects_list, **dynamic_params)
+
+    # Get image dimensions
+    if not hasattr(img, 'shape'):
+        raise ValueError("Image must have shape attribute")
+    h, w = img.shape[:2]
+
+    # Create cache key: frame size plus a digest of ops and expressions ('src2' values are left out)
+    import hashlib
+    ops_key = str([(e['op'], {k: v for k, v in e.items() if k != 'src2'}) for e in effects_list])
+    expr_key = str(sorted(dynamic_expressions.items()))
+    cache_key = f"auto_{w}x{h}_{hashlib.md5((ops_key + expr_key).encode()).hexdigest()}"
+
+    # Compile or get cached pipeline
+    if cache_key not in _AUTONOMOUS_PIPELINE_CACHE:
+        _AUTONOMOUS_PIPELINE_CACHE[cache_key] = _compile_autonomous_pipeline(
+            effects_list, w, h, dynamic_expressions)
+    pipeline = _AUTONOMOUS_PIPELINE_CACHE[cache_key]
+
+    # Ensure image is on GPU
+    needs_upload = not hasattr(img, '__cuda_array_interface__') and GPU_AVAILABLE
+    gpu_img = cp.asarray(img) if needs_upload else img
+
+    # Run - just pass frame_num and fps, kernel does the rest!
+    return pipeline(gpu_img, int(frame_num), float(fps))
+
+
 # Add GPU-specific primitives
 PRIMITIVES['fused-pipeline'] = prim_fused_pipeline
+PRIMITIVES['autonomous-pipeline'] = prim_autonomous_pipeline
 # (The GPU video source will be added by create_cid_primitives in the task)