diff --git a/streaming/sexp_to_cuda.py b/streaming/sexp_to_cuda.py
index cb23058..9212b7c 100644
--- a/streaming/sexp_to_cuda.py
+++ b/streaming/sexp_to_cuda.py
@@ -316,35 +316,350 @@ def _build_params(effects: List[dict], dynamic_params: dict) -> cp.ndarray:
     return cp.array(params, dtype=cp.float32)
 
 
+def compile_autonomous_pipeline(effects: List[dict], width: int, height: int,
+                                 dynamic_expressions: dict = None) -> callable:
+    """
+    Compile a fully autonomous pipeline that computes ALL parameters on GPU.
+
+    This eliminates Python from the hot path - the kernel computes time-based
+    parameters (sin, cos, etc.) directly on GPU.
+
+    Args:
+        effects: List of effect dicts
+        width, height: Frame dimensions
+        dynamic_expressions: Dict mapping param names to expressions, e.g.:
+            {'rotate_angle': 't * 30',
+             'ripple_phase': 't * 2',
+             'brightness_factor': '0.8 + 0.4 * sin(t * 2)'}
+
+    Returns:
+        Callable that takes (frame, frame_num, fps) and returns output frame
+    """
+    if dynamic_expressions is None:
+        dynamic_expressions = {}
+
+    # Generate cache key
+    ops_key = str([(e['op'], {k:v for k,v in e.items() if k != 'src2'}) for e in effects])
+    expr_key = str(sorted(dynamic_expressions.items()))
+    cache_key = f"auto_{width}x{height}_{hashlib.md5((ops_key + expr_key).encode()).hexdigest()}"
+
+    if cache_key in _COMPILED_KERNELS:
+        return _COMPILED_KERNELS[cache_key]
+
+    # Generate autonomous kernel code
+    kernel_code = _generate_autonomous_kernel(effects, width, height, dynamic_expressions)
+
+    # Compile kernel
+    kernel = cp.RawKernel(kernel_code, 'autonomous_pipeline')
+
+    # Create wrapper function
+    def run_autonomous(frame: cp.ndarray, frame_num: int, fps: float = 30.0) -> cp.ndarray:
+        """Run the autonomous pipeline - no Python in the hot path!"""
+        if frame.dtype != cp.uint8:
+            frame = cp.clip(frame, 0, 255).astype(cp.uint8)
+        if not frame.flags['C_CONTIGUOUS']:
+            frame = cp.ascontiguousarray(frame)
+
+        output = cp.zeros_like(frame)
+
+        block = (16, 16)
+        grid = ((width + 15) // 16, (height + 15) // 16)
+
+        # Only pass frame_num and fps - kernel computes everything else!
+        t = float(frame_num) / float(fps)
+        kernel(grid, block, (frame, output, np.int32(width), np.int32(height),
+                            np.float32(t), np.int32(frame_num)))
+
+        return output
+
+    _COMPILED_KERNELS[cache_key] = run_autonomous
+    return run_autonomous
+
+
+def _generate_autonomous_kernel(effects: List[dict], width: int, height: int,
+                                 dynamic_expressions: dict) -> str:
+    """Generate CUDA kernel that computes everything autonomously."""
+
+    # Map simple expressions to CUDA code
+    def expr_to_cuda(expr: str) -> str:
+        """Convert simple expression to CUDA."""
+        expr = expr.replace('sin(', 'sinf(')
+        expr = expr.replace('cos(', 'cosf(')
+        expr = expr.replace('abs(', 'fabsf(')
+        return expr
+
+    code = r'''
+extern "C" __global__
+void autonomous_pipeline(
+    const unsigned char* src,
+    unsigned char* dst,
+    int width, int height,
+    float t, int frame_num
+) {
+    int x = blockIdx.x * blockDim.x + threadIdx.x;
+    int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (x >= width || y >= height) return;
+
+    // Compute dynamic parameters from time (ALL ON GPU!)
+'''
+
+    # Add dynamic parameter calculations
+    rotate_expr = dynamic_expressions.get('rotate_angle', '0.0f')
+    ripple_phase_expr = dynamic_expressions.get('ripple_phase', '0.0f')
+    brightness_expr = dynamic_expressions.get('brightness_factor', '1.0f')
+    zoom_expr = dynamic_expressions.get('zoom_amount', '1.0f')
+
+    code += f'''
+    float rotate_angle = {expr_to_cuda(rotate_expr)};
+    float ripple_phase = {expr_to_cuda(ripple_phase_expr)};
+    float brightness_factor = {expr_to_cuda(brightness_expr)};
+    float zoom_amount = {expr_to_cuda(zoom_expr)};
+
+    // Start with source coordinates
+    float src_x = (float)x;
+    float src_y = (float)y;
+    float cx = width / 2.0f;
+    float cy = height / 2.0f;
+
+    // Accumulated transforms
+    float total_cos = 1.0f, total_sin = 0.0f;
+    float total_zoom = 1.0f;
+    float ripple_dx = 0.0f, ripple_dy = 0.0f;
+
+'''
+
+    # Add effect-specific code
+    for i, effect in enumerate(effects):
+        op = effect['op']
+
+        if op == 'rotate':
+            code += f'''
+    // Rotate {i}
+    {{
+        float angle = rotate_angle * 3.14159265f / 180.0f;
+        float c = cosf(angle);
+        float s = sinf(angle);
+        float nc = total_cos * c - total_sin * s;
+        float ns = total_cos * s + total_sin * c;
+        total_cos = nc;
+        total_sin = ns;
+    }}
+'''
+        elif op == 'zoom':
+            code += f'''
+    // Zoom {i}
+    {{
+        total_zoom *= zoom_amount;
+    }}
+'''
+        elif op == 'ripple':
+            amp = effect.get('amplitude', 10)
+            freq = effect.get('frequency', 8)
+            decay = effect.get('decay', 2)
+            rcx = effect.get('center_x', width/2)
+            rcy = effect.get('center_y', height/2)
+            code += f'''
+    // Ripple {i}
+    {{
+        float amplitude = {amp}f;
+        float frequency = {freq}f;
+        float decay_val = {decay}f;
+        float rcx = {rcx}f;
+        float rcy = {rcy}f;
+
+        float rdx = src_x - rcx;
+        float rdy = src_y - rcy;
+        float dist = sqrtf(rdx * rdx + rdy * rdy);
+
+        float wave = sinf(dist * frequency * 0.1f + ripple_phase);
+        float amp = amplitude * expf(-dist * decay_val * 0.01f);
+
+        if (dist > 0.001f) {{
+            ripple_dx += rdx / dist * wave * amp;
+            ripple_dy += rdy / dist * wave * amp;
+        }}
+    }}
+'''
+
+    # Apply geometric transforms
+    code += '''
+    // Apply accumulated transforms
+    {
+        float dx = src_x - cx;
+        float dy = src_y - cy;
+        float rx = total_cos * dx + total_sin * dy;
+        float ry = -total_sin * dx + total_cos * dy;
+        rx /= total_zoom;
+        ry /= total_zoom;
+        src_x = rx + cx - ripple_dx;
+        src_y = ry + cy - ripple_dy;
+    }
+
+    // Bilinear sample
+    float r, g, b;
+    if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
+        r = g = b = 0;
+    } else {
+        int x0 = (int)src_x;
+        int y0 = (int)src_y;
+        float fx = src_x - x0;
+        float fy = src_y - y0;
+
+        int idx00 = (y0 * width + x0) * 3;
+        int idx10 = (y0 * width + x0 + 1) * 3;
+        int idx01 = ((y0 + 1) * width + x0) * 3;
+        int idx11 = ((y0 + 1) * width + x0 + 1) * 3;
+
+        #define BILERP(c) \\
+            (src[idx00 + c] * (1-fx) * (1-fy) + \\
+             src[idx10 + c] * fx * (1-fy) + \\
+             src[idx01 + c] * (1-fx) * fy + \\
+             src[idx11 + c] * fx * fy)
+
+        r = BILERP(0);
+        g = BILERP(1);
+        b = BILERP(2);
+    }
+
+'''
+
+    # Add color transforms
+    for i, effect in enumerate(effects):
+        op = effect['op']
+
+        if op == 'hue_shift':
+            degrees = effect.get('degrees', 0)
+            code += f'''
+    // Hue shift {i}
+    {{
+        float shift = {degrees}f;
+        float rf = r / 255.0f;
+        float gf = g / 255.0f;
+        float bf = b / 255.0f;
+
+        float max_c = fmaxf(rf, fmaxf(gf, bf));
+        float min_c = fminf(rf, fminf(gf, bf));
+        float delta = max_c - min_c;
+
+        float h = 0, s = 0, v = max_c;
+
+        if (delta > 0.00001f) {{
+            s = delta / max_c;
+            if (rf >= max_c) h = (gf - bf) / delta;
+            else if (gf >= max_c) h = 2.0f + (bf - rf) / delta;
+            else h = 4.0f + (rf - gf) / delta;
+            h *= 60.0f;
+            if (h < 0) h += 360.0f;
+        }}
+
+        h = fmodf(h + shift + 360.0f, 360.0f);
+
+        float c = v * s;
+        float x_val = c * (1 - fabsf(fmodf(h / 60.0f, 2.0f) - 1));
+        float m = v - c;
+
+        float r2, g2, b2;
+        if (h < 60) {{ r2 = c; g2 = x_val; b2 = 0; }}
+        else if (h < 120) {{ r2 = x_val; g2 = c; b2 = 0; }}
+        else if (h < 180) {{ r2 = 0; g2 = c; b2 = x_val; }}
+        else if (h < 240) {{ r2 = 0; g2 = x_val; b2 = c; }}
+        else if (h < 300) {{ r2 = x_val; g2 = 0; b2 = c; }}
+        else {{ r2 = c; g2 = 0; b2 = x_val; }}
+
+        r = (r2 + m) * 255.0f;
+        g = (g2 + m) * 255.0f;
+        b = (b2 + m) * 255.0f;
+    }}
+'''
+        elif op == 'brightness':
+            code += '''
+    // Brightness
+    {
+        r *= brightness_factor;
+        g *= brightness_factor;
+        b *= brightness_factor;
+    }
+'''
+
+    # Write output
+    code += '''
+    // Write output
+    int dst_idx = (y * width + x) * 3;
+    dst[dst_idx] = (unsigned char)fminf(255.0f, fmaxf(0.0f, r));
+    dst[dst_idx + 1] = (unsigned char)fminf(255.0f, fmaxf(0.0f, g));
+    dst[dst_idx + 2] = (unsigned char)fminf(255.0f, fmaxf(0.0f, b));
+}
+'''
+
+    return code
+
+
 # Test the compiler
 if __name__ == '__main__':
+    import time
+
     print("[sexp_to_cuda] Testing fused kernel compiler...")
+    print("=" * 60)
 
     # Define a test pipeline
     effects = [
         {'op': 'rotate', 'angle': 45.0},
-        {'op': 'zoom', 'amount': 1.2},
         {'op': 'hue_shift', 'degrees': 30.0},
-        {'op': 'ripple', 'amplitude': 10, 'frequency': 8, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
+        {'op': 'ripple', 'amplitude': 15, 'frequency': 10, 'decay': 2, 'phase': 0, 'center_x': 960, 'center_y': 540},
+        {'op': 'brightness', 'factor': 1.0},
     ]
 
-    # Compile
-    pipeline = compile_frame_pipeline(effects, 1920, 1080)
-
-    # Test with dummy frame
-    import time
     frame = cp.random.randint(0, 255, (1080, 1920, 3), dtype=cp.uint8)
 
+    # ===== Test 1: Standard fused kernel (params passed from Python) =====
+    print("\n[Test 1] Standard fused kernel (Python computes params)")
+    pipeline = compile_frame_pipeline(effects, 1920, 1080)
+
     # Warmup
     output = pipeline(frame)
     cp.cuda.Stream.null.synchronize()
 
-    # Benchmark
+    # Benchmark with Python param computation
     start = time.time()
     for i in range(100):
-        output = pipeline(frame, rotate_angle=i, ripple_phase=i*0.1)
+        # Simulate Python computing params (like sexp interpreter does)
+        import math
+        t = i / 30.0
+        angle = t * 30
+        phase = t * 2
+        brightness = 0.8 + 0.4 * math.sin(t * 2)
+        output = pipeline(frame, rotate_angle=angle, ripple_phase=phase)
     cp.cuda.Stream.null.synchronize()
     elapsed = time.time() - start
 
-    print(f"Fused kernel: {elapsed/100*1000:.2f}ms per frame")
-    print(f"That's {100/elapsed:.0f} fps potential!")
+    print(f"  Time: {elapsed/100*1000:.2f}ms per frame")
+    print(f"  FPS:  {100/elapsed:.0f}")
+
+    # ===== Test 2: Autonomous kernel (GPU computes everything) =====
+    print("\n[Test 2] Autonomous kernel (GPU computes ALL params)")
+
+    dynamic_expressions = {
+        'rotate_angle': 't * 30.0f',
+        'ripple_phase': 't * 2.0f',
+        'brightness_factor': '0.8f + 0.4f * sinf(t * 2.0f)',
+    }
+
+    auto_pipeline = compile_autonomous_pipeline(effects, 1920, 1080, dynamic_expressions)
+
+    # Warmup
+    output = auto_pipeline(frame, 0, 30.0)
+    cp.cuda.Stream.null.synchronize()
+
+    # Benchmark - NO Python computation in loop!
+    start = time.time()
+    for i in range(100):
+        output = auto_pipeline(frame, i, 30.0)  # Just pass frame_num!
+    cp.cuda.Stream.null.synchronize()
+    elapsed = time.time() - start
+
+    print(f"  Time: {elapsed/100*1000:.2f}ms per frame")
+    print(f"  FPS:  {100/elapsed:.0f}")
+
+    print("\n" + "=" * 60)
+    print("Autonomous kernel eliminates Python from hot path!")