Import L1 (celery) as l1/

2026-02-24 23:07:19 +00:00
parent 3ca1c14432 80c94ebea7
commit 4dff4cfafb
225 changed files with 57298 additions and 0 deletions
--- a/l1/sexp_effects/primitive_libs/init.py
+++ b/l1/sexp_effects/primitive_libs/init.py
@@ -0,0 +1,102 @@
+"""
+Primitive Libraries System
+
+Provides modular loading of primitives. Core primitives are always available,
+additional primitive libraries can be loaded on-demand with scoped availability.
+
+Usage in sexp:
+    ;; Load at recipe level - available throughout
+    (primitives math :path "primitive_libs/math.py")
+
+    ;; Or use with-primitives for scoped access
+    (with-primitives "image"
+      (blur frame 3))  ;; blur only available inside
+
+    ;; Nested scopes work
+    (with-primitives "math"
+      (with-primitives "color"
+        (hue-shift frame (* (sin t) 30))))
+
+Library file format (primitive_libs/math.py):
+    import math
+
+    def prim_sin(x): return math.sin(x)
+    def prim_cos(x): return math.cos(x)
+
+    PRIMITIVES = {
+        'sin': prim_sin,
+        'cos': prim_cos,
+    }
+"""
+
+import importlib.util
+from pathlib import Path
+from typing import Dict, Callable, Any, Optional
+
+# Cache of loaded primitive libraries.
+# load_primitive_library() keys entries by the explicit path when one is
+# given, otherwise by the library name.
+_library_cache: Dict[str, Dict[str, Any]] = {}
+
+# Core primitives - always available, cannot be overridden
+# NOTE(review): nothing in this module enforces "cannot be overridden";
+# register_core_primitive() replaces an existing entry of the same name.
+CORE_PRIMITIVES: Dict[str, Any] = {}
+
+
+def register_core_primitive(name: str, fn: Callable):
+    """Store ``fn`` in ``CORE_PRIMITIVES`` under ``name``.
+
+    An entry already registered under the same name is replaced.
+    """
+    CORE_PRIMITIVES.update({name: fn})
+
+
+def load_primitive_library(name: str, path: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Load a primitive library by name or path.
+
+    The library file is executed as a module and its module-level
+    ``PRIMITIVES`` object is returned. Results are cached in
+    ``_library_cache`` under ``path`` when given, otherwise under ``name``.
+
+    Args:
+        name: Library name (e.g., "math", "image", "color")
+        path: Optional explicit path to library file
+
+    Returns:
+        Dict of primitive name -> function
+
+    Raises:
+        ValueError: If the file does not exist, cannot be turned into an
+            importable module spec, or defines no ``PRIMITIVES``.
+    """
+    # Check cache first
+    cache_key = path or name
+    if cache_key in _library_cache:
+        return _library_cache[cache_key]
+
+    # Find library file
+    if path:
+        lib_path = Path(path)
+        if not lib_path.exists():
+            raise ValueError(f"Primitive library file not found: {lib_path}")
+    else:
+        # Look next to this module
+        lib_path = Path(__file__).parent / f"{name}.py"
+        if not lib_path.exists():
+            raise ValueError(f"Primitive library '{name}' not found at {lib_path}")
+
+    # Load the module. spec_from_file_location() returns None when no loader
+    # can be inferred from the file suffix, so fail with a clear message
+    # instead of an AttributeError on ``spec.loader``.
+    spec = importlib.util.spec_from_file_location(f"prim_lib_{name}", lib_path)
+    if spec is None or spec.loader is None:
+        raise ValueError(f"Primitive library '{name}' cannot be loaded from {lib_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    # Get PRIMITIVES dict from module
+    if not hasattr(module, 'PRIMITIVES'):
+        raise ValueError(f"Primitive library '{name}' missing PRIMITIVES dict")
+
+    primitives = module.PRIMITIVES
+
+    # Cache and return
+    _library_cache[cache_key] = primitives
+    return primitives
+
+
+def get_library_names() -> list:
+    """Return the stems of the ``*.py`` files beside this module, minus ``__init__``."""
+    names = []
+    for candidate in Path(__file__).parent.glob("*.py"):
+        stem = candidate.stem
+        if stem == "__init__":
+            continue
+        names.append(stem)
+    return names
+
+
+def clear_cache():
+    """Empty ``_library_cache`` in place so later loads re-execute library files."""
+    _library_cache.clear()
--- a/l1/sexp_effects/primitive_libs/arrays.py
+++ b/l1/sexp_effects/primitive_libs/arrays.py
@@ -0,0 +1,196 @@
+"""
+Array Primitives Library
+
+Vectorized operations on numpy arrays for coordinate transformations.
+"""
+import numpy as np
+
+
+# Arithmetic
+def prim_arr_add(a, b):
+    """Element-wise sum of ``a`` and ``b`` via ``np.add``."""
+    total = np.add(a, b)
+    return total
+
+
+def prim_arr_sub(a, b):
+    """Element-wise difference ``a - b`` via ``np.subtract``."""
+    difference = np.subtract(a, b)
+    return difference
+
+
+def prim_arr_mul(a, b):
+    """Element-wise product of ``a`` and ``b`` via ``np.multiply``."""
+    product = np.multiply(a, b)
+    return product
+
+
+def prim_arr_div(a, b):
+    """Element-wise true division ``a / b`` via ``np.divide``."""
+    quotient = np.divide(a, b)
+    return quotient
+
+
+def prim_arr_mod(a, b):
+    """Element-wise remainder of ``a`` by ``b`` via ``np.mod``."""
+    remainder = np.mod(a, b)
+    return remainder
+
+
+def prim_arr_neg(a):
+    """Element-wise negation of ``a`` via ``np.negative``."""
+    negated = np.negative(a)
+    return negated
+
+
+# Math functions
+def prim_arr_sin(a):
+    """Element-wise sine of ``a`` via ``np.sin``."""
+    sines = np.sin(a)
+    return sines
+
+
+def prim_arr_cos(a):
+    """Element-wise cosine of ``a`` via ``np.cos``."""
+    cosines = np.cos(a)
+    return cosines
+
+
+def prim_arr_tan(a):
+    """Element-wise tangent of ``a`` via ``np.tan``."""
+    tangents = np.tan(a)
+    return tangents
+
+
+def prim_arr_sqrt(a):
+    """Element-wise square root of ``a``; negative entries are raised to 0 first."""
+    non_negative = np.maximum(a, 0)
+    return np.sqrt(non_negative)
+
+
+def prim_arr_pow(a, b):
+    """Element-wise ``a`` raised to ``b`` via ``np.power``."""
+    powered = np.power(a, b)
+    return powered
+
+
+def prim_arr_abs(a):
+    """Element-wise absolute value of ``a`` via ``np.abs``."""
+    magnitudes = np.abs(a)
+    return magnitudes
+
+
+def prim_arr_exp(a):
+    """Element-wise natural exponential of ``a`` via ``np.exp``."""
+    exponentials = np.exp(a)
+    return exponentials
+
+
+def prim_arr_log(a):
+    """Element-wise natural log of ``a``; entries below 1e-10 are raised to 1e-10 first."""
+    floored = np.maximum(a, 1e-10)
+    return np.log(floored)
+
+
+def prim_arr_atan2(y, x):
+    """Element-wise ``np.arctan2(y, x)``."""
+    angles = np.arctan2(y, x)
+    return angles
+
+
+# Comparison / selection
+def prim_arr_min(a, b):
+    """Element-wise minimum of ``a`` and ``b`` via ``np.minimum``."""
+    smaller = np.minimum(a, b)
+    return smaller
+
+
+def prim_arr_max(a, b):
+    """Element-wise maximum of ``a`` and ``b`` via ``np.maximum``."""
+    larger = np.maximum(a, b)
+    return larger
+
+
+def prim_arr_clip(a, lo, hi):
+    """Limit every entry of ``a`` to the range ``[lo, hi]`` via ``np.clip``."""
+    limited = np.clip(a, lo, hi)
+    return limited
+
+
+def prim_arr_where(cond, a, b):
+    """Pick from ``a`` where ``cond`` is true and from ``b`` elsewhere (``np.where``)."""
+    chosen = np.where(cond, a, b)
+    return chosen
+
+
+def prim_arr_floor(a):
+    """Element-wise floor of ``a`` via ``np.floor``."""
+    floored = np.floor(a)
+    return floored
+
+
+def prim_arr_ceil(a):
+    """Element-wise ceiling of ``a`` via ``np.ceil``."""
+    ceiled = np.ceil(a)
+    return ceiled
+
+
+def prim_arr_round(a):
+    """Element-wise rounding of ``a`` via ``np.round``."""
+    rounded = np.round(a)
+    return rounded
+
+
+# Interpolation
+def prim_arr_lerp(a, b, t):
+    """Linear interpolation: ``a`` at ``t == 0``, ``b`` at ``t == 1``."""
+    span = b - a
+    return a + span * t
+
+
+def prim_arr_smoothstep(edge0, edge1, x):
+    """Smooth 0..1 ramp of ``x`` between ``edge0`` and ``edge1``.
+
+    The normalised position is clipped to [0, 1] and passed through the
+    polynomial ``t*t*(3 - 2*t)``.
+    """
+    ratio = (x - edge0) / (edge1 - edge0)
+    t = prim_arr_clip(ratio, 0.0, 1.0)
+    return t * t * (3 - 2 * t)
+
+
+# Creation
+def prim_arr_zeros(shape):
+    """Create a float32 array of zeros with the given ``shape``."""
+    return np.zeros(shape, np.float32)
+
+
+def prim_arr_ones(shape):
+    """Create a float32 array of ones with the given ``shape``."""
+    return np.ones(shape, np.float32)
+
+
+def prim_arr_full(shape, value):
+    """Create a float32 array of the given ``shape`` filled with ``value``."""
+    return np.full(shape, fill_value=value, dtype=np.float32)
+
+
+def prim_arr_arange(start, stop, step=1):
+    """Create a float32 ``np.arange`` from ``start`` up to ``stop`` in steps of ``step``."""
+    values = np.arange(start, stop, step, dtype=np.float32)
+    return values
+
+
+def prim_arr_linspace(start, stop, num):
+    """Create ``num`` evenly spaced float32 samples from ``start`` to ``stop``."""
+    samples = np.linspace(start, stop, num=num, dtype=np.float32)
+    return samples
+
+
+def prim_arr_meshgrid(x, y):
+    """Return the coordinate grids ``np.meshgrid(x, y)`` builds from ``x`` and ``y``."""
+    grids = np.meshgrid(x, y)
+    return grids
+
+
+# Coordinate transforms
+def prim_polar_from_center(map_x, map_y, cx, cy):
+    """Express Cartesian coordinates as polar ones about the point (cx, cy).
+
+    Returns:
+        Tuple ``(r, theta)``: distance from the centre and the
+        ``np.arctan2`` angle of the offset.
+    """
+    offset_x = map_x - cx
+    offset_y = map_y - cy
+    radius = np.sqrt(offset_x**2 + offset_y**2)
+    angle = np.arctan2(offset_y, offset_x)
+    return (radius, angle)
+
+
+def prim_cart_from_polar(r, theta, cx, cy):
+    """Turn polar coordinates back into Cartesian ones offset by (cx, cy).
+
+    Returns:
+        Tuple ``(x, y)``.
+    """
+    horizontal = r * np.cos(theta) + cx
+    vertical = r * np.sin(theta) + cy
+    return (horizontal, vertical)
+
+
+# Export table for this library: maps the primitive name used in
+# S-expressions to the Python function defined above that implements it.
+PRIMITIVES = {
+    # Arithmetic
+    'arr+': prim_arr_add,
+    'arr-': prim_arr_sub,
+    'arr*': prim_arr_mul,
+    'arr/': prim_arr_div,
+    'arr-mod': prim_arr_mod,
+    'arr-neg': prim_arr_neg,
+
+    # Math
+    'arr-sin': prim_arr_sin,
+    'arr-cos': prim_arr_cos,
+    'arr-tan': prim_arr_tan,
+    'arr-sqrt': prim_arr_sqrt,
+    'arr-pow': prim_arr_pow,
+    'arr-abs': prim_arr_abs,
+    'arr-exp': prim_arr_exp,
+    'arr-log': prim_arr_log,
+    'arr-atan2': prim_arr_atan2,
+
+    # Selection
+    'arr-min': prim_arr_min,
+    'arr-max': prim_arr_max,
+    'arr-clip': prim_arr_clip,
+    'arr-where': prim_arr_where,
+    'arr-floor': prim_arr_floor,
+    'arr-ceil': prim_arr_ceil,
+    'arr-round': prim_arr_round,
+
+    # Interpolation
+    'arr-lerp': prim_arr_lerp,
+    'arr-smoothstep': prim_arr_smoothstep,
+
+    # Creation
+    'arr-zeros': prim_arr_zeros,
+    'arr-ones': prim_arr_ones,
+    'arr-full': prim_arr_full,
+    'arr-arange': prim_arr_arange,
+    'arr-linspace': prim_arr_linspace,
+    'arr-meshgrid': prim_arr_meshgrid,
+
+    # Coordinates
+    'polar-from-center': prim_polar_from_center,
+    'cart-from-polar': prim_cart_from_polar,
+}
--- a/l1/sexp_effects/primitive_libs/ascii.py
+++ b/l1/sexp_effects/primitive_libs/ascii.py
@@ -0,0 +1,388 @@
+"""
+ASCII Art Primitives Library
+
+ASCII art rendering with per-zone expression evaluation and cell effects.
+"""
+import numpy as np
+import cv2
+from PIL import Image, ImageDraw, ImageFont
+from typing import Any, Dict, List, Optional, Callable
+import colorsys
+
+
+# Character sets
+# Named alphabets looked up by _luminance_to_char(); the luminance is mapped
+# to an index into the chosen string, so index 0 is used for the lowest
+# luminance and the last character for the highest.
+CHAR_SETS = {
+    "standard": " .:-=+*#%@",
+    "blocks": " ░▒▓█",
+    "simple": " .:oO@",
+    "digits": "0123456789",
+    "binary": "01",
+    "ascii": " `.-':_,^=;><+!rc*/z?sLTv)J7(|Fi{C}fI31tlu[neoZ5Yxjya]2ESwqkP6h9d4VpOGbUAKXHm8RD#$Bg0MNWQ%&@",
+}
+
+# Default font
+# NOTE(review): never given a non-None value in the code visible here.
+_default_font = None
+
+
+def _get_font(size: int):
+    """Get monospace font at given size.
+
+    Tries the DejaVu Sans Mono TrueType file at its usual Linux path and
+    falls back to Pillow's built-in default font when that cannot be
+    loaded. The result is cached per ``size`` on the function object,
+    because _render_char_cell() asks for a font once per rendered cell and
+    re-reading the font file each time is pure overhead.
+    """
+    cache = getattr(_get_font, "_cache", None)
+    if cache is None:
+        cache = _get_font._cache = {}
+
+    font = cache.get(size)
+    if font is None:
+        try:
+            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf", size)
+        except (OSError, ValueError, ImportError):
+            # OSError: font file missing/unreadable; ValueError: invalid size;
+            # ImportError: Pillow built without FreeType support.
+            font = ImageFont.load_default()
+        cache[size] = font
+    return font
+
+
+def _parse_color(color_str: str) -> tuple:
+    """Parse color string to RGB tuple.
+
+    Accepts ``#rgb`` / ``#rrggbb`` hex notation or one of a small set of
+    colour names (case-insensitive). Surrounding whitespace is ignored.
+
+    Returns:
+        ``(r, g, b)`` with each component in 0-255. Unknown names and
+        malformed hex strings both yield black ``(0, 0, 0)``, so a bad
+        colour never aborts rendering.
+    """
+    color_str = color_str.strip()
+
+    if color_str.startswith('#'):
+        hex_color = color_str[1:]
+        if len(hex_color) == 3:
+            # Expand shorthand: "#abc" -> "aabbcc"
+            hex_color = ''.join(c * 2 for c in hex_color)
+        try:
+            return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
+        except ValueError:
+            # Too short or non-hex digits: same fallback as unknown names.
+            return (0, 0, 0)
+
+    colors = {
+        'black': (0, 0, 0), 'white': (255, 255, 255),
+        'red': (255, 0, 0), 'green': (0, 255, 0), 'blue': (0, 0, 255),
+        'yellow': (255, 255, 0), 'cyan': (0, 255, 255), 'magenta': (255, 0, 255),
+        'gray': (128, 128, 128), 'grey': (128, 128, 128),
+    }
+    return colors.get(color_str.lower(), (0, 0, 0))
+
+
+def _cell_sample(frame: np.ndarray, cell_size: int):
+    """Reduce ``frame`` to one averaged color and one luminance per cell.
+
+    The frame is cropped to a whole number of ``cell_size`` blocks and
+    shrunk with ``cv2.resize`` using ``INTER_AREA``. Luminance is the
+    0.299/0.587/0.114 weighted sum of channels 0, 1 and 2, divided by 255
+    and stored as float32.
+
+    Returns:
+        Tuple ``(colors, luminances)``.
+    """
+    height, width = frame.shape[:2]
+    n_rows, n_cols = height // cell_size, width // cell_size
+
+    # Drop the partial cells on the bottom/right edges before averaging.
+    grid = frame[:n_rows * cell_size, :n_cols * cell_size]
+    colors = cv2.resize(grid, (n_cols, n_rows), interpolation=cv2.INTER_AREA)
+
+    ch0, ch1, ch2 = colors[:, :, 0], colors[:, :, 1], colors[:, :, 2]
+    weighted = 0.299 * ch0 + 0.587 * ch1 + 0.114 * ch2
+    luminances = (weighted / 255.0).astype(np.float32)
+
+    return colors, luminances
+
+
+def _luminance_to_char(lum: float, alphabet: str, contrast: float) -> str:
+    """Choose the character that represents luminance ``lum``.
+
+    ``alphabet`` is a ``CHAR_SETS`` key or, if not a key, is itself used as
+    the character string. ``contrast`` scales the luminance about 0.5
+    before it is clamped to [0, 1] and mapped to an index.
+    """
+    charset = CHAR_SETS.get(alphabet, alphabet)
+    adjusted = (lum - 0.5) * contrast + 0.5
+    clamped = max(0, min(1, adjusted))
+    return charset[int(clamped * (len(charset) - 1))]
+
+
+def _render_char_cell(char: str, cell_size: int, color: tuple, bg_color: tuple) -> np.ndarray:
+    """Draw ``char`` centered on a square ``cell_size`` RGB tile and return it as an array."""
+    tile = Image.new('RGB', (cell_size, cell_size), bg_color)
+    pen = ImageDraw.Draw(tile)
+    font = _get_font(cell_size)
+
+    # Measure the glyph so it can be centered in the tile.
+    left, top, right, bottom = pen.textbbox((0, 0), char, font=font)
+    x = (cell_size - (right - left)) // 2
+    y = (cell_size - (bottom - top)) // 2 - top
+
+    pen.text((x, y), char, fill=color, font=font)
+    return np.array(tile)
+
+
+def prim_ascii_fx_zone(
+    frame: np.ndarray,
+    cols: int = 80,
+    char_size: int = None,
+    alphabet: str = "standard",
+    color_mode: str = "color",
+    background: str = "black",
+    contrast: float = 1.5,
+    char_hue = None,
+    char_saturation = None,
+    char_brightness = None,
+    char_scale = None,
+    char_rotation = None,
+    char_jitter = None,
+    cell_effect = None,
+    energy: float = None,
+    rotation_scale: float = 0,
+    _interp = None,
+    _env = None,
+    **extra_params
+) -> np.ndarray:
+    """
+    Render frame as ASCII art with per-zone effects.
+
+    The frame is split into square cells; each cell is replaced by one
+    character chosen from its luminance, drawn on the background color,
+    optionally passed through ``cell_effect``, and pasted into the output.
+    The output is resized back to the input's height and width if the cell
+    grid does not cover it exactly.
+
+    Args:
+        frame: Input image
+        cols: Number of character columns
+        char_size: Cell size in pixels (overrides cols if set)
+        alphabet: Character set name or custom string
+        color_mode: "color", "mono", "invert", or color name
+        background: Background color name or hex
+        contrast: Contrast for character selection
+        char_hue/saturation/brightness/scale/rotation/jitter: Per-zone expressions
+            (accepted but not referenced anywhere in this function body)
+        cell_effect: Lambda (cell, zone) -> cell for per-cell effects
+            (only applied when ``_interp`` is not None)
+        energy: Energy value from audio analysis
+        rotation_scale: Max rotation degrees
+        _interp: Interpreter (auto-injected)
+        _env: Environment (auto-injected)
+        **extra_params: Additional params passed to zone dict
+            (only int/float/str/bool/None values are copied in)
+
+    Returns:
+        The rendered image as a uint8 array.
+    """
+    h, w = frame.shape[:2]
+
+    # Calculate cell size
+    # Cells are never smaller than 4 pixels, whichever way the size is chosen.
+    if char_size is None or char_size == 0:
+        cell_size = max(4, w // cols)
+    else:
+        cell_size = max(4, int(char_size))
+
+    # Sample cells
+    colors, luminances = _cell_sample(frame, cell_size)
+    rows, cols_actual = luminances.shape
+
+    # Parse background color
+    bg_color = _parse_color(background)
+
+    # Create output image
+    out_h = rows * cell_size
+    out_w = cols_actual * cell_size
+    output = np.full((out_h, out_w, 3), bg_color, dtype=np.uint8)
+
+    # Check if we have cell_effect
+    has_cell_effect = cell_effect is not None
+
+    # Process each cell
+    for r in range(rows):
+        for c in range(cols_actual):
+            lum = luminances[r, c]
+            cell_color = tuple(colors[r, c])
+
+            # Build zone context
+            # max(1, ...) keeps the normalised position defined for a
+            # single-row or single-column grid.
+            zone = {
+                'row': r,
+                'col': c,
+                'row-norm': r / max(1, rows - 1),
+                'col-norm': c / max(1, cols_actual - 1),
+                'lum': float(lum),
+                'r': cell_color[0] / 255,
+                'g': cell_color[1] / 255,
+                'b': cell_color[2] / 255,
+                'cell_size': cell_size,
+            }
+
+            # Add HSV
+            # Only hue (scaled to degrees) and saturation are stored.
+            r_f, g_f, b_f = cell_color[0]/255, cell_color[1]/255, cell_color[2]/255
+            hsv = colorsys.rgb_to_hsv(r_f, g_f, b_f)
+            zone['hue'] = hsv[0] * 360
+            zone['sat'] = hsv[1]
+
+            # Add energy and rotation_scale
+            if energy is not None:
+                zone['energy'] = energy
+            zone['rotation_scale'] = rotation_scale
+
+            # Add extra params
+            for k, v in extra_params.items():
+                if isinstance(v, (int, float, str, bool)) or v is None:
+                    zone[k] = v
+
+            # Get character
+            char = _luminance_to_char(lum, alphabet, contrast)
+            zone['char'] = char
+
+            # Determine cell color based on mode
+            # Any mode other than the three keywords is parsed as a color.
+            if color_mode == "mono":
+                render_color = (255, 255, 255)
+            elif color_mode == "invert":
+                render_color = tuple(255 - c for c in cell_color)
+            elif color_mode == "color":
+                render_color = cell_color
+            else:
+                render_color = _parse_color(color_mode)
+
+            zone['color'] = render_color
+
+            # Render character to cell
+            cell_img = _render_char_cell(char, cell_size, render_color, bg_color)
+
+            # Apply cell_effect if provided
+            if has_cell_effect and _interp is not None:
+                cell_img = _apply_cell_effect(cell_img, zone, cell_effect, _interp, _env, extra_params)
+
+            # Paste cell to output
+            y1, y2 = r * cell_size, (r + 1) * cell_size
+            x1, x2 = c * cell_size, (c + 1) * cell_size
+            output[y1:y2, x1:x2] = cell_img
+
+    # Resize to match input dimensions
+    if output.shape[:2] != frame.shape[:2]:
+        output = cv2.resize(output, (w, h), interpolation=cv2.INTER_LINEAR)
+
+    return output
+
+
+def _apply_cell_effect(cell_img, zone, cell_effect, interp, env, extra_params):
+    """Apply cell_effect lambda to a cell image.
+
+    cell_effect is a Lambda object with params and body.
+    We create a child environment with zone variables and cell,
+    then evaluate the lambda body.
+
+    Three forms of ``cell_effect`` are handled: an object exposing
+    ``params`` and ``body``, a raw list S-expression, and a plain Python
+    callable invoked as ``cell_effect(cell_img, zone)``.
+
+    Args:
+        cell_img: Rendered cell image.
+        zone: Dict of per-cell values; every entry is bound in the child
+            environment under its own key.
+        cell_effect: The effect to apply (see above).
+        interp: Interpreter; ``eval`` is used, and ``effects`` /
+            ``run_effect`` when an ``effects`` attribute is present.
+        env: Parent environment; its class is instantiated for the child.
+        extra_params: Not referenced in this function body.
+
+    Returns:
+        An image array with ``cell_img``'s shape (resized if necessary).
+
+    Raises:
+        ValueError: If ``cell_effect`` has an unsupported type or the
+            result is not a numpy array.
+    """
+    # Get Environment class from the interpreter's module
+    Environment = type(env)
+
+    # Create child environment with zone variables
+    cell_env = Environment(env)
+
+    # Bind zone variables
+    for k, v in zone.items():
+        cell_env.set(k, v)
+
+    # Also bind with zone- prefix for consistency
+    cell_env.set('zone-row', zone.get('row', 0))
+    cell_env.set('zone-col', zone.get('col', 0))
+    cell_env.set('zone-row-norm', zone.get('row-norm', 0))
+    cell_env.set('zone-col-norm', zone.get('col-norm', 0))
+    cell_env.set('zone-lum', zone.get('lum', 0))
+    cell_env.set('zone-sat', zone.get('sat', 0))
+    cell_env.set('zone-hue', zone.get('hue', 0))
+    cell_env.set('zone-r', zone.get('r', 0))
+    cell_env.set('zone-g', zone.get('g', 0))
+    cell_env.set('zone-b', zone.get('b', 0))
+
+    # Inject loaded effects as callable functions
+    if hasattr(interp, 'effects'):
+        for effect_name in interp.effects:
+            # The factory gives each wrapper its own ``name`` binding rather
+            # than sharing the loop variable.
+            def make_effect_fn(name):
+                def effect_fn(frame, *args):
+                    # Translate positional arguments into the named
+                    # parameters used for the known effects; other effects
+                    # run with an empty parameter dict.
+                    params = {}
+                    if name == 'blur' and len(args) >= 1:
+                        params['radius'] = args[0]
+                    elif name == 'rotate' and len(args) >= 1:
+                        params['angle'] = args[0]
+                    elif name == 'brightness' and len(args) >= 1:
+                        params['amount'] = args[0]
+                    elif name == 'contrast' and len(args) >= 1:
+                        params['amount'] = args[0]
+                    elif name == 'saturation' and len(args) >= 1:
+                        params['amount'] = args[0]
+                    elif name == 'hue_shift' and len(args) >= 1:
+                        params['degrees'] = args[0]
+                    elif name == 'rgb_split' and len(args) >= 2:
+                        params['offset_x'] = args[0]
+                        params['offset_y'] = args[1]
+                    elif name == 'pixelate' and len(args) >= 1:
+                        params['size'] = args[0]
+                    elif name == 'invert':
+                        pass
+                    # run_effect returns a pair; only the first item is used.
+                    result, _ = interp.run_effect(name, frame, params, {})
+                    return result
+                return effect_fn
+            cell_env.set(effect_name, make_effect_fn(effect_name))
+
+    # Bind cell image and zone dict
+    cell_env.set('cell', cell_img)
+    cell_env.set('zone', zone)
+
+    # Evaluate the cell_effect lambda
+    # Lambda has params and body - we need to bind the params then evaluate
+    if hasattr(cell_effect, 'params') and hasattr(cell_effect, 'body'):
+        # Bind lambda parameters: (lambda [cell zone] body)
+        if len(cell_effect.params) >= 1:
+            cell_env.set(cell_effect.params[0], cell_img)
+        if len(cell_effect.params) >= 2:
+            cell_env.set(cell_effect.params[1], zone)
+
+        result = interp.eval(cell_effect.body, cell_env)
+    elif isinstance(cell_effect, list):
+        # Raw S-expression lambda like (lambda [cell zone] body) or (fn [cell zone] body)
+        # Check if it's a lambda expression
+        head = cell_effect[0] if cell_effect else None
+        head_name = head.name if head and hasattr(head, 'name') else str(head) if head else None
+        is_lambda = head_name in ('lambda', 'fn')
+
+        if is_lambda:
+            # (lambda [params...] body)
+            params = cell_effect[1] if len(cell_effect) > 1 else []
+            body = cell_effect[2] if len(cell_effect) > 2 else None
+
+            # Bind lambda parameters
+            if isinstance(params, list) and len(params) >= 1:
+                param_name = params[0].name if hasattr(params[0], 'name') else str(params[0])
+                cell_env.set(param_name, cell_img)
+            if isinstance(params, list) and len(params) >= 2:
+                param_name = params[1].name if hasattr(params[1], 'name') else str(params[1])
+                cell_env.set(param_name, zone)
+
+            # A falsy body leaves the cell unchanged.
+            result = interp.eval(body, cell_env) if body else cell_img
+        else:
+            # Some other expression - just evaluate it
+            result = interp.eval(cell_effect, cell_env)
+    elif callable(cell_effect):
+        # It's a callable
+        result = cell_effect(cell_img, zone)
+    else:
+        raise ValueError(f"cell_effect must be a Lambda, list, or callable, got {type(cell_effect)}")
+
+    if isinstance(result, np.ndarray) and result.shape == cell_img.shape:
+        return result
+    elif isinstance(result, np.ndarray):
+        # Shape mismatch - resize to fit
+        result = cv2.resize(result, (cell_img.shape[1], cell_img.shape[0]))
+        return result
+
+    raise ValueError(f"cell_effect must return an image array, got {type(result)}")
+
+
+def _get_legacy_ascii_primitives():
+    """Import ASCII primitives from legacy primitives module.
+
+    These are loaded lazily to avoid import issues during module loading.
+    By the time a primitive library is loaded, sexp_effects.primitives
+    is already in sys.modules (imported by sexp_effects.__init__).
+
+    Returns:
+        Dict mapping hyphenated primitive names to the functions imported
+        from ``sexp_effects.primitives``.
+    """
+    # Function-scope import: resolved when this function is called, not
+    # when the module's top-level imports run.
+    from sexp_effects.primitives import (
+        prim_cell_sample,
+        prim_luminance_to_chars,
+        prim_render_char_grid,
+        prim_render_char_grid_fx,
+        prim_alphabet_char,
+        prim_alphabet_length,
+        prim_map_char_grid,
+        prim_map_colors,
+        prim_make_char_grid,
+        prim_set_char,
+        prim_get_char,
+        prim_char_grid_dimensions,
+        cell_sample_extended,
+    )
+    return {
+        'cell-sample': prim_cell_sample,
+        'cell-sample-extended': cell_sample_extended,
+        'luminance-to-chars': prim_luminance_to_chars,
+        'render-char-grid': prim_render_char_grid,
+        'render-char-grid-fx': prim_render_char_grid_fx,
+        'alphabet-char': prim_alphabet_char,
+        'alphabet-length': prim_alphabet_length,
+        'map-char-grid': prim_map_char_grid,
+        'map-colors': prim_map_colors,
+        'make-char-grid': prim_make_char_grid,
+        'set-char': prim_set_char,
+        'get-char': prim_get_char,
+        'char-grid-dimensions': prim_char_grid_dimensions,
+    }
+
+
+# Export table for this library: the zone renderer defined above plus the
+# legacy ASCII primitives, whose import runs when this statement executes.
+PRIMITIVES = {
+    'ascii-fx-zone': prim_ascii_fx_zone,
+    **_get_legacy_ascii_primitives(),
+}
--- a/l1/sexp_effects/primitive_libs/blending.py
+++ b/l1/sexp_effects/primitive_libs/blending.py
@@ -0,0 +1,116 @@
+"""
+Blending Primitives Library
+
+Image blending and compositing operations.
+"""
+import numpy as np
+
+
+def prim_blend_images(a, b, alpha):
+    """Cross-fade two images: ``a * (1 - alpha) + b * alpha``.
+
+    ``alpha`` is clamped to [0, 1]; the result is cast to uint8.
+    """
+    weight = min(1.0, alpha)
+    weight = max(0.0, weight)
+    mixed = a.astype(float) * (1 - weight) + b.astype(float) * weight
+    return mixed.astype(np.uint8)
+
+
+def prim_blend_mode(a, b, mode):
+    """Combine images ``a`` and ``b`` with the named blend mode.
+
+    Both images are scaled to 0..1 floats, combined, then scaled back to
+    uint8. An unrecognised ``mode`` returns ``b``.
+    """
+    bottom = a.astype(float) / 255
+    top = b.astype(float) / 255
+
+    if mode == "multiply":
+        blended = bottom * top
+    elif mode == "screen":
+        blended = 1 - (1 - bottom) * (1 - top)
+    elif mode == "overlay":
+        blended = np.where(bottom < 0.5,
+                           2 * bottom * top,
+                           1 - 2 * (1 - bottom) * (1 - top))
+    elif mode == "soft-light":
+        blended = np.where(top < 0.5,
+                           bottom - (1 - 2 * top) * bottom * (1 - bottom),
+                           bottom + (2 * top - 1) * (np.sqrt(bottom) - bottom))
+    elif mode == "hard-light":
+        blended = np.where(top < 0.5,
+                           2 * bottom * top,
+                           1 - 2 * (1 - bottom) * (1 - top))
+    elif mode == "color-dodge":
+        blended = np.clip(bottom / (1 - top + 0.001), 0, 1)
+    elif mode == "color-burn":
+        blended = 1 - np.clip((1 - bottom) / (top + 0.001), 0, 1)
+    elif mode == "difference":
+        blended = np.abs(bottom - top)
+    elif mode == "exclusion":
+        blended = bottom + top - 2 * bottom * top
+    elif mode == "add":
+        blended = np.clip(bottom + top, 0, 1)
+    elif mode == "subtract":
+        blended = np.clip(bottom - top, 0, 1)
+    elif mode == "darken":
+        blended = np.minimum(bottom, top)
+    elif mode == "lighten":
+        blended = np.maximum(bottom, top)
+    else:
+        # Unknown mode: behave like "normal" and show the top image.
+        blended = top
+
+    return (blended * 255).astype(np.uint8)
+
+
+def prim_mask(img, mask_img):
+    """Scale ``img`` by a grayscale mask (255 keeps the pixel, 0 blacks it out).
+
+    A 3-dimensional mask contributes only its first channel.
+    """
+    plane = mask_img[:, :, 0] if len(mask_img.shape) == 3 else mask_img
+    weights = (plane.astype(float) / 255)[:, :, np.newaxis]
+    return (img.astype(float) * weights).astype(np.uint8)
+
+
+def prim_alpha_composite(base, overlay, alpha_channel):
+    """Mix ``overlay`` over ``base`` with per-pixel weights from ``alpha_channel``.
+
+    The alpha values are divided by 255; a 3-dimensional alpha image
+    contributes only its first channel.
+    """
+    plane = alpha_channel[:, :, 0] if len(alpha_channel.shape) == 3 else alpha_channel
+    weights = (plane.astype(float) / 255)[:, :, np.newaxis]
+    mixed = base.astype(float) * (1 - weights) + overlay.astype(float) * weights
+    return mixed.astype(np.uint8)
+
+
+def prim_overlay(base, overlay, x, y, alpha=1.0):
+    """Overlay image at position (x, y) with optional alpha.
+
+    Args:
+        base: Image to draw onto (left unmodified; a copy is returned).
+        overlay: Image to place with its top-left corner at (x, y). Parts
+            falling outside ``base`` are clipped.
+        x, y: Position in ``base`` coordinates; converted with ``int()``
+            and may be negative.
+        alpha: Overlay opacity. Clamped to [0, 1], like
+            prim_blend_images, so out-of-range values cannot wrap around
+            when the blend is cast back to uint8.
+
+    Returns:
+        A copy of ``base`` with the visible part of ``overlay`` blended in.
+    """
+    alpha = max(0.0, min(1.0, float(alpha)))
+    result = base.copy()
+    x, y = int(x), int(y)
+    oh, ow = overlay.shape[:2]
+    bh, bw = base.shape[:2]
+
+    # Clip to bounds: (sx1, sy1)-(sx2, sy2) is the visible window inside the
+    # overlay, (dx1, dy1) its top-left corner inside the base image.
+    sx1 = max(0, -x)
+    sy1 = max(0, -y)
+    dx1 = max(0, x)
+    dy1 = max(0, y)
+    sx2 = min(ow, bw - x)
+    sy2 = min(oh, bh - y)
+
+    if sx2 > sx1 and sy2 > sy1:
+        src = overlay[sy1:sy2, sx1:sx2]
+        dst = result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)]
+        blended = (dst.astype(float) * (1 - alpha) + src.astype(float) * alpha)
+        result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)] = blended.astype(np.uint8)
+
+    return result
+
+
+# Export table for this library: maps the primitive name used in
+# S-expressions to the function defined above that implements it.
+PRIMITIVES = {
+    # Basic blending
+    'blend-images': prim_blend_images,
+    'blend-mode': prim_blend_mode,
+
+    # Masking
+    'mask': prim_mask,
+    'alpha-composite': prim_alpha_composite,
+
+    # Overlay
+    'overlay': prim_overlay,
+}
--- a/l1/sexp_effects/primitive_libs/blending_gpu.py
+++ b/l1/sexp_effects/primitive_libs/blending_gpu.py
@@ -0,0 +1,220 @@
+"""
+GPU-Accelerated Blending Primitives Library
+
+Uses CuPy for CUDA-accelerated image blending and compositing.
+Keeps frames on GPU when STREAMING_GPU_PERSIST=1 for maximum performance.
+"""
+import os
+import numpy as np
+
+# Try to import CuPy for GPU acceleration
+# When CuPy is missing, ``cp`` is bound to numpy so that ``cp.<name>``
+# references elsewhere in this module still resolve.
+# Note: both outcomes print a status line when the module is imported.
+try:
+    import cupy as cp
+    GPU_AVAILABLE = True
+    print("[blending_gpu] CuPy GPU acceleration enabled")
+except ImportError:
+    cp = np
+    GPU_AVAILABLE = False
+    print("[blending_gpu] CuPy not available, using CPU fallback")
+
+# GPU persistence mode - keep frames on GPU between operations
+# Enabled only when the environment variable is exactly "1"; read once at
+# import time.
+GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "0") == "1"
+if GPU_AVAILABLE and GPU_PERSIST:
+    print("[blending_gpu] GPU persistence enabled - frames stay on GPU")
+
+
+def _to_gpu(img):
+    """Return ``img`` as a CuPy array when CuPy is available, else unchanged."""
+    needs_upload = GPU_AVAILABLE and not isinstance(img, cp.ndarray)
+    return cp.asarray(img) if needs_upload else img
+
+
+def _to_cpu(img):
+    """Return ``img`` as a NumPy array unless GPU persistence is enabled.
+
+    With ``GPU_PERSIST`` set, or without CuPy, ``img`` is returned as is.
+    """
+    if GPU_PERSIST:
+        return img
+    if GPU_AVAILABLE and isinstance(img, cp.ndarray):
+        return cp.asnumpy(img)
+    return img
+
+
+def _get_xp(img):
+    """Return ``cp`` if ``img`` is a CuPy array and CuPy is available, else ``np``."""
+    on_gpu = GPU_AVAILABLE and isinstance(img, cp.ndarray)
+    return cp if on_gpu else np
+
+
+def prim_blend_images(a, b, alpha):
+    """Cross-fade two images: ``a * (1 - alpha) + b * alpha``.
+
+    ``alpha`` is converted to float and clamped to [0, 1]. With CuPy the
+    blend runs in float32 on the GPU; otherwise NumPy is used.
+    """
+    weight = max(0.0, min(1.0, float(alpha)))
+
+    if not GPU_AVAILABLE:
+        mixed = a.astype(float) * (1 - weight) + b.astype(float) * weight
+        return mixed.astype(np.uint8)
+
+    first = _to_gpu(a)
+    second = _to_gpu(b)
+    mixed = first.astype(cp.float32) * (1 - weight) + second.astype(cp.float32) * weight
+    return _to_cpu(mixed.astype(cp.uint8))
+
+
+def prim_blend_mode(a, b, mode):
+    """Combine images ``a`` and ``b`` with the named blend mode.
+
+    Both images are scaled to 0..1 floats (float32 CuPy arrays when CuPy
+    is available), combined, then scaled back to uint8. An unrecognised
+    ``mode`` returns ``b``.
+    """
+    xp = cp if GPU_AVAILABLE else np
+    if GPU_AVAILABLE:
+        bottom = _to_gpu(a).astype(cp.float32) / 255
+        top = _to_gpu(b).astype(cp.float32) / 255
+    else:
+        bottom = a.astype(float) / 255
+        top = b.astype(float) / 255
+
+    if mode == "multiply":
+        blended = bottom * top
+    elif mode == "screen":
+        blended = 1 - (1 - bottom) * (1 - top)
+    elif mode == "overlay":
+        blended = xp.where(bottom < 0.5,
+                           2 * bottom * top,
+                           1 - 2 * (1 - bottom) * (1 - top))
+    elif mode == "soft-light":
+        blended = xp.where(top < 0.5,
+                           bottom - (1 - 2 * top) * bottom * (1 - bottom),
+                           bottom + (2 * top - 1) * (xp.sqrt(bottom) - bottom))
+    elif mode == "hard-light":
+        blended = xp.where(top < 0.5,
+                           2 * bottom * top,
+                           1 - 2 * (1 - bottom) * (1 - top))
+    elif mode == "color-dodge":
+        blended = xp.clip(bottom / (1 - top + 0.001), 0, 1)
+    elif mode == "color-burn":
+        blended = 1 - xp.clip((1 - bottom) / (top + 0.001), 0, 1)
+    elif mode == "difference":
+        blended = xp.abs(bottom - top)
+    elif mode == "exclusion":
+        blended = bottom + top - 2 * bottom * top
+    elif mode == "add":
+        blended = xp.clip(bottom + top, 0, 1)
+    elif mode == "subtract":
+        blended = xp.clip(bottom - top, 0, 1)
+    elif mode == "darken":
+        blended = xp.minimum(bottom, top)
+    elif mode == "lighten":
+        blended = xp.maximum(bottom, top)
+    else:
+        # Unknown mode: behave like "normal" and show the top image.
+        blended = top
+
+    return _to_cpu((blended * 255).astype(xp.uint8))
+
+
+def prim_mask(img, mask_img):
+    """Scale ``img`` by a grayscale mask (255 keeps the pixel, 0 blacks it out).
+
+    A 3-dimensional mask contributes only its first channel. With CuPy the
+    work runs in float32 on the GPU; otherwise NumPy is used.
+    """
+    if not GPU_AVAILABLE:
+        plane = mask_img[:, :, 0] if len(mask_img.shape) == 3 else mask_img
+        weights = (plane.astype(float) / 255)[:, :, np.newaxis]
+        return (img.astype(float) * weights).astype(np.uint8)
+
+    image_dev = _to_gpu(img)
+    mask_dev = _to_gpu(mask_img)
+    plane = mask_dev[:, :, 0] if len(mask_dev.shape) == 3 else mask_dev
+    weights = (plane.astype(cp.float32) / 255)[:, :, cp.newaxis]
+    masked = (image_dev.astype(cp.float32) * weights).astype(cp.uint8)
+    return _to_cpu(masked)
+
+
+def prim_alpha_composite(base, overlay, alpha_channel):
+    """Mix ``overlay`` over ``base`` with per-pixel weights from ``alpha_channel``.
+
+    The alpha values are divided by 255; a 3-dimensional alpha image
+    contributes only its first channel. With CuPy the work runs in float32
+    on the GPU; otherwise NumPy is used.
+    """
+    if not GPU_AVAILABLE:
+        plane = alpha_channel[:, :, 0] if len(alpha_channel.shape) == 3 else alpha_channel
+        weights = (plane.astype(float) / 255)[:, :, np.newaxis]
+        mixed = base.astype(float) * (1 - weights) + overlay.astype(float) * weights
+        return mixed.astype(np.uint8)
+
+    base_dev = _to_gpu(base)
+    overlay_dev = _to_gpu(overlay)
+    alpha_dev = _to_gpu(alpha_channel)
+    plane = alpha_dev[:, :, 0] if len(alpha_dev.shape) == 3 else alpha_dev
+    weights = (plane.astype(cp.float32) / 255)[:, :, cp.newaxis]
+    mixed = base_dev.astype(cp.float32) * (1 - weights) + overlay_dev.astype(cp.float32) * weights
+    return _to_cpu(mixed.astype(cp.uint8))
+
+
+def prim_overlay(base, overlay, x, y, alpha=1.0):
+    """Overlay image at position (x, y) with optional alpha.
+
+    Args:
+        base: Image to draw onto (left unmodified; a copy is returned).
+        overlay: Image to place with its top-left corner at (x, y). Parts
+            falling outside ``base`` are clipped.
+        x, y: Position in ``base`` coordinates; converted with ``int()``
+            and may be negative.
+        alpha: Overlay opacity. Converted to float and clamped to [0, 1],
+            like prim_blend_images, so out-of-range values cannot wrap
+            around when the blend is cast back to uint8.
+
+    Returns:
+        A copy of ``base`` with the visible part of ``overlay`` blended in.
+        With CuPy the blend runs in float32 on the GPU and the result goes
+        through _to_cpu(); otherwise NumPy is used.
+    """
+    alpha = max(0.0, min(1.0, float(alpha)))
+
+    # One clipping/blending path for both backends; only the arrays and the
+    # working dtypes differ.
+    if GPU_AVAILABLE:
+        base = _to_gpu(base)
+        overlay = _to_gpu(overlay)
+        float_type, byte_type = cp.float32, cp.uint8
+    else:
+        float_type, byte_type = float, np.uint8
+
+    result = base.copy()
+    x, y = int(x), int(y)
+    oh, ow = overlay.shape[:2]
+    bh, bw = base.shape[:2]
+
+    # Clip to bounds: (sx1, sy1)-(sx2, sy2) is the visible window inside the
+    # overlay, (dx1, dy1) its top-left corner inside the base image.
+    sx1 = max(0, -x)
+    sy1 = max(0, -y)
+    dx1 = max(0, x)
+    dy1 = max(0, y)
+    sx2 = min(ow, bw - x)
+    sy2 = min(oh, bh - y)
+
+    if sx2 > sx1 and sy2 > sy1:
+        src = overlay[sy1:sy2, sx1:sx2]
+        dst = result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)]
+        blended = (dst.astype(float_type) * (1 - alpha) + src.astype(float_type) * alpha)
+        result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)] = blended.astype(byte_type)
+
+    # _to_cpu() returns its argument unchanged when CuPy is unavailable.
+    return _to_cpu(result)
+
+
+# Export table for this library: maps the primitive name used in
+# S-expressions to the GPU-aware function defined above that implements it.
+PRIMITIVES = {
+    # Basic blending
+    'blend-images': prim_blend_images,
+    'blend-mode': prim_blend_mode,
+
+    # Masking
+    'mask': prim_mask,
+    'alpha-composite': prim_alpha_composite,
+
+    # Overlay
+    'overlay': prim_overlay,
+}
--- a/l1/sexp_effects/primitive_libs/color.py
+++ b/l1/sexp_effects/primitive_libs/color.py
@@ -0,0 +1,137 @@
+"""
+Color Primitives Library
+
+Color manipulation: RGB, HSV, blending, luminance.
+"""
+import numpy as np
+import colorsys
+
+
+def prim_rgb(r, g, b):
+    """Create RGB color as [r, g, b] (0-255)."""
+    return [int(max(0, min(255, r))),
+            int(max(0, min(255, g))),
+            int(max(0, min(255, b)))]
+
+
+def prim_red(c):
+    """Return the first component (red) of color ``c``."""
+    return c[0]
+
+
+def prim_green(c):
+    """Return the second component (green) of color ``c``."""
+    return c[1]
+
+
+def prim_blue(c):
+    """Return the third component (blue) of color ``c``."""
+    return c[2]
+
+
+def prim_luminance(c):
+    """Perceived luminance (0-1) using standard weights."""
+    return (0.299 * c[0] + 0.587 * c[1] + 0.114 * c[2]) / 255
+
+
+def prim_rgb_to_hsv(c):
+    """Convert RGB [0-255] to HSV [h:0-360, s:0-1, v:0-1]."""
+    r, g, b = c[0] / 255, c[1] / 255, c[2] / 255
+    h, s, v = colorsys.rgb_to_hsv(r, g, b)
+    return [h * 360, s, v]
+
+
+def prim_hsv_to_rgb(hsv):
+    """Convert HSV [h:0-360, s:0-1, v:0-1] to RGB [0-255]."""
+    h, s, v = hsv[0] / 360, hsv[1], hsv[2]
+    r, g, b = colorsys.hsv_to_rgb(h, s, v)
+    return [int(r * 255), int(g * 255), int(b * 255)]
+
+
+def prim_rgb_to_hsl(c):
+    """Convert RGB [0-255] to HSL [h:0-360, s:0-1, l:0-1]."""
+    r, g, b = c[0] / 255, c[1] / 255, c[2] / 255
+    h, l, s = colorsys.rgb_to_hls(r, g, b)
+    return [h * 360, s, l]
+
+
+def prim_hsl_to_rgb(hsl):
+    """Convert HSL [h:0-360, s:0-1, l:0-1] to RGB [0-255]."""
+    h, s, l = hsl[0] / 360, hsl[1], hsl[2]
+    r, g, b = colorsys.hls_to_rgb(h, l, s)
+    return [int(r * 255), int(g * 255), int(b * 255)]
+
+
+def prim_blend_color(c1, c2, alpha):
+    """Blend two colors: c1 * (1-alpha) + c2 * alpha."""
+    return [int(c1[i] * (1 - alpha) + c2[i] * alpha) for i in range(3)]
+
+
+def prim_average_color(img):
+    """Get average color of an image."""
+    mean = np.mean(img, axis=(0, 1))
+    return [int(mean[0]), int(mean[1]), int(mean[2])]
+
+
+def prim_dominant_color(img, k=1):
+    """Get dominant color using k-means (simplified: just average for now).
+
+    ``k`` is currently ignored; the result is exactly ``prim_average_color(img)``.
+    """
+    return prim_average_color(img)
+
+
+def prim_invert_color(c):
+    """Invert a color."""
+    return [255 - c[0], 255 - c[1], 255 - c[2]]
+
+
+def prim_grayscale_color(c):
+    """Convert color to grayscale."""
+    gray = int(0.299 * c[0] + 0.587 * c[1] + 0.114 * c[2])
+    return [gray, gray, gray]
+
+
+def prim_saturate(c, amount):
+    """Adjust saturation of color. amount=0 is grayscale, 1 is unchanged, >1 is more saturated."""
+    hsv = prim_rgb_to_hsv(c)
+    hsv[1] = max(0, min(1, hsv[1] * amount))
+    return prim_hsv_to_rgb(hsv)
+
+
+def prim_brighten(c, amount):
+    """Adjust brightness. amount=0 is black, 1 is unchanged, >1 is brighter."""
+    return [int(max(0, min(255, c[i] * amount))) for i in range(3)]
+
+
+def prim_shift_hue(c, degrees):
+    """Shift hue by degrees."""
+    hsv = prim_rgb_to_hsv(c)
+    hsv[0] = (hsv[0] + degrees) % 360
+    return prim_hsv_to_rgb(hsv)
+
+
+# Registry exported by this library: sexp primitive name -> implementation.
+PRIMITIVES = {
+    # Construction
+    'rgb': prim_rgb,
+
+    # Component access
+    'red': prim_red,
+    'green': prim_green,
+    'blue': prim_blue,
+    'luminance': prim_luminance,
+
+    # Color space conversion
+    'rgb->hsv': prim_rgb_to_hsv,
+    'hsv->rgb': prim_hsv_to_rgb,
+    'rgb->hsl': prim_rgb_to_hsl,
+    'hsl->rgb': prim_hsl_to_rgb,
+
+    # Blending
+    'blend-color': prim_blend_color,
+
+    # Analysis
+    'average-color': prim_average_color,
+    'dominant-color': prim_dominant_color,
+
+    # Manipulation
+    'invert-color': prim_invert_color,
+    'grayscale-color': prim_grayscale_color,
+    'saturate': prim_saturate,
+    'brighten': prim_brighten,
+    'shift-hue': prim_shift_hue,
+}
--- a/l1/sexp_effects/primitive_libs/color_ops.py
+++ b/l1/sexp_effects/primitive_libs/color_ops.py
@@ -0,0 +1,109 @@
+"""
+Color Operations Primitives Library
+
+Vectorized color adjustments: brightness, contrast, saturation, invert, HSV.
+These operate on entire images for fast processing.
+"""
+import numpy as np
+import cv2
+
+
+def _to_numpy(img):
+    """Convert GPU frames or CuPy arrays to numpy for CPU processing."""
+    # Handle GPUFrame objects
+    if hasattr(img, 'cpu'):
+        return img.cpu
+    # Handle CuPy arrays
+    if hasattr(img, 'get'):
+        return img.get()
+    return img
+
+
+def prim_adjust(img, brightness=0, contrast=1):
+    """Adjust brightness and contrast. Brightness: -255 to 255, Contrast: 0 to 3+."""
+    img = _to_numpy(img)
+    result = (img.astype(np.float32) - 128) * contrast + 128 + brightness
+    return np.clip(result, 0, 255).astype(np.uint8)
+
+
+def prim_mix_gray(img_raw, amount):
+    """Mix image with its grayscale version. 0=original, 1=grayscale."""
+    img = _to_numpy(img_raw)
+    gray = 0.299 * img[:, :, 0] + 0.587 * img[:, :, 1] + 0.114 * img[:, :, 2]
+    gray_rgb = np.stack([gray, gray, gray], axis=-1)
+    result = img.astype(np.float32) * (1 - amount) + gray_rgb * amount
+    return np.clip(result, 0, 255).astype(np.uint8)
+
+
+def prim_invert_img(img):
+    """Invert all pixel values."""
+    img = _to_numpy(img)
+    return (255 - img).astype(np.uint8)
+
+
+def prim_shift_hsv(img, h=0, s=1, v=1):
+    """Shift HSV: h=degrees offset, s/v=multipliers."""
+    img = _to_numpy(img)
+    hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV).astype(np.float32)
+    hsv[:, :, 0] = (hsv[:, :, 0] + h / 2) % 180
+    hsv[:, :, 1] = np.clip(hsv[:, :, 1] * s, 0, 255)
+    hsv[:, :, 2] = np.clip(hsv[:, :, 2] * v, 0, 255)
+    return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)
+
+
+def prim_add_noise(img, amount):
+    """Add gaussian noise to image."""
+    img = _to_numpy(img)
+    noise = np.random.normal(0, amount, img.shape)
+    result = img.astype(np.float32) + noise
+    return np.clip(result, 0, 255).astype(np.uint8)
+
+
+def prim_quantize(img, levels):
+    """Reduce to N color levels per channel."""
+    img = _to_numpy(img)
+    levels = max(2, int(levels))
+    factor = 256 / levels
+    result = (img // factor) * factor + factor // 2
+    return np.clip(result, 0, 255).astype(np.uint8)
+
+
+def prim_sepia(img, intensity=1.0):
+    """Apply sepia tone effect."""
+    img = _to_numpy(img)
+    sepia_matrix = np.array([
+        [0.393, 0.769, 0.189],
+        [0.349, 0.686, 0.168],
+        [0.272, 0.534, 0.131]
+    ])
+    sepia = np.dot(img, sepia_matrix.T)
+    result = img.astype(np.float32) * (1 - intensity) + sepia * intensity
+    return np.clip(result, 0, 255).astype(np.uint8)
+
+
+def prim_grayscale(img):
+    """Convert to grayscale (still RGB output)."""
+    img = _to_numpy(img)
+    gray = 0.299 * img[:, :, 0] + 0.587 * img[:, :, 1] + 0.114 * img[:, :, 2]
+    return np.stack([gray, gray, gray], axis=-1).astype(np.uint8)
+
+
+# Registry exported by this library: sexp primitive name -> implementation.
+PRIMITIVES = {
+    # Brightness/Contrast
+    'adjust': prim_adjust,
+
+    # Saturation
+    'mix-gray': prim_mix_gray,
+    'grayscale': prim_grayscale,
+
+    # HSV manipulation
+    'shift-hsv': prim_shift_hsv,
+
+    # Inversion
+    'invert-img': prim_invert_img,
+
+    # Effects
+    'add-noise': prim_add_noise,
+    'quantize': prim_quantize,
+    'sepia': prim_sepia,
+}
--- a/l1/sexp_effects/primitive_libs/color_ops_gpu.py
+++ b/l1/sexp_effects/primitive_libs/color_ops_gpu.py
@@ -0,0 +1,280 @@
+"""
+GPU-Accelerated Color Operations Library
+
+Uses CuPy for CUDA-accelerated color transforms.
+
+Performance Mode:
+- Set STREAMING_GPU_PERSIST=1 to keep frames on GPU between operations
+- This dramatically improves performance by avoiding CPU<->GPU transfers
+"""
+import os
+import numpy as np
+
+# Try to import CuPy for GPU acceleration.
+# On failure `cp` is aliased to numpy so later `cp.` references still resolve,
+# and GPU_AVAILABLE records which backend is active. Note that this prints a
+# status line at import time.
+try:
+    import cupy as cp
+    GPU_AVAILABLE = True
+    print("[color_ops_gpu] CuPy GPU acceleration enabled")
+except ImportError:
+    cp = np
+    GPU_AVAILABLE = False
+    print("[color_ops_gpu] CuPy not available, using CPU fallback")
+
+# GPU persistence mode - keep frames on GPU between operations.
+# Enabled only when the STREAMING_GPU_PERSIST environment variable is exactly "1".
+GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "0") == "1"
+if GPU_AVAILABLE and GPU_PERSIST:
+    print("[color_ops_gpu] GPU persistence enabled - frames stay on GPU")
+
+
+def _to_gpu(img):
+    """Move image to GPU if available."""
+    if GPU_AVAILABLE and not isinstance(img, cp.ndarray):
+        return cp.asarray(img)
+    return img
+
+
+def _to_cpu(img):
+    """Move image back to CPU (only if GPU_PERSIST is disabled)."""
+    if not GPU_PERSIST and GPU_AVAILABLE and isinstance(img, cp.ndarray):
+        return cp.asnumpy(img)
+    return img
+
+
+def prim_invert(img):
+    """Invert image colors."""
+    if GPU_AVAILABLE:
+        img_gpu = _to_gpu(img)
+        return _to_cpu(255 - img_gpu)
+    return 255 - img
+
+
+def prim_grayscale(img):
+    """Convert to grayscale."""
+    if img.ndim != 3:
+        return img
+
+    if GPU_AVAILABLE:
+        img_gpu = _to_gpu(img.astype(np.float32))
+        # Standard luminance weights
+        gray = 0.299 * img_gpu[:, :, 0] + 0.587 * img_gpu[:, :, 1] + 0.114 * img_gpu[:, :, 2]
+        gray = cp.clip(gray, 0, 255).astype(cp.uint8)
+        # Stack to 3 channels
+        result = cp.stack([gray, gray, gray], axis=2)
+        return _to_cpu(result)
+
+    gray = 0.299 * img[:, :, 0] + 0.587 * img[:, :, 1] + 0.114 * img[:, :, 2]
+    gray = np.clip(gray, 0, 255).astype(np.uint8)
+    return np.stack([gray, gray, gray], axis=2)
+
+
+def prim_brightness(img, factor=1.0):
+    """Adjust brightness by factor."""
+    xp = cp if GPU_AVAILABLE else np
+    if GPU_AVAILABLE:
+        img_gpu = _to_gpu(img.astype(np.float32))
+        result = xp.clip(img_gpu * factor, 0, 255).astype(xp.uint8)
+        return _to_cpu(result)
+    return np.clip(img.astype(np.float32) * factor, 0, 255).astype(np.uint8)
+
+
+def prim_contrast(img, factor=1.0):
+    """Adjust contrast around midpoint."""
+    xp = cp if GPU_AVAILABLE else np
+    if GPU_AVAILABLE:
+        img_gpu = _to_gpu(img.astype(np.float32))
+        result = xp.clip((img_gpu - 128) * factor + 128, 0, 255).astype(xp.uint8)
+        return _to_cpu(result)
+    return np.clip((img.astype(np.float32) - 128) * factor + 128, 0, 255).astype(np.uint8)
+
+
+# CUDA kernel for HSV hue shift
+# Only defined when CuPy imported successfully. Each thread handles one pixel
+# of an interleaved 3-bytes-per-pixel buffer (index = (y * width + x) * 3) and
+# overwrites it in place: RGB -> HSV, add `shift` degrees to the hue (wrapped
+# into [0, 360)), then HSV -> RGB. Threads outside width x height exit early.
+if GPU_AVAILABLE:
+    _hue_shift_kernel = cp.RawKernel(r'''
+    extern "C" __global__
+    void hue_shift(unsigned char* img, int width, int height, float shift) {
+        int x = blockDim.x * blockIdx.x + threadIdx.x;
+        int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x >= width || y >= height) return;
+
+        int idx = (y * width + x) * 3;
+
+        // Get RGB
+        float r = img[idx] / 255.0f;
+        float g = img[idx + 1] / 255.0f;
+        float b = img[idx + 2] / 255.0f;
+
+        // RGB to HSV
+        float max_c = fmaxf(r, fmaxf(g, b));
+        float min_c = fminf(r, fminf(g, b));
+        float delta = max_c - min_c;
+
+        float h = 0.0f, s = 0.0f, v = max_c;
+
+        if (delta > 0.00001f) {
+            s = delta / max_c;
+
+            if (max_c == r) {
+                h = 60.0f * fmodf((g - b) / delta, 6.0f);
+            } else if (max_c == g) {
+                h = 60.0f * ((b - r) / delta + 2.0f);
+            } else {
+                h = 60.0f * ((r - g) / delta + 4.0f);
+            }
+
+            if (h < 0) h += 360.0f;
+        }
+
+        // Shift hue
+        h = fmodf(h + shift, 360.0f);
+        if (h < 0) h += 360.0f;
+
+        // HSV to RGB
+        float c = v * s;
+        float x_val = c * (1.0f - fabsf(fmodf(h / 60.0f, 2.0f) - 1.0f));
+        float m = v - c;
+
+        float r_out, g_out, b_out;
+        if (h < 60) {
+            r_out = c; g_out = x_val; b_out = 0;
+        } else if (h < 120) {
+            r_out = x_val; g_out = c; b_out = 0;
+        } else if (h < 180) {
+            r_out = 0; g_out = c; b_out = x_val;
+        } else if (h < 240) {
+            r_out = 0; g_out = x_val; b_out = c;
+        } else if (h < 300) {
+            r_out = x_val; g_out = 0; b_out = c;
+        } else {
+            r_out = c; g_out = 0; b_out = x_val;
+        }
+
+        img[idx] = (unsigned char)fminf(255.0f, (r_out + m) * 255.0f);
+        img[idx + 1] = (unsigned char)fminf(255.0f, (g_out + m) * 255.0f);
+        img[idx + 2] = (unsigned char)fminf(255.0f, (b_out + m) * 255.0f);
+    }
+    ''', 'hue_shift')
+
+
+def prim_hue_shift(img, shift=0.0):
+    """Shift hue by degrees."""
+    if img.ndim != 3 or img.shape[2] != 3:
+        return img
+
+    if not GPU_AVAILABLE:
+        import cv2
+        hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
+        hsv[:, :, 0] = (hsv[:, :, 0].astype(np.float32) + shift / 2) % 180
+        return cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
+
+    h, w = img.shape[:2]
+    img_gpu = _to_gpu(img.astype(np.uint8)).copy()
+
+    block = (16, 16)
+    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
+
+    _hue_shift_kernel(grid, block, (img_gpu, np.int32(w), np.int32(h), np.float32(shift)))
+
+    return _to_cpu(img_gpu)
+
+
+def prim_saturate(img, factor=1.0):
+    """Adjust saturation by factor."""
+    if img.ndim != 3:
+        return img
+
+    if not GPU_AVAILABLE:
+        import cv2
+        hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV).astype(np.float32)
+        hsv[:, :, 1] = np.clip(hsv[:, :, 1] * factor, 0, 255)
+        return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)
+
+    # GPU version - simple desaturation blend
+    img_gpu = _to_gpu(img.astype(np.float32))
+    gray = 0.299 * img_gpu[:, :, 0] + 0.587 * img_gpu[:, :, 1] + 0.114 * img_gpu[:, :, 2]
+    gray = gray[:, :, cp.newaxis]
+
+    if factor < 1.0:
+        # Desaturate: blend toward gray
+        result = img_gpu * factor + gray * (1 - factor)
+    else:
+        # Oversaturate: extrapolate away from gray
+        result = gray + (img_gpu - gray) * factor
+
+    result = cp.clip(result, 0, 255).astype(cp.uint8)
+    return _to_cpu(result)
+
+
+def prim_blend(img1, img2, alpha=0.5):
+    """Blend two images with alpha."""
+    xp = cp if GPU_AVAILABLE else np
+
+    if GPU_AVAILABLE:
+        img1_gpu = _to_gpu(img1.astype(np.float32))
+        img2_gpu = _to_gpu(img2.astype(np.float32))
+        result = img1_gpu * (1 - alpha) + img2_gpu * alpha
+        result = xp.clip(result, 0, 255).astype(xp.uint8)
+        return _to_cpu(result)
+
+    result = img1.astype(np.float32) * (1 - alpha) + img2.astype(np.float32) * alpha
+    return np.clip(result, 0, 255).astype(np.uint8)
+
+
+def prim_add(img1, img2):
+    """Add two images (clamped)."""
+    xp = cp if GPU_AVAILABLE else np
+    if GPU_AVAILABLE:
+        result = xp.clip(_to_gpu(img1).astype(np.int16) + _to_gpu(img2).astype(np.int16), 0, 255)
+        return _to_cpu(result.astype(xp.uint8))
+    return np.clip(img1.astype(np.int16) + img2.astype(np.int16), 0, 255).astype(np.uint8)
+
+
+def prim_multiply(img1, img2):
+    """Multiply two images (normalized)."""
+    xp = cp if GPU_AVAILABLE else np
+    if GPU_AVAILABLE:
+        result = (_to_gpu(img1).astype(np.float32) * _to_gpu(img2).astype(np.float32)) / 255.0
+        result = xp.clip(result, 0, 255).astype(xp.uint8)
+        return _to_cpu(result)
+    result = (img1.astype(np.float32) * img2.astype(np.float32)) / 255.0
+    return np.clip(result, 0, 255).astype(np.uint8)
+
+
+def prim_screen(img1, img2):
+    """Screen blend mode."""
+    xp = cp if GPU_AVAILABLE else np
+    if GPU_AVAILABLE:
+        i1 = _to_gpu(img1).astype(np.float32) / 255.0
+        i2 = _to_gpu(img2).astype(np.float32) / 255.0
+        result = 1.0 - (1.0 - i1) * (1.0 - i2)
+        result = xp.clip(result * 255, 0, 255).astype(xp.uint8)
+        return _to_cpu(result)
+    i1 = img1.astype(np.float32) / 255.0
+    i2 = img2.astype(np.float32) / 255.0
+    result = 1.0 - (1.0 - i1) * (1.0 - i2)
+    return np.clip(result * 255, 0, 255).astype(np.uint8)
+
+
+# Import CPU primitives as fallbacks
+def _get_cpu_primitives():
+    """Get all primitives from CPU color_ops module as fallbacks.
+
+    The import happens inside the function, so color_ops is only loaded when
+    this is called. Returns the module's own PRIMITIVES dict (not a copy).
+    """
+    from sexp_effects.primitive_libs import color_ops
+    return color_ops.PRIMITIVES
+
+
+# Export functions - start with CPU primitives, then override with GPU versions.
+# The copy keeps color_ops.PRIMITIVES itself untouched by the update below.
+PRIMITIVES = _get_cpu_primitives().copy()
+
+# Override specific primitives with GPU-accelerated versions
+# (a name already present from the CPU table is replaced; new names are added)
+PRIMITIVES.update({
+    'invert': prim_invert,
+    'grayscale': prim_grayscale,
+    'brightness': prim_brightness,
+    'contrast': prim_contrast,
+    'hue-shift': prim_hue_shift,
+    'saturate': prim_saturate,
+    'blend': prim_blend,
+    'add': prim_add,
+    'multiply': prim_multiply,
+    'screen': prim_screen,
+})
--- a/l1/sexp_effects/primitive_libs/core.py
+++ b/l1/sexp_effects/primitive_libs/core.py
@@ -0,0 +1,294 @@
+"""
+Core Primitives - Always available, minimal essential set.
+
+These are the primitives that form the foundation of the language.
+They cannot be overridden by libraries.
+"""
+
+
+# Arithmetic
+def prim_add(*args):
+    if len(args) == 0:
+        return 0
+    result = args[0]
+    for arg in args[1:]:
+        result = result + arg
+    return result
+
+
+def prim_sub(a, b=None):
+    if b is None:
+        return -a
+    return a - b
+
+
+def prim_mul(*args):
+    if len(args) == 0:
+        return 1
+    result = args[0]
+    for arg in args[1:]:
+        result = result * arg
+    return result
+
+
+def prim_div(a, b):
+    """Return ``a / b`` (true division; a zero divisor raises as in plain Python)."""
+    return a / b
+
+
+def prim_mod(a, b):
+    """Return ``a % b`` using Python's modulo semantics."""
+    return a % b
+
+
+def prim_abs(x):
+    """Return the absolute value of ``x``."""
+    return abs(x)
+
+
+def prim_min(*args):
+    """Return the smallest of the given arguments.
+
+    The arguments themselves are compared, so a single list argument is
+    returned as-is rather than searched. Raises ValueError with no arguments.
+    """
+    return min(args)
+
+
+def prim_max(*args):
+    """Return the largest of the given arguments.
+
+    The arguments themselves are compared, so a single list argument is
+    returned as-is rather than searched. Raises ValueError with no arguments.
+    """
+    return max(args)
+
+
+def prim_round(x):
+    import numpy as np
+    if hasattr(x, '_data'):  # Xector
+        from .xector import Xector
+        return Xector(np.round(x._data), x._shape)
+    if isinstance(x, np.ndarray):
+        return np.round(x)
+    return round(x)
+
+
+def prim_floor(x):
+    import numpy as np
+    if hasattr(x, '_data'):  # Xector
+        from .xector import Xector
+        return Xector(np.floor(x._data), x._shape)
+    if isinstance(x, np.ndarray):
+        return np.floor(x)
+    import math
+    return math.floor(x)
+
+
+def prim_ceil(x):
+    import numpy as np
+    if hasattr(x, '_data'):  # Xector
+        from .xector import Xector
+        return Xector(np.ceil(x._data), x._shape)
+    if isinstance(x, np.ndarray):
+        return np.ceil(x)
+    import math
+    return math.ceil(x)
+
+
+# Comparison
+def prim_lt(a, b):
+    """Return ``a < b``."""
+    return a < b
+
+
+def prim_gt(a, b):
+    """Return ``a > b``."""
+    return a > b
+
+
+def prim_le(a, b):
+    """Return ``a <= b``."""
+    return a <= b
+
+
+def prim_ge(a, b):
+    """Return ``a >= b``."""
+    return a >= b
+
+
+def prim_eq(a, b):
+    if isinstance(a, float) or isinstance(b, float):
+        return abs(a - b) < 1e-9
+    return a == b
+
+
+def prim_ne(a, b):
+    """Return the logical negation of ``prim_eq(a, b)``."""
+    return not prim_eq(a, b)
+
+
+# Logic
+def prim_not(x):
+    """Return True when ``x`` is falsy, False otherwise."""
+    return not x
+
+
+def prim_and(*args):
+    for a in args:
+        if not a:
+            return False
+    return True
+
+
+def prim_or(*args):
+    for a in args:
+        if a:
+            return True
+    return False
+
+
+# Basic data access
+def prim_get(obj, key, default=None):
+    """Get value from dict or list."""
+    if isinstance(obj, dict):
+        return obj.get(key, default)
+    elif isinstance(obj, (list, tuple)):
+        try:
+            return obj[int(key)]
+        except (IndexError, ValueError):
+            return default
+    return default
+
+
+def prim_nth(seq, i):
+    i = int(i)
+    if 0 <= i < len(seq):
+        return seq[i]
+    return None
+
+
+def prim_first(seq):
+    return seq[0] if seq else None
+
+
+def prim_length(seq):
+    """Return ``len(seq)``."""
+    return len(seq)
+
+
+def prim_list(*args):
+    """Return the arguments collected into a new list."""
+    return list(args)
+
+
+# Type checking
+def prim_is_number(x):
+    """Return True for int or float instances (bool counts, being an int subclass)."""
+    return isinstance(x, (int, float))
+
+
+def prim_is_string(x):
+    """Return True when ``x`` is a str."""
+    return isinstance(x, str)
+
+
+def prim_is_list(x):
+    """Return True when ``x`` is a list or a tuple."""
+    return isinstance(x, (list, tuple))
+
+
+def prim_is_dict(x):
+    """Return True when ``x`` is a dict."""
+    return isinstance(x, dict)
+
+
+def prim_is_nil(x):
+    """Return True only when ``x`` is None."""
+    return x is None
+
+
+# Higher-order / iteration
+def prim_reduce(seq, init, fn):
+    """(reduce seq init fn) — fold left: fn(fn(fn(init, s0), s1), s2) ..."""
+    acc = init
+    for item in seq:
+        acc = fn(acc, item)
+    return acc
+
+
+def prim_map(seq, fn):
+    """(map seq fn) — apply fn to each element, return new list."""
+    return [fn(item) for item in seq]
+
+
+def prim_range(*args):
+    """(range end), (range start end), or (range start end step) — integer range."""
+    if len(args) == 1:
+        return list(range(int(args[0])))
+    elif len(args) == 2:
+        return list(range(int(args[0]), int(args[1])))
+    elif len(args) >= 3:
+        return list(range(int(args[0]), int(args[1]), int(args[2])))
+    return []
+
+
+# Random
+# Module-level generator shared by the rand primitives below;
+# set_random_seed() swaps it for a seeded instance.
+import random
+_rng = random.Random()
+
+def set_random_seed(seed):
+    """Set the random seed for deterministic output.
+
+    Rebinds the module-level ``_rng`` to a fresh ``random.Random(seed)``.
+    """
+    global _rng
+    _rng = random.Random(seed)
+
+def prim_rand():
+    """Return random float in [0, 1), drawn from the module-level ``_rng``."""
+    return _rng.random()
+
+def prim_rand_int(lo, hi):
+    """Return random integer in [lo, hi].
+
+    Both bounds are converted with ``int()`` and are inclusive.
+    """
+    return _rng.randint(int(lo), int(hi))
+
+def prim_rand_range(lo, hi):
+    """Return random float in [lo, hi), computed as ``lo + r * (hi - lo)`` with r in [0, 1)."""
+    return lo + _rng.random() * (hi - lo)
+
+def prim_map_range(val, from_lo, from_hi, to_lo, to_hi):
+    """Map value from one range to another."""
+    if from_hi == from_lo:
+        return to_lo
+    t = (val - from_lo) / (from_hi - from_lo)
+    return to_lo + t * (to_hi - to_lo)
+
+
+# Core primitives dict: sexp primitive name -> implementation.
+# Some functions are registered under more than one name
+# ('length'/'len', 'nil?'/'is-nil', 'reduce'/'fold').
+PRIMITIVES = {
+    # Arithmetic
+    '+': prim_add,
+    '-': prim_sub,
+    '*': prim_mul,
+    '/': prim_div,
+    'mod': prim_mod,
+    'abs': prim_abs,
+    'min': prim_min,
+    'max': prim_max,
+    'round': prim_round,
+    'floor': prim_floor,
+    'ceil': prim_ceil,
+
+    # Comparison
+    '<': prim_lt,
+    '>': prim_gt,
+    '<=': prim_le,
+    '>=': prim_ge,
+    '=': prim_eq,
+    '!=': prim_ne,
+
+    # Logic
+    'not': prim_not,
+    'and': prim_and,
+    'or': prim_or,
+
+    # Data access
+    'get': prim_get,
+    'nth': prim_nth,
+    'first': prim_first,
+    'length': prim_length,
+    'len': prim_length,
+    'list': prim_list,
+
+    # Type predicates
+    'number?': prim_is_number,
+    'string?': prim_is_string,
+    'list?': prim_is_list,
+    'dict?': prim_is_dict,
+    'nil?': prim_is_nil,
+    'is-nil': prim_is_nil,
+
+    # Higher-order / iteration
+    'reduce': prim_reduce,
+    'fold': prim_reduce,
+    'map': prim_map,
+    'range': prim_range,
+
+    # Random
+    'rand': prim_rand,
+    'rand-int': prim_rand_int,
+    'rand-range': prim_rand_range,
+    'map-range': prim_map_range,
+}
--- a/l1/sexp_effects/primitive_libs/drawing.py
+++ b/l1/sexp_effects/primitive_libs/drawing.py
@@ -0,0 +1,690 @@
+"""
+Drawing Primitives Library
+
+Draw shapes, text, and characters on images with sophisticated text handling.
+
+Text Features:
+- Font loading from files or system fonts
+- Text measurement and fitting
+- Alignment (left/center/right, top/middle/bottom)
+- Opacity for fade effects
+- Multi-line text support
+- Shadow and outline effects
+"""
+import numpy as np
+import cv2
+from PIL import Image, ImageDraw, ImageFont
+import os
+import glob as glob_module
+from typing import Optional, Tuple, List, Union
+
+
+# =============================================================================
+# Font Management
+# =============================================================================
+
+# Font cache: (name or path as requested, size) -> font object.
+# Filled by prim_make_font; entries are never evicted.
+_font_cache = {}
+
+# Common system font directories ("~" entries are expanded at lookup time;
+# directories that do not exist on this machine are skipped)
+FONT_DIRS = [
+    "/usr/share/fonts",
+    "/usr/local/share/fonts",
+    "~/.fonts",
+    "~/.local/share/fonts",
+    "/System/Library/Fonts",  # macOS
+    "/Library/Fonts",  # macOS
+    "C:/Windows/Fonts",  # Windows
+]
+
+# Default fonts to try (in order of preference)
+DEFAULT_FONTS = [
+    "DejaVuSans.ttf",
+    "DejaVuSansMono.ttf",
+    "Arial.ttf",
+    "Helvetica.ttf",
+    "FreeSans.ttf",
+    "LiberationSans-Regular.ttf",
+]
+
+
+def _find_font_file(name: str) -> Optional[str]:
+    """Find a font file by name in system directories."""
+    # If it's already a full path
+    if os.path.isfile(name):
+        return name
+
+    # Expand user paths
+    expanded = os.path.expanduser(name)
+    if os.path.isfile(expanded):
+        return expanded
+
+    # Search in font directories
+    for font_dir in FONT_DIRS:
+        font_dir = os.path.expanduser(font_dir)
+        if not os.path.isdir(font_dir):
+            continue
+
+        # Direct match
+        direct = os.path.join(font_dir, name)
+        if os.path.isfile(direct):
+            return direct
+
+        # Recursive search
+        for root, dirs, files in os.walk(font_dir):
+            for f in files:
+                if f.lower() == name.lower():
+                    return os.path.join(root, f)
+                # Also match without extension
+                base = os.path.splitext(f)[0]
+                if base.lower() == name.lower():
+                    return os.path.join(root, f)
+
+    return None
+
+
+def _get_default_font(size: int = 24) -> ImageFont.FreeTypeFont:
+    """Get a default font at the given size."""
+    for font_name in DEFAULT_FONTS:
+        path = _find_font_file(font_name)
+        if path:
+            try:
+                return ImageFont.truetype(path, size)
+            except:
+                continue
+
+    # Last resort: PIL default
+    return ImageFont.load_default()
+
+
+def prim_make_font(name_or_path: str, size: int = 24) -> ImageFont.FreeTypeFont:
+    """
+    Load a font by name or path.
+
+    (make-font "Arial" 32)           ; system font by name
+    (make-font "/path/to/font.ttf" 24)  ; font file path
+    (make-font "DejaVuSans" 48)      ; searches common locations
+
+    Returns a font object for use with text primitives.
+    """
+    size = int(size)
+
+    # Check cache
+    cache_key = (name_or_path, size)
+    if cache_key in _font_cache:
+        return _font_cache[cache_key]
+
+    # Find the font file
+    path = _find_font_file(name_or_path)
+    if not path:
+        raise FileNotFoundError(f"Font not found: {name_or_path}")
+
+    # Load and cache
+    font = ImageFont.truetype(path, size)
+    _font_cache[cache_key] = font
+    return font
+
+
+def prim_list_fonts() -> List[str]:
+    """
+    List available system fonts.
+
+    (list-fonts)  ; -> ("Arial.ttf" "DejaVuSans.ttf" ...)
+
+    Returns list of font filenames found in system directories.
+    """
+    fonts = set()
+
+    for font_dir in FONT_DIRS:
+        font_dir = os.path.expanduser(font_dir)
+        if not os.path.isdir(font_dir):
+            continue
+
+        for root, dirs, files in os.walk(font_dir):
+            for f in files:
+                if f.lower().endswith(('.ttf', '.otf', '.ttc')):
+                    fonts.add(f)
+
+    return sorted(fonts)
+
+
+def prim_font_size(font: ImageFont.FreeTypeFont) -> int:
+    """
+    Get the size of a font.
+
+    (font-size my-font)  ; -> 24
+
+    Reads the font object's ``size`` attribute.
+    """
+    return font.size
+
+
+# =============================================================================
+# Text Measurement
+# =============================================================================
+
+def prim_text_size(text: str, font=None, font_size: int = 24) -> Tuple[int, int]:
+    """
+    Measure text dimensions.
+
+    (text-size "Hello" my-font)      ; -> (width height)
+    (text-size "Hello" :font-size 32) ; -> (width height) with default font
+
+    For multi-line text, returns total bounding box.
+    """
+    if font is None:
+        font = _get_default_font(int(font_size))
+    elif isinstance(font, (int, float)):
+        font = _get_default_font(int(font))
+
+    # Create temporary image for measurement
+    img = Image.new('RGB', (1, 1))
+    draw = ImageDraw.Draw(img)
+
+    bbox = draw.textbbox((0, 0), str(text), font=font)
+    width = bbox[2] - bbox[0]
+    height = bbox[3] - bbox[1]
+
+    return (width, height)
+
+
+def prim_text_metrics(font=None, font_size: int = 24) -> dict:
+    """
+    Get font metrics.
+
+    (text-metrics my-font)  ; -> {ascent: 20, descent: 5, height: 25}
+
+    Useful for precise text layout.
+    """
+    if font is None:
+        font = _get_default_font(int(font_size))
+    elif isinstance(font, (int, float)):
+        font = _get_default_font(int(font))
+
+    ascent, descent = font.getmetrics()
+    return {
+        'ascent': ascent,
+        'descent': descent,
+        'height': ascent + descent,
+        'size': font.size,
+    }
+
+
+def prim_fit_text_size(text: str, max_width: int, max_height: int,
+                       font_name: str = None, min_size: int = 8,
+                       max_size: int = 500) -> int:
+    """
+    Calculate font size to fit text within bounds.
+
+    (fit-text-size "Hello World" 400 100)  ; -> 48
+    (fit-text-size "Title" 800 200 :font-name "Arial")
+
+    Returns the largest font size that fits within max_width x max_height.
+    """
+    max_width = int(max_width)
+    max_height = int(max_height)
+    min_size = int(min_size)
+    max_size = int(max_size)
+    text = str(text)
+
+    # Binary search for optimal size
+    best_size = min_size
+    low, high = min_size, max_size
+
+    while low <= high:
+        mid = (low + high) // 2
+
+        if font_name:
+            try:
+                font = prim_make_font(font_name, mid)
+            except:
+                font = _get_default_font(mid)
+        else:
+            font = _get_default_font(mid)
+
+        w, h = prim_text_size(text, font)
+
+        if w <= max_width and h <= max_height:
+            best_size = mid
+            low = mid + 1
+        else:
+            high = mid - 1
+
+    return best_size
+
+
+def prim_fit_font(text: str, max_width: int, max_height: int,
+                  font_name: str = None, min_size: int = 8,
+                  max_size: int = 500) -> ImageFont.FreeTypeFont:
+    """
+    Create a font sized to fit text within bounds.
+
+    (fit-font "Hello World" 400 100)  ; -> font object
+    (fit-font "Title" 800 200 :font-name "Arial")
+
+    Returns a font object at the optimal size.
+    """
+    size = prim_fit_text_size(text, max_width, max_height,
+                               font_name, min_size, max_size)
+
+    if font_name:
+        try:
+            return prim_make_font(font_name, size)
+        except:
+            pass
+
+    return _get_default_font(size)
+
+
+# =============================================================================
+# Text Drawing
+# =============================================================================
+
+def prim_text(img: np.ndarray, text: str,
+              x: int = None, y: int = None,
+              width: int = None, height: int = None,
+              font=None, font_size: int = 24, font_name: str = None,
+              color=None, opacity: float = 1.0,
+              align: str = "left", valign: str = "top",
+              fit: bool = False,
+              shadow: bool = False, shadow_color=None, shadow_offset: int = 2,
+              outline: bool = False, outline_color=None, outline_width: int = 1,
+              line_spacing: float = 1.2) -> np.ndarray:
+    """
+    Draw text with alignment, opacity, and effects.
+
+    Basic usage:
+        (text frame "Hello" :x 100 :y 50)
+
+    Centered in frame:
+        (text frame "Title" :align "center" :valign "middle")
+
+    Fit to box:
+        (text frame "Big Text" :x 50 :y 50 :width 400 :height 100 :fit true)
+
+    With fade (for animations):
+        (text frame "Fading" :x 100 :y 100 :opacity 0.5)
+
+    With effects:
+        (text frame "Shadow" :x 100 :y 100 :shadow true)
+        (text frame "Outline" :x 100 :y 100 :outline true :outline-color (0 0 0))
+
+    Args:
+        img: Input frame
+        text: Text to draw
+        x, y: Position (if not specified, uses alignment in full frame)
+        width, height: Bounding box (for fit and alignment within box)
+        font: Font object from make-font
+        font_size: Size if no font specified
+        font_name: Font name to load
+        color: RGB tuple (default white)
+        opacity: 0.0 (invisible) to 1.0 (opaque) for fading
+        align: "left", "center", "right"
+        valign: "top", "middle", "bottom"
+        fit: If true, auto-size font to fit in box
+        shadow: Draw drop shadow
+        shadow_color: Shadow color (default black)
+        shadow_offset: Shadow offset in pixels
+        outline: Draw text outline
+        outline_color: Outline color (default black)
+        outline_width: Outline thickness
+        line_spacing: Multiplier for line height (for multi-line)
+
+    Returns:
+        Frame with text drawn
+    """
+    h, w = img.shape[:2]
+    text = str(text)
+
+    # Default colors
+    if color is None:
+        color = (255, 255, 255)
+    else:
+        color = tuple(int(c) for c in color)
+
+    if shadow_color is None:
+        shadow_color = (0, 0, 0)
+    else:
+        shadow_color = tuple(int(c) for c in shadow_color)
+
+    if outline_color is None:
+        outline_color = (0, 0, 0)
+    else:
+        outline_color = tuple(int(c) for c in outline_color)
+
+    # Determine bounding box
+    if x is None:
+        x = 0
+        if width is None:
+            width = w
+    if y is None:
+        y = 0
+        if height is None:
+            height = h
+
+    x, y = int(x), int(y)
+    box_width = int(width) if width else w - x
+    box_height = int(height) if height else h - y
+
+    # Get or create font
+    if font is None:
+        if fit:
+            font = prim_fit_font(text, box_width, box_height, font_name)
+        elif font_name:
+            try:
+                font = prim_make_font(font_name, int(font_size))
+            except:
+                font = _get_default_font(int(font_size))
+        else:
+            font = _get_default_font(int(font_size))
+
+    # Measure text
+    text_w, text_h = prim_text_size(text, font)
+
+    # Calculate position based on alignment
+    if align == "center":
+        draw_x = x + (box_width - text_w) // 2
+    elif align == "right":
+        draw_x = x + box_width - text_w
+    else:  # left
+        draw_x = x
+
+    if valign == "middle":
+        draw_y = y + (box_height - text_h) // 2
+    elif valign == "bottom":
+        draw_y = y + box_height - text_h
+    else:  # top
+        draw_y = y
+
+    # Create RGBA image for compositing with opacity
+    pil_img = Image.fromarray(img).convert('RGBA')
+
+    # Create text layer with transparency
+    text_layer = Image.new('RGBA', (w, h), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(text_layer)
+
+    # Draw shadow first (if enabled)
+    if shadow:
+        shadow_x = draw_x + shadow_offset
+        shadow_y = draw_y + shadow_offset
+        shadow_rgba = shadow_color + (int(255 * opacity * 0.5),)
+        draw.text((shadow_x, shadow_y), text, fill=shadow_rgba, font=font)
+
+    # Draw outline (if enabled)
+    if outline:
+        outline_rgba = outline_color + (int(255 * opacity),)
+        ow = int(outline_width)
+        for dx in range(-ow, ow + 1):
+            for dy in range(-ow, ow + 1):
+                if dx != 0 or dy != 0:
+                    draw.text((draw_x + dx, draw_y + dy), text,
+                             fill=outline_rgba, font=font)
+
+    # Draw main text
+    text_rgba = color + (int(255 * opacity),)
+    draw.text((draw_x, draw_y), text, fill=text_rgba, font=font)
+
+    # Composite
+    result = Image.alpha_composite(pil_img, text_layer)
+    return np.array(result.convert('RGB'))
+
+
+def prim_text_box(img: np.ndarray, text: str,
+                  x: int, y: int, width: int, height: int,
+                  font=None, font_size: int = 24, font_name: str = None,
+                  color=None, opacity: float = 1.0,
+                  align: str = "center", valign: str = "middle",
+                  fit: bool = True,
+                  padding: int = 0,
+                  background=None, background_opacity: float = 0.5,
+                  **kwargs) -> np.ndarray:
+    """
+    Draw text fitted within a box, optionally with background.
+
+    (text-box frame "Title" 50 50 400 100)
+    (text-box frame "Subtitle" 50 160 400 50
+              :background (0 0 0) :background-opacity 0.7)
+
+    Convenience wrapper around text() for common box-with-text pattern.
+    """
+    x, y = int(x), int(y)
+    width, height = int(width), int(height)
+    padding = int(padding)
+
+    result = img.copy()
+
+    # Draw background if specified
+    if background is not None:
+        bg_color = tuple(int(c) for c in background)
+
+        # Create background with opacity
+        pil_img = Image.fromarray(result).convert('RGBA')
+        bg_layer = Image.new('RGBA', (pil_img.width, pil_img.height), (0, 0, 0, 0))
+        bg_draw = ImageDraw.Draw(bg_layer)
+        bg_rgba = bg_color + (int(255 * background_opacity),)
+        bg_draw.rectangle([x, y, x + width, y + height], fill=bg_rgba)
+        result = np.array(Image.alpha_composite(pil_img, bg_layer).convert('RGB'))
+
+    # Draw text within padded box
+    return prim_text(result, text,
+                     x=x + padding, y=y + padding,
+                     width=width - 2 * padding, height=height - 2 * padding,
+                     font=font, font_size=font_size, font_name=font_name,
+                     color=color, opacity=opacity,
+                     align=align, valign=valign, fit=fit,
+                     **kwargs)
+
+
+# =============================================================================
+# Legacy text functions (keep for compatibility)
+# =============================================================================
+
+def prim_draw_char(img, char, x, y, font_size=16, color=None):
+    """Draw a single character at (x, y). Legacy function."""
+    return prim_text(img, str(char), x=int(x), y=int(y),
+                    font_size=int(font_size), color=color)
+
+
+def prim_draw_text(img, text, x, y, font_size=16, color=None):
+    """Draw text string at (x, y). Legacy function."""
+    return prim_text(img, str(text), x=int(x), y=int(y),
+                    font_size=int(font_size), color=color)
+
+
+# =============================================================================
+# Shape Drawing
+# =============================================================================
+
+def prim_fill_rect(img, x, y, w, h, color=None, opacity: float = 1.0):
+    """
+    Fill a rectangle with color.
+
+    (fill-rect frame 10 10 100 50 (255 0 0))
+    (fill-rect frame 10 10 100 50 (255 0 0) :opacity 0.5)
+    """
+    if color is None:
+        color = [255, 255, 255]
+
+    x, y, w, h = int(x), int(y), int(w), int(h)
+
+    if opacity >= 1.0:
+        result = img.copy()
+        result[y:y+h, x:x+w] = color
+        return result
+
+    # With opacity, use alpha compositing
+    pil_img = Image.fromarray(img).convert('RGBA')
+    layer = Image.new('RGBA', (pil_img.width, pil_img.height), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(layer)
+    fill_rgba = tuple(int(c) for c in color) + (int(255 * opacity),)
+    draw.rectangle([x, y, x + w, y + h], fill=fill_rgba)
+    result = Image.alpha_composite(pil_img, layer)
+    return np.array(result.convert('RGB'))
+
+
+def prim_draw_rect(img, x, y, w, h, color=None, thickness=1, opacity: float = 1.0):
+    """Draw rectangle outline."""
+    if color is None:
+        color = [255, 255, 255]
+
+    if opacity >= 1.0:
+        result = img.copy()
+        cv2.rectangle(result, (int(x), int(y)), (int(x+w), int(y+h)),
+                      tuple(int(c) for c in color), int(thickness))
+        return result
+
+    # With opacity
+    pil_img = Image.fromarray(img).convert('RGBA')
+    layer = Image.new('RGBA', (pil_img.width, pil_img.height), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(layer)
+    outline_rgba = tuple(int(c) for c in color) + (int(255 * opacity),)
+    draw.rectangle([int(x), int(y), int(x+w), int(y+h)],
+                   outline=outline_rgba, width=int(thickness))
+    result = Image.alpha_composite(pil_img, layer)
+    return np.array(result.convert('RGB'))
+
+
+def prim_draw_line(img, x1, y1, x2, y2, color=None, thickness=1, opacity: float = 1.0):
+    """Draw a line from (x1, y1) to (x2, y2)."""
+    if color is None:
+        color = [255, 255, 255]
+
+    if opacity >= 1.0:
+        result = img.copy()
+        cv2.line(result, (int(x1), int(y1)), (int(x2), int(y2)),
+                 tuple(int(c) for c in color), int(thickness))
+        return result
+
+    # With opacity
+    pil_img = Image.fromarray(img).convert('RGBA')
+    layer = Image.new('RGBA', (pil_img.width, pil_img.height), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(layer)
+    line_rgba = tuple(int(c) for c in color) + (int(255 * opacity),)
+    draw.line([(int(x1), int(y1)), (int(x2), int(y2))],
+              fill=line_rgba, width=int(thickness))
+    result = Image.alpha_composite(pil_img, layer)
+    return np.array(result.convert('RGB'))
+
+
+def prim_draw_circle(img, cx, cy, radius, color=None, thickness=1,
+                     fill=False, opacity: float = 1.0):
+    """Draw a circle."""
+    if color is None:
+        color = [255, 255, 255]
+
+    if opacity >= 1.0:
+        result = img.copy()
+        t = -1 if fill else int(thickness)
+        cv2.circle(result, (int(cx), int(cy)), int(radius),
+                   tuple(int(c) for c in color), t)
+        return result
+
+    # With opacity
+    pil_img = Image.fromarray(img).convert('RGBA')
+    layer = Image.new('RGBA', (pil_img.width, pil_img.height), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(layer)
+    cx, cy, r = int(cx), int(cy), int(radius)
+    rgba = tuple(int(c) for c in color) + (int(255 * opacity),)
+
+    if fill:
+        draw.ellipse([cx - r, cy - r, cx + r, cy + r], fill=rgba)
+    else:
+        draw.ellipse([cx - r, cy - r, cx + r, cy + r],
+                     outline=rgba, width=int(thickness))
+
+    result = Image.alpha_composite(pil_img, layer)
+    return np.array(result.convert('RGB'))
+
+
+def prim_draw_ellipse(img, cx, cy, rx, ry, angle=0, color=None,
+                      thickness=1, fill=False, opacity: float = 1.0):
+    """Draw an ellipse."""
+    if color is None:
+        color = [255, 255, 255]
+
+    if opacity >= 1.0:
+        result = img.copy()
+        t = -1 if fill else int(thickness)
+        cv2.ellipse(result, (int(cx), int(cy)), (int(rx), int(ry)),
+                    float(angle), 0, 360, tuple(int(c) for c in color), t)
+        return result
+
+    # With opacity (note: PIL doesn't support rotated ellipses easily)
+    # Fall back to cv2 on a separate layer
+    layer = np.zeros((img.shape[0], img.shape[1], 4), dtype=np.uint8)
+    t = -1 if fill else int(thickness)
+    rgba = tuple(int(c) for c in color) + (int(255 * opacity),)
+    cv2.ellipse(layer, (int(cx), int(cy)), (int(rx), int(ry)),
+                float(angle), 0, 360, rgba, t)
+
+    pil_img = Image.fromarray(img).convert('RGBA')
+    pil_layer = Image.fromarray(layer)
+    result = Image.alpha_composite(pil_img, pil_layer)
+    return np.array(result.convert('RGB'))
+
+
+def prim_draw_polygon(img, points, color=None, thickness=1,
+                      fill=False, opacity: float = 1.0):
+    """Draw a polygon from list of [x, y] points."""
+    if color is None:
+        color = [255, 255, 255]
+
+    if opacity >= 1.0:
+        result = img.copy()
+        pts = np.array(points, dtype=np.int32).reshape((-1, 1, 2))
+        if fill:
+            cv2.fillPoly(result, [pts], tuple(int(c) for c in color))
+        else:
+            cv2.polylines(result, [pts], True,
+                         tuple(int(c) for c in color), int(thickness))
+        return result
+
+    # With opacity
+    pil_img = Image.fromarray(img).convert('RGBA')
+    layer = Image.new('RGBA', (pil_img.width, pil_img.height), (0, 0, 0, 0))
+    draw = ImageDraw.Draw(layer)
+
+    pts_flat = [(int(p[0]), int(p[1])) for p in points]
+    rgba = tuple(int(c) for c in color) + (int(255 * opacity),)
+
+    if fill:
+        draw.polygon(pts_flat, fill=rgba)
+    else:
+        draw.polygon(pts_flat, outline=rgba, width=int(thickness))
+
+    result = Image.alpha_composite(pil_img, layer)
+    return np.array(result.convert('RGB'))
+
+
+# =============================================================================
+# PRIMITIVES Export
+# =============================================================================
+
+# Export table for this library: maps each name used in sexp code to the
+# Python function implementing it (the `PRIMITIVES` dict format described
+# in the primitive-library loader's module docstring).
+PRIMITIVES = {
+    # Font management
+    'make-font': prim_make_font,
+    'list-fonts': prim_list_fonts,
+    'font-size': prim_font_size,
+
+    # Text measurement
+    'text-size': prim_text_size,
+    'text-metrics': prim_text_metrics,
+    'fit-text-size': prim_fit_text_size,
+    'fit-font': prim_fit_font,
+
+    # Text drawing
+    'text': prim_text,
+    'text-box': prim_text_box,
+
+    # Legacy text (compatibility) - thin wrappers around `text`
+    'draw-char': prim_draw_char,
+    'draw-text': prim_draw_text,
+
+    # Rectangles
+    'fill-rect': prim_fill_rect,
+    'draw-rect': prim_draw_rect,
+
+    # Lines and shapes
+    'draw-line': prim_draw_line,
+    'draw-circle': prim_draw_circle,
+    'draw-ellipse': prim_draw_ellipse,
+    'draw-polygon': prim_draw_polygon,
+}
--- a/l1/sexp_effects/primitive_libs/filters.py
+++ b/l1/sexp_effects/primitive_libs/filters.py
@@ -0,0 +1,119 @@
+"""
+Filters Primitives Library
+
+Image filters: blur, sharpen, edges, convolution.
+"""
+import numpy as np
+import cv2
+
+
+def prim_blur(img, radius):
+    """Gaussian blur with given radius."""
+    radius = max(1, int(radius))
+    ksize = radius * 2 + 1
+    return cv2.GaussianBlur(img, (ksize, ksize), 0)
+
+
+def prim_box_blur(img, radius):
+    """Box blur with given radius."""
+    radius = max(1, int(radius))
+    ksize = radius * 2 + 1
+    return cv2.blur(img, (ksize, ksize))
+
+
+def prim_median_blur(img, radius):
+    """Median blur (good for noise removal)."""
+    radius = max(1, int(radius))
+    ksize = radius * 2 + 1
+    return cv2.medianBlur(img, ksize)
+
+
+def prim_bilateral(img, d=9, sigma_color=75, sigma_space=75):
+    """Bilateral filter (edge-preserving blur)."""
+    return cv2.bilateralFilter(img, d, sigma_color, sigma_space)
+
+
+def prim_sharpen(img, amount=1.0):
+    """Sharpen image using unsharp mask."""
+    blurred = cv2.GaussianBlur(img, (0, 0), 3)
+    return cv2.addWeighted(img, 1.0 + amount, blurred, -amount, 0)
+
+
+def prim_edges(img, low=50, high=150):
+    """Canny edge detection."""
+    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+    edges = cv2.Canny(gray, low, high)
+    return cv2.cvtColor(edges, cv2.COLOR_GRAY2RGB)
+
+
+def prim_sobel(img, ksize=3):
+    """Sobel edge detection."""
+    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+    sobelx = cv2.Sobel(gray, cv2.CV_64F, 1, 0, ksize=ksize)
+    sobely = cv2.Sobel(gray, cv2.CV_64F, 0, 1, ksize=ksize)
+    mag = np.sqrt(sobelx**2 + sobely**2)
+    mag = np.clip(mag, 0, 255).astype(np.uint8)
+    return cv2.cvtColor(mag, cv2.COLOR_GRAY2RGB)
+
+
+def prim_laplacian(img, ksize=3):
+    """Laplacian edge detection."""
+    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+    lap = cv2.Laplacian(gray, cv2.CV_64F, ksize=ksize)
+    lap = np.abs(lap)
+    lap = np.clip(lap, 0, 255).astype(np.uint8)
+    return cv2.cvtColor(lap, cv2.COLOR_GRAY2RGB)
+
+
+def prim_emboss(img):
+    """Emboss effect."""
+    kernel = np.array([[-2, -1, 0],
+                       [-1,  1, 1],
+                       [ 0,  1, 2]])
+    result = cv2.filter2D(img, -1, kernel)
+    return np.clip(result + 128, 0, 255).astype(np.uint8)
+
+
+def prim_dilate(img, size=1):
+    """Morphological dilation."""
+    kernel = np.ones((size * 2 + 1, size * 2 + 1), np.uint8)
+    return cv2.dilate(img, kernel)
+
+
+def prim_erode(img, size=1):
+    """Morphological erosion."""
+    kernel = np.ones((size * 2 + 1, size * 2 + 1), np.uint8)
+    return cv2.erode(img, kernel)
+
+
+def prim_convolve(img, kernel):
+    """Apply custom convolution kernel."""
+    kernel = np.array(kernel, dtype=np.float32)
+    return cv2.filter2D(img, -1, kernel)
+
+
+# Export table for this library: sexp-visible primitive name -> implementation.
+PRIMITIVES = {
+    # Blur
+    'blur': prim_blur,
+    'box-blur': prim_box_blur,
+    'median-blur': prim_median_blur,
+    'bilateral': prim_bilateral,
+
+    # Sharpen
+    'sharpen': prim_sharpen,
+
+    # Edges (each returns a 3-channel edge map)
+    'edges': prim_edges,
+    'sobel': prim_sobel,
+    'laplacian': prim_laplacian,
+
+    # Effects
+    'emboss': prim_emboss,
+
+    # Morphology
+    'dilate': prim_dilate,
+    'erode': prim_erode,
+
+    # Custom
+    'convolve': prim_convolve,
+}
--- a/l1/sexp_effects/primitive_libs/geometry.py
+++ b/l1/sexp_effects/primitive_libs/geometry.py
@@ -0,0 +1,143 @@
+"""
+Geometry Primitives Library
+
+Geometric transforms: rotate, scale, flip, translate, remap.
+"""
+import numpy as np
+import cv2
+
+
+def prim_translate(img, dx, dy):
+    """Translate image by (dx, dy) pixels."""
+    h, w = img.shape[:2]
+    M = np.float32([[1, 0, dx], [0, 1, dy]])
+    return cv2.warpAffine(img, M, (w, h))
+
+
+def prim_rotate(img, angle, cx=None, cy=None):
+    """Rotate image by angle degrees around center (cx, cy)."""
+    h, w = img.shape[:2]
+    if cx is None:
+        cx = w / 2
+    if cy is None:
+        cy = h / 2
+    M = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
+    return cv2.warpAffine(img, M, (w, h))
+
+
+def prim_scale(img, sx, sy, cx=None, cy=None):
+    """Scale image by (sx, sy) around center (cx, cy)."""
+    h, w = img.shape[:2]
+    if cx is None:
+        cx = w / 2
+    if cy is None:
+        cy = h / 2
+
+    # Build transform matrix
+    M = np.float32([
+        [sx, 0, cx * (1 - sx)],
+        [0, sy, cy * (1 - sy)]
+    ])
+    return cv2.warpAffine(img, M, (w, h))
+
+
+def prim_flip_h(img):
+    """Flip image horizontally."""
+    return cv2.flip(img, 1)
+
+
+def prim_flip_v(img):
+    """Flip image vertically."""
+    return cv2.flip(img, 0)
+
+
+def prim_flip(img, direction="horizontal"):
+    """Flip image in given direction."""
+    if direction in ("horizontal", "h"):
+        return prim_flip_h(img)
+    elif direction in ("vertical", "v"):
+        return prim_flip_v(img)
+    elif direction in ("both", "hv", "vh"):
+        return cv2.flip(img, -1)
+    return img
+
+
+def prim_transpose(img):
+    """Transpose image (swap x and y)."""
+    return np.transpose(img, (1, 0, 2))
+
+
+def prim_remap(img, map_x, map_y):
+    """Remap image using coordinate maps."""
+    return cv2.remap(img, map_x.astype(np.float32),
+                     map_y.astype(np.float32),
+                     cv2.INTER_LINEAR)
+
+
+def prim_make_coords(w, h):
+    """Create coordinate grids for remapping."""
+    x = np.arange(w, dtype=np.float32)
+    y = np.arange(h, dtype=np.float32)
+    map_x, map_y = np.meshgrid(x, y)
+    return (map_x, map_y)
+
+
+def prim_perspective(img, src_pts, dst_pts):
+    """Apply perspective transform."""
+    src = np.float32(src_pts)
+    dst = np.float32(dst_pts)
+    M = cv2.getPerspectiveTransform(src, dst)
+    h, w = img.shape[:2]
+    return cv2.warpPerspective(img, M, (w, h))
+
+
+def prim_affine(img, src_pts, dst_pts):
+    """Apply affine transform using 3 point pairs."""
+    src = np.float32(src_pts)
+    dst = np.float32(dst_pts)
+    M = cv2.getAffineTransform(src, dst)
+    h, w = img.shape[:2]
+    return cv2.warpAffine(img, M, (w, h))
+
+
+def _get_legacy_geometry_primitives():
+    """Import geometry primitives from legacy primitives module."""
+    from sexp_effects.primitives import (
+        prim_coords_x,
+        prim_coords_y,
+        prim_ripple_displace,
+        prim_fisheye_displace,
+        prim_kaleidoscope_displace,
+    )
+    return {
+        'coords-x': prim_coords_x,
+        'coords-y': prim_coords_y,
+        'ripple-displace': prim_ripple_displace,
+        'fisheye-displace': prim_fisheye_displace,
+        'kaleidoscope-displace': prim_kaleidoscope_displace,
+    }
+
+
+# Export table for this library: sexp-visible primitive name -> implementation.
+PRIMITIVES = {
+    # Basic transforms
+    'translate': prim_translate,
+    'rotate-img': prim_rotate,
+    'scale-img': prim_scale,
+
+    # Flips
+    'flip-h': prim_flip_h,
+    'flip-v': prim_flip_v,
+    'flip': prim_flip,
+    'transpose': prim_transpose,
+
+    # Remapping
+    'remap': prim_remap,
+    'make-coords': prim_make_coords,
+
+    # Advanced transforms
+    'perspective': prim_perspective,
+    'affine': prim_affine,
+
+    # Displace / coordinate ops (from legacy primitives)
+    # NOTE: the helper is called while this dict is built, so importing
+    # this module also imports sexp_effects.primitives.
+    **_get_legacy_geometry_primitives(),
+}
--- a/l1/sexp_effects/primitive_libs/geometry_gpu.py
+++ b/l1/sexp_effects/primitive_libs/geometry_gpu.py
@@ -0,0 +1,403 @@
+"""
+GPU-Accelerated Geometry Primitives Library
+
+Uses CuPy for CUDA-accelerated image transforms.
+Falls back to CPU if GPU unavailable.
+
+Performance Mode:
+- Set STREAMING_GPU_PERSIST=1 to keep frames on GPU between operations
+- This dramatically improves performance by avoiding CPU<->GPU transfers
+- Frames only transfer to CPU at final output
+"""
+import os
+import numpy as np
+
+# Try to import CuPy for GPU acceleration
+# GPU_AVAILABLE records the outcome and gates every GPU code path below.
+try:
+    import cupy as cp
+    from cupyx.scipy import ndimage as cpndimage
+    GPU_AVAILABLE = True
+    print("[geometry_gpu] CuPy GPU acceleration enabled")
+except ImportError:
+    # Alias cp to NumPy so that `cp.<name>` lookups still resolve on CPU.
+    # `cpndimage` is NOT defined in this branch, so code that uses it must
+    # first check GPU_AVAILABLE.
+    cp = np
+    GPU_AVAILABLE = False
+    print("[geometry_gpu] CuPy not available, using CPU fallback")
+
+# GPU persistence mode - keep frames on GPU between operations
+# Set STREAMING_GPU_PERSIST=1 for maximum performance
+# Only the exact string "1" enables it; the variable is read once at import.
+GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "0") == "1"
+if GPU_AVAILABLE and GPU_PERSIST:
+    print("[geometry_gpu] GPU persistence enabled - frames stay on GPU")
+
+
+def _to_gpu(img):
+    """Move image to GPU if available."""
+    if GPU_AVAILABLE and not isinstance(img, cp.ndarray):
+        return cp.asarray(img)
+    return img
+
+
+def _to_cpu(img):
+    """Move image back to CPU (only if GPU_PERSIST is disabled)."""
+    if not GPU_PERSIST and GPU_AVAILABLE and isinstance(img, cp.ndarray):
+        return cp.asnumpy(img)
+    return img
+
+
+def _ensure_output_format(img):
+    """Ensure output is in correct format based on GPU_PERSIST setting."""
+    return _to_cpu(img)
+
+
+def prim_rotate(img, angle, cx=None, cy=None):
+    """Rotate image by angle degrees around center (cx, cy).
+
+    Uses fast CUDA kernel when available (< 1ms vs 20ms for scipy).
+    """
+    if not GPU_AVAILABLE:
+        # Fallback to OpenCV
+        import cv2
+        h, w = img.shape[:2]
+        if cx is None:
+            cx = w / 2
+        if cy is None:
+            cy = h / 2
+        M = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
+        return cv2.warpAffine(img, M, (w, h))
+
+    # Use fast CUDA kernel (prim_rotate_gpu defined below)
+    return prim_rotate_gpu(img, angle, cx, cy)
+
+
+def prim_scale(img, sx, sy, cx=None, cy=None):
+    """Scale image by (sx, sy) around center (cx, cy)."""
+    if not GPU_AVAILABLE:
+        import cv2
+        h, w = img.shape[:2]
+        if cx is None:
+            cx = w / 2
+        if cy is None:
+            cy = h / 2
+        M = np.float32([
+            [sx, 0, cx * (1 - sx)],
+            [0, sy, cy * (1 - sy)]
+        ])
+        return cv2.warpAffine(img, M, (w, h))
+
+    img_gpu = _to_gpu(img)
+    h, w = img_gpu.shape[:2]
+
+    if cx is None:
+        cx = w / 2
+    if cy is None:
+        cy = h / 2
+
+    # Use cupyx.scipy.ndimage.zoom
+    if img_gpu.ndim == 3:
+        zoom_factors = (sy, sx, 1)  # Don't zoom color channels
+    else:
+        zoom_factors = (sy, sx)
+
+    zoomed = cpndimage.zoom(img_gpu, zoom_factors, order=1)
+
+    # Crop/pad to original size
+    zh, zw = zoomed.shape[:2]
+    result = cp.zeros_like(img_gpu)
+
+    # Calculate offsets
+    src_y = max(0, (zh - h) // 2)
+    src_x = max(0, (zw - w) // 2)
+    dst_y = max(0, (h - zh) // 2)
+    dst_x = max(0, (w - zw) // 2)
+
+    copy_h = min(h - dst_y, zh - src_y)
+    copy_w = min(w - dst_x, zw - src_x)
+
+    result[dst_y:dst_y+copy_h, dst_x:dst_x+copy_w] = zoomed[src_y:src_y+copy_h, src_x:src_x+copy_w]
+
+    return _to_cpu(result)
+
+
+def prim_translate(img, dx, dy):
+    """Translate image by (dx, dy) pixels."""
+    if not GPU_AVAILABLE:
+        import cv2
+        h, w = img.shape[:2]
+        M = np.float32([[1, 0, dx], [0, 1, dy]])
+        return cv2.warpAffine(img, M, (w, h))
+
+    img_gpu = _to_gpu(img)
+    # Use cupyx.scipy.ndimage.shift
+    if img_gpu.ndim == 3:
+        shift = (dy, dx, 0)  # Don't shift color channels
+    else:
+        shift = (dy, dx)
+
+    shifted = cpndimage.shift(img_gpu, shift, order=1)
+    return _to_cpu(shifted)
+
+
+def prim_flip_h(img):
+    """Flip image horizontally."""
+    if GPU_AVAILABLE:
+        img_gpu = _to_gpu(img)
+        return _to_cpu(cp.flip(img_gpu, axis=1))
+    return np.flip(img, axis=1)
+
+
+def prim_flip_v(img):
+    """Flip image vertically."""
+    if GPU_AVAILABLE:
+        img_gpu = _to_gpu(img)
+        return _to_cpu(cp.flip(img_gpu, axis=0))
+    return np.flip(img, axis=0)
+
+
+def prim_flip(img, direction="horizontal"):
+    """Flip image in given direction."""
+    if direction in ("horizontal", "h"):
+        return prim_flip_h(img)
+    elif direction in ("vertical", "v"):
+        return prim_flip_v(img)
+    elif direction in ("both", "hv", "vh"):
+        if GPU_AVAILABLE:
+            img_gpu = _to_gpu(img)
+            return _to_cpu(cp.flip(cp.flip(img_gpu, axis=0), axis=1))
+        return np.flip(np.flip(img, axis=0), axis=1)
+    return img
+
+
+# CUDA kernel for ripple effect
+#
+# Only compiled when CuPy is available. Each thread produces one output
+# pixel (x, y): it computes a radial sine displacement (scaled by an
+# exponential distance falloff) relative to the centre (cx, cy), clamps the
+# resulting source coordinate to the image, and bilinearly samples every
+# channel.
+#
+# Both buffers are indexed as (y * width + x) * channels + c on
+# `unsigned char`, so callers must pass densely packed, row-major,
+# channel-interleaved 8-bit data.
+if GPU_AVAILABLE:
+    _ripple_kernel = cp.RawKernel(r'''
+    extern "C" __global__
+    void ripple(const unsigned char* src, unsigned char* dst,
+                int width, int height, int channels,
+                float amplitude, float frequency, float decay,
+                float speed, float time, float cx, float cy) {
+        int x = blockDim.x * blockIdx.x + threadIdx.x;
+        int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x >= width || y >= height) return;
+
+        // Distance from center
+        float dx = x - cx;
+        float dy = y - cy;
+        float dist = sqrtf(dx * dx + dy * dy);
+
+        // Ripple displacement
+        float wave = sinf(dist * frequency * 0.1f - time * speed) * amplitude;
+        float falloff = expf(-dist * decay * 0.01f);
+        float displacement = wave * falloff;
+
+        // Direction from center
+        float len = dist + 0.0001f;  // Avoid division by zero
+        float dir_x = dx / len;
+        float dir_y = dy / len;
+
+        // Source coordinates
+        float src_x = x - dir_x * displacement;
+        float src_y = y - dir_y * displacement;
+
+        // Clamp to bounds
+        src_x = fmaxf(0.0f, fminf(width - 1.0f, src_x));
+        src_y = fmaxf(0.0f, fminf(height - 1.0f, src_y));
+
+        // Bilinear interpolation
+        int x0 = (int)src_x;
+        int y0 = (int)src_y;
+        int x1 = min(x0 + 1, width - 1);
+        int y1 = min(y0 + 1, height - 1);
+
+        float fx = src_x - x0;
+        float fy = src_y - y0;
+
+        for (int c = 0; c < channels; c++) {
+            float v00 = src[(y0 * width + x0) * channels + c];
+            float v10 = src[(y0 * width + x1) * channels + c];
+            float v01 = src[(y1 * width + x0) * channels + c];
+            float v11 = src[(y1 * width + x1) * channels + c];
+
+            float v0 = v00 * (1 - fx) + v10 * fx;
+            float v1 = v01 * (1 - fx) + v11 * fx;
+            float val = v0 * (1 - fy) + v1 * fy;
+
+            dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
+        }
+    }
+    ''', 'ripple')
+
+
+def prim_ripple(img, amplitude=10.0, frequency=8.0, decay=2.0, speed=5.0,
+                time=0.0, center_x=None, center_y=None):
+    """Apply ripple distortion effect."""
+    h, w = img.shape[:2]
+    channels = img.shape[2] if img.ndim == 3 else 1
+
+    if center_x is None:
+        center_x = w / 2
+    if center_y is None:
+        center_y = h / 2
+
+    if not GPU_AVAILABLE:
+        # CPU fallback using coordinate mapping
+        import cv2
+        y_coords, x_coords = np.mgrid[0:h, 0:w].astype(np.float32)
+
+        dx = x_coords - center_x
+        dy = y_coords - center_y
+        dist = np.sqrt(dx**2 + dy**2)
+
+        wave = np.sin(dist * frequency * 0.1 - time * speed) * amplitude
+        falloff = np.exp(-dist * decay * 0.01)
+        displacement = wave * falloff
+
+        length = dist + 0.0001
+        dir_x = dx / length
+        dir_y = dy / length
+
+        map_x = (x_coords - dir_x * displacement).astype(np.float32)
+        map_y = (y_coords - dir_y * displacement).astype(np.float32)
+
+        return cv2.remap(img, map_x, map_y, cv2.INTER_LINEAR)
+
+    # GPU implementation
+    img_gpu = _to_gpu(img.astype(np.uint8))
+    if img_gpu.ndim == 2:
+        img_gpu = img_gpu[:, :, cp.newaxis]
+        channels = 1
+
+    dst = cp.zeros_like(img_gpu)
+
+    block = (16, 16)
+    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
+
+    _ripple_kernel(grid, block, (
+        img_gpu, dst,
+        np.int32(w), np.int32(h), np.int32(channels),
+        np.float32(amplitude), np.float32(frequency), np.float32(decay),
+        np.float32(speed), np.float32(time),
+        np.float32(center_x), np.float32(center_y)
+    ))
+
+    result = _to_cpu(dst)
+    if channels == 1:
+        result = result[:, :, 0]
+    return result
+
+
+# CUDA kernel for fast rotation with bilinear interpolation
+#
+# Only compiled when CuPy is available. Each thread produces one output
+# pixel (x, y): it rotates the offset from (cx, cy) using the supplied
+# cos/sin to find the source coordinate, writes 0 to every channel when that
+# coordinate falls outside [0, width-1) x [0, height-1), and otherwise
+# bilinearly samples every channel.
+#
+# Both buffers are indexed as (y * width + x) * channels + c on
+# `unsigned char`, so callers must pass densely packed, row-major,
+# channel-interleaved 8-bit data.
+if GPU_AVAILABLE:
+    _rotate_kernel = cp.RawKernel(r'''
+    extern "C" __global__
+    void rotate_img(const unsigned char* src, unsigned char* dst,
+                    int width, int height, int channels,
+                    float cos_a, float sin_a, float cx, float cy) {
+        int x = blockDim.x * blockIdx.x + threadIdx.x;
+        int y = blockDim.y * blockIdx.y + threadIdx.y;
+
+        if (x >= width || y >= height) return;
+
+        // Translate to center, rotate, translate back
+        float dx = x - cx;
+        float dy = y - cy;
+
+        float src_x = cos_a * dx + sin_a * dy + cx;
+        float src_y = -sin_a * dx + cos_a * dy + cy;
+
+        // Check bounds
+        if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
+            for (int c = 0; c < channels; c++) {
+                dst[(y * width + x) * channels + c] = 0;
+            }
+            return;
+        }
+
+        // Bilinear interpolation
+        int x0 = (int)src_x;
+        int y0 = (int)src_y;
+        int x1 = x0 + 1;
+        int y1 = y0 + 1;
+
+        float fx = src_x - x0;
+        float fy = src_y - y0;
+
+        for (int c = 0; c < channels; c++) {
+            float v00 = src[(y0 * width + x0) * channels + c];
+            float v10 = src[(y0 * width + x1) * channels + c];
+            float v01 = src[(y1 * width + x0) * channels + c];
+            float v11 = src[(y1 * width + x1) * channels + c];
+
+            float v0 = v00 * (1 - fx) + v10 * fx;
+            float v1 = v01 * (1 - fx) + v11 * fx;
+            float val = v0 * (1 - fy) + v1 * fy;
+
+            dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
+        }
+    }
+    ''', 'rotate_img')
+
+
+def prim_rotate_gpu(img, angle, cx=None, cy=None):
+    """Fast GPU rotation using custom CUDA kernel."""
+    if not GPU_AVAILABLE:
+        return prim_rotate(img, angle, cx, cy)
+
+    h, w = img.shape[:2]
+    channels = img.shape[2] if img.ndim == 3 else 1
+
+    if cx is None:
+        cx = w / 2
+    if cy is None:
+        cy = h / 2
+
+    img_gpu = _to_gpu(img.astype(np.uint8))
+    if img_gpu.ndim == 2:
+        img_gpu = img_gpu[:, :, cp.newaxis]
+        channels = 1
+
+    dst = cp.zeros_like(img_gpu)
+
+    # Convert angle to radians
+    rad = np.radians(angle)
+    cos_a = np.cos(rad)
+    sin_a = np.sin(rad)
+
+    block = (16, 16)
+    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
+
+    _rotate_kernel(grid, block, (
+        img_gpu, dst,
+        np.int32(w), np.int32(h), np.int32(channels),
+        np.float32(cos_a), np.float32(sin_a),
+        np.float32(cx), np.float32(cy)
+    ))
+
+    result = _to_cpu(dst)
+    if channels == 1:
+        result = result[:, :, 0]
+    return result
+
+
+# Import CPU primitives as fallbacks for functions we don't GPU-accelerate
+def _get_cpu_primitives():
+    """Get all primitives from CPU geometry module as fallbacks."""
+    from sexp_effects.primitive_libs import geometry
+    return geometry.PRIMITIVES
+
+
+# Export functions - start with CPU primitives, then override with GPU versions
+# (.copy() keeps the CPU module's own table untouched by the update below)
+PRIMITIVES = _get_cpu_primitives().copy()
+
+# Override specific primitives with GPU-accelerated versions
+# Every function registered here checks GPU_AVAILABLE itself and falls back
+# to a CPU implementation when CuPy is missing.
+PRIMITIVES.update({
+    'translate': prim_translate,
+    'rotate': prim_rotate_gpu if GPU_AVAILABLE else prim_rotate,  # Fast CUDA kernel
+    'rotate-img': prim_rotate_gpu if GPU_AVAILABLE else prim_rotate,  # Alias
+    'scale-img': prim_scale,
+    'flip-h': prim_flip_h,
+    'flip-v': prim_flip_v,
+    'flip': prim_flip,
+    'ripple': prim_ripple,  # Fast CUDA kernel
+    # Note: ripple-displace uses CPU version (different API - returns coords, not image)
+})
--- a/l1/sexp_effects/primitive_libs/image.py
+++ b/l1/sexp_effects/primitive_libs/image.py
@@ -0,0 +1,150 @@
+"""
+Image Primitives Library
+
+Basic image operations: dimensions, pixels, resize, crop, paste.
+"""
+import numpy as np
+import cv2
+
+
+def prim_width(img):
+    """Return the image width, i.e. ``img.shape[1]``.
+
+    Raises:
+        TypeError: if ``img`` is a list or tuple rather than an array.
+    """
+    if not isinstance(img, (list, tuple)):
+        return img.shape[1]
+    kind = type(img).__name__
+    raise TypeError(f"image:width expects an image array, got {kind} with {len(img)} elements")
+
+
+def prim_height(img):
+    """Return the image height, i.e. ``img.shape[0]``.
+
+    Raises:
+        TypeError: if ``img`` is a list or tuple rather than an array. The
+            message reports the container type, its length and the types of
+            its first three elements, which is enough to identify the
+            offending value without dumping the whole sequence.
+    """
+    if isinstance(img, (list, tuple)):
+        head_types = [type(x).__name__ for x in img[:3]]
+        raise TypeError(
+            f"image:height expects an image array, got {type(img).__name__} "
+            f"with {len(img)} elements (first element types: {head_types})"
+        )
+    return img.shape[0]
+
+
+def prim_make_image(w, h, color=None):
+    """Create a new w x h RGB image filled with color (default black).
+
+    Args:
+        w: Width in pixels; truncated to int, like the size arguments of
+            resize and crop in this library.
+        h: Height in pixels; truncated to int.
+        color: Fill value assigned to every pixel; defaults to [0, 0, 0].
+
+    Returns:
+        A uint8 array of shape (h, w, 3).
+    """
+    if color is None:
+        color = [0, 0, 0]
+    # Sizes may arrive as floats (e.g. from arithmetic on other dimensions);
+    # np.zeros rejects non-integer shapes, so coerce them here.
+    img = np.zeros((int(h), int(w), 3), dtype=np.uint8)
+    img[:] = color
+    return img
+
+
+def prim_copy(img):
+    """Return the result of ``img.copy()``."""
+    duplicate = img.copy()
+    return duplicate
+
+
+def prim_pixel(img, x, y):
+    """Return the pixel at (x, y) as a list; [0, 0, 0] when out of bounds.
+
+    Coordinates are truncated to int before indexing.
+    """
+    rows, cols = img.shape[:2]
+    inside = 0 <= x < cols and 0 <= y < rows
+    if not inside:
+        return [0, 0, 0]
+    return list(img[int(y), int(x)])
+
+
+def prim_set_pixel(img, x, y, color):
+    """Return a copy of img with the pixel at (x, y) set to color.
+
+    Out-of-bounds coordinates leave the copy unchanged; the input image is
+    never modified.
+    """
+    out = img.copy()
+    rows, cols = out.shape[:2]
+    if not (0 <= x < cols and 0 <= y < rows):
+        return out
+    out[int(y), int(x)] = color
+    return out
+
+
+def prim_sample(img, x, y):
+    """Bilinearly interpolate img at float coordinates (x, y).
+
+    Coordinates are first clamped to [0, size - 1.001] on each axis. The
+    result is returned as a list of floats, one per channel.
+    """
+    rows, cols = img.shape[:2]
+    x = max(0, min(cols - 1.001, x))
+    y = max(0, min(rows - 1.001, y))
+
+    left, top = int(x), int(y)
+    right = min(left + 1, cols - 1)
+    below = min(top + 1, rows - 1)
+    fx = x - left
+    fy = y - top
+
+    def texel(r, c):
+        return img[r, c].astype(float)
+
+    # Blend horizontally along the two neighbouring rows, then vertically.
+    upper = texel(top, left) * (1 - fx) + texel(top, right) * fx
+    lower = texel(below, left) * (1 - fx) + texel(below, right) * fx
+    return list(upper * (1 - fy) + lower * fy)
+
+
+def prim_channel(img, c):
+    """Return channel ``c`` of img, indexed on the third axis (0=R, 1=G, 2=B)."""
+    plane = img[:, :, c]
+    return plane
+
+
+def prim_merge_channels(r, g, b):
+    """Stack three single-channel arrays along axis 2 and cast to uint8."""
+    stacked = np.stack((r, g, b), axis=2)
+    return stacked.astype(np.uint8)
+
+
+def prim_resize(img, w, h, mode="linear"):
+    """Resize img to w x h pixels with cv2.resize.
+
+    ``mode`` selects the interpolation: "nearest", "cubic" or "area"; any
+    other value (including the default "linear") uses linear interpolation.
+    """
+    named_flags = (
+        ("nearest", cv2.INTER_NEAREST),
+        ("cubic", cv2.INTER_CUBIC),
+        ("area", cv2.INTER_AREA),
+    )
+    interp = cv2.INTER_LINEAR
+    for label, flag in named_flags:
+        if mode == label:
+            interp = flag
+            break
+    target = (int(w), int(h))
+    return cv2.resize(img, target, interpolation=interp)
+
+
+def prim_crop(img, x, y, w, h):
+    """Crop a w x h rectangle whose top-left corner is (x, y).
+
+    All arguments are truncated to int. The origin is clamped into the image
+    and the size is limited to what remains to the right of / below the
+    origin. A zero or negative size yields an empty crop.
+
+    Returns:
+        A copy of the selected region.
+    """
+    x, y, w, h = int(x), int(y), int(w), int(h)
+    ih, iw = img.shape[:2]
+    x = max(0, min(x, iw - 1))
+    y = max(0, min(y, ih - 1))
+    # Clamp the size at zero: a negative extent would otherwise produce a
+    # negative slice stop, which NumPy interprets as counting from the end
+    # and silently returns an unrelated region.
+    w = max(0, min(w, iw - x))
+    h = max(0, min(h, ih - y))
+    return img[y:y+h, x:x+w].copy()
+
+
+def prim_paste(dst, src, x, y):
+    """Return a copy of dst with src pasted so its top-left lands at (x, y).
+
+    The part of src that falls outside dst is clipped away; if nothing
+    overlaps, the copy of dst is returned unchanged.
+    """
+    out = dst.copy()
+    x, y = int(x), int(y)
+    src_h, src_w = src.shape[:2]
+    dst_h, dst_w = dst.shape[:2]
+
+    # Visible window of src, in src coordinates.
+    src_left, src_top = max(0, -x), max(0, -y)
+    src_right = min(src_w, dst_w - x)
+    src_bottom = min(src_h, dst_h - y)
+    if src_right <= src_left or src_bottom <= src_top:
+        return out
+
+    # Matching window in dst coordinates.
+    dst_left, dst_top = max(0, x), max(0, y)
+    span_w = src_right - src_left
+    span_h = src_bottom - src_top
+    out[dst_top:dst_top+span_h, dst_left:dst_left+span_w] = src[src_top:src_bottom, src_left:src_right]
+    return out
+
+
+# Export table for this library: maps each primitive name to the function
+# in this module that implements it.
+PRIMITIVES = {
+    # Dimensions
+    'width': prim_width,
+    'height': prim_height,
+
+    # Creation
+    'make-image': prim_make_image,
+    'copy': prim_copy,
+
+    # Pixel access
+    'pixel': prim_pixel,
+    'set-pixel': prim_set_pixel,
+    'sample': prim_sample,
+
+    # Channels
+    'channel': prim_channel,
+    'merge-channels': prim_merge_channels,
+
+    # Geometry
+    'resize': prim_resize,
+    'crop': prim_crop,
+    'paste': prim_paste,
+}
--- a/l1/sexp_effects/primitive_libs/math.py
+++ b/l1/sexp_effects/primitive_libs/math.py
@@ -0,0 +1,164 @@
+"""
+Math Primitives Library
+
+Trigonometry, rounding, clamping, random numbers, etc.
+"""
+import math
+import random as rand_module
+
+
+def prim_sin(x):
+    """Return the sine of x (delegates to math.sin)."""
+    sine = math.sin(x)
+    return sine
+
+
+def prim_cos(x):
+    """Return the cosine of x (delegates to math.cos)."""
+    cosine = math.cos(x)
+    return cosine
+
+
+def prim_tan(x):
+    """Return the tangent of x (delegates to math.tan)."""
+    tangent = math.tan(x)
+    return tangent
+
+
+def prim_asin(x):
+    """Return the arc sine of x (delegates to math.asin)."""
+    angle = math.asin(x)
+    return angle
+
+
+def prim_acos(x):
+    """Return the arc cosine of x (delegates to math.acos)."""
+    angle = math.acos(x)
+    return angle
+
+
+def prim_atan(x):
+    """Return the arc tangent of x (delegates to math.atan)."""
+    angle = math.atan(x)
+    return angle
+
+
+def prim_atan2(y, x):
+    """Return math.atan2(y, x); note the (y, x) argument order."""
+    angle = math.atan2(y, x)
+    return angle
+
+
+def prim_sqrt(x):
+    """Return the square root of x (delegates to math.sqrt)."""
+    root = math.sqrt(x)
+    return root
+
+
+def prim_pow(x, y):
+    """Return x raised to the power y via math.pow (always a float)."""
+    power = math.pow(x, y)
+    return power
+
+
+def prim_exp(x):
+    """Return e raised to the power x (delegates to math.exp)."""
+    value = math.exp(x)
+    return value
+
+
+def prim_log(x, base=None):
+    """Return the logarithm of x: natural when base is None, else to ``base``."""
+    return math.log(x) if base is None else math.log(x, base)
+
+
+def prim_abs(x):
+    """Return the absolute value of x (built-in abs)."""
+    magnitude = abs(x)
+    return magnitude
+
+
+def prim_floor(x):
+    """Return math.floor(x)."""
+    lowered = math.floor(x)
+    return lowered
+
+
+def prim_ceil(x):
+    """Return math.ceil(x)."""
+    raised = math.ceil(x)
+    return raised
+
+
+def prim_round(x):
+    """Return x rounded with the built-in one-argument round()."""
+    rounded = round(x)
+    return rounded
+
+
+def prim_min(*args):
+    """Return the smallest argument.
+
+    A single iterable argument is treated as the collection to search;
+    otherwise the arguments themselves are compared.
+    """
+    single_iterable = len(args) == 1 and hasattr(args[0], '__iter__')
+    candidates = args[0] if single_iterable else args
+    return min(candidates)
+
+
+def prim_max(*args):
+    """Return the largest argument.
+
+    A single iterable argument is treated as the collection to search;
+    otherwise the arguments themselves are compared.
+    """
+    single_iterable = len(args) == 1 and hasattr(args[0], '__iter__')
+    candidates = args[0] if single_iterable else args
+    return max(candidates)
+
+
+def prim_clamp(x, lo, hi):
+    """Limit x to the range [lo, hi]."""
+    capped = min(hi, x)
+    return max(lo, capped)
+
+
+def prim_lerp(a, b, t):
+    """Interpolate linearly from a (t=0) to b (t=1): a + (b - a) * t."""
+    span = b - a
+    return a + span * t
+
+
+def prim_smoothstep(edge0, edge1, x):
+    """Smooth Hermite interpolation from 0 to 1 as x moves from edge0 to edge1.
+
+    The normalised position is clamped to [0, 1] and then shaped with
+    t * t * (3 - 2 * t).
+
+    When edge0 == edge1 the interval is degenerate; the function then acts
+    as a hard step: 0.0 for x below the edge, 1.0 otherwise.
+    """
+    if edge0 == edge1:
+        # The normalisation below would divide by zero.
+        return 0.0 if x < edge0 else 1.0
+    t = max(0.0, min(1.0, (x - edge0) / (edge1 - edge0)))
+    return t * t * (3 - 2 * t)
+
+
+def prim_random(lo=0.0, hi=1.0):
+    """Return a random float drawn uniformly between lo and hi."""
+    sample = rand_module.uniform(lo, hi)
+    return sample
+
+
+def prim_randint(lo, hi):
+    """Return a random integer N with lo <= N <= hi.
+
+    Whole-valued floats such as 3.0 are converted to int first, because
+    random.randint rejects float arguments on Python 3.12+. Floats with a
+    fractional part are passed through unchanged, so they are still
+    rejected by random.randint rather than silently truncated.
+    """
+    lo, hi = (
+        int(v) if isinstance(v, float) and v.is_integer() else v
+        for v in (lo, hi)
+    )
+    return rand_module.randint(lo, hi)
+
+
+def prim_gaussian(mean=0.0, std=1.0):
+    """Return a random float from a Gaussian with the given mean and std."""
+    sample = rand_module.gauss(mean, std)
+    return sample
+
+
+def prim_sign(x):
+    """Return 1 for positive x, -1 for negative x, and 0 otherwise."""
+    if x > 0:
+        return 1
+    if x < 0:
+        return -1
+    return 0
+
+
+def prim_fract(x):
+    """Return x minus floor(x), the fractional part of x."""
+    whole = math.floor(x)
+    return x - whole
+
+
+# Export table for this library: maps each primitive name to its
+# implementation. Most entries are functions; the "Constants" entries at the
+# end are plain float values, not callables.
+PRIMITIVES = {
+    # Trigonometry
+    'sin': prim_sin,
+    'cos': prim_cos,
+    'tan': prim_tan,
+    'asin': prim_asin,
+    'acos': prim_acos,
+    'atan': prim_atan,
+    'atan2': prim_atan2,
+
+    # Powers and roots
+    'sqrt': prim_sqrt,
+    'pow': prim_pow,
+    'exp': prim_exp,
+    'log': prim_log,
+
+    # Rounding
+    'abs': prim_abs,
+    'floor': prim_floor,
+    'ceil': prim_ceil,
+    'round': prim_round,
+    'sign': prim_sign,
+    'fract': prim_fract,
+
+    # Min/max/clamp
+    'min': prim_min,
+    'max': prim_max,
+    'clamp': prim_clamp,
+    'lerp': prim_lerp,
+    'smoothstep': prim_smoothstep,
+
+    # Random
+    'random': prim_random,
+    'randint': prim_randint,
+    'gaussian': prim_gaussian,
+
+    # Constants
+    'pi': math.pi,
+    'tau': math.tau,
+    'e': math.e,
+}
--- a/l1/sexp_effects/primitive_libs/streaming.py
+++ b/l1/sexp_effects/primitive_libs/streaming.py
@@ -0,0 +1,593 @@
+"""
+Streaming primitives for video/audio processing.
+
+These primitives handle video source reading and audio analysis,
+keeping the interpreter completely generic.
+
+GPU Acceleration:
+- Set STREAMING_GPU_PERSIST=1 to output CuPy arrays (frames stay on GPU)
+- Hardware video decoding (NVDEC) is used when available
+- Dramatically improves performance on GPU nodes
+
+Async Prefetching:
+- Background frame prefetching is enabled by default; set STREAMING_PREFETCH to any value other than 1 (e.g. 0) to disable it
+- Decodes upcoming frames while current frame is being processed
+"""
+
+import os
+import numpy as np
+import subprocess
+import json
+import threading
+from collections import deque
+from pathlib import Path
+
+# Try to import CuPy for GPU acceleration
+try:
+    import cupy as cp
+    CUPY_AVAILABLE = True
+except ImportError:
+    # CuPy is optional: without it `cp` is None and GPU_PERSIST stays off.
+    cp = None
+    CUPY_AVAILABLE = False
+
+# GPU persistence mode - output CuPy arrays instead of numpy
+# Disabled by default until all primitives support GPU frames
+# (needs both STREAMING_GPU_PERSIST=1 and an importable CuPy).
+GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "0") == "1" and CUPY_AVAILABLE
+
+# Async prefetch mode - decode frames in background thread
+# (on unless STREAMING_PREFETCH is set to something other than "1").
+PREFETCH_ENABLED = os.environ.get("STREAMING_PREFETCH", "1") == "1"
+# Default buffer size for PrefetchingVideoSource, in frames.
+PREFETCH_BUFFER_SIZE = int(os.environ.get("STREAMING_PREFETCH_SIZE", "10"))
+
+# Check for hardware decode support (cached)
+# None means "not probed yet"; _check_hwdec() stores True or False here.
+_HWDEC_AVAILABLE = None
+
+
+def _check_hwdec():
+    """Report whether NVIDIA hardware decoding looks usable.
+
+    The answer is computed once and cached in _HWDEC_AVAILABLE. It is True
+    only when `nvidia-smi` exits successfully and `ffmpeg -hwaccels` lists
+    "cuda"; any exception while probing counts as "not available".
+    """
+    global _HWDEC_AVAILABLE
+    if _HWDEC_AVAILABLE is None:
+        try:
+            smi = subprocess.run(["nvidia-smi"], capture_output=True, timeout=2)
+            if smi.returncode == 0:
+                accels = subprocess.run(["ffmpeg", "-hwaccels"], capture_output=True, text=True, timeout=5)
+                _HWDEC_AVAILABLE = "cuda" in accels.stdout
+            else:
+                _HWDEC_AVAILABLE = False
+        except Exception:
+            _HWDEC_AVAILABLE = False
+    return _HWDEC_AVAILABLE
+
+
+class VideoSource:
+    """Video source with persistent streaming pipe for fast sequential reads.
+
+    A long-lived ffmpeg subprocess decodes the file to raw rgb24 frames on a
+    pipe. Sequential reads consume that pipe; reads that move backwards by
+    more than one frame, or forwards by more than two seconds, restart
+    ffmpeg at the requested position. Times beyond the probed duration wrap
+    around, so the video loops.
+
+    Instances are not safe for concurrent use: all reads share one pipe and
+    one stream position.
+    """
+
+    def __init__(self, path: str, fps: float = 30):
+        """Probe the file with ffprobe; the decoder itself starts lazily.
+
+        Args:
+            path: Path to the video file.
+            fps: Output frame rate requested from ffmpeg.
+
+        Raises:
+            FileNotFoundError: if the file does not exist.
+            RuntimeError: if ffprobe fails or prints invalid JSON.
+        """
+        self.path = Path(path)
+        self.fps = fps  # Output fps for the stream
+        self._frame_size = None
+        self._duration = None
+        self._proc = None  # Persistent ffmpeg process
+        self._stream_time = 0.0  # Current position in stream
+        self._frame_time = 1.0 / fps  # Time per frame at output fps
+        self._last_read_time = -1
+        self._cached_frame = None
+
+        # Check if file exists
+        if not self.path.exists():
+            raise FileNotFoundError(f"Video file not found: {self.path}")
+
+        # Get video info
+        cmd = ["ffprobe", "-v", "quiet", "-print_format", "json",
+               "-show_streams", str(self.path)]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Failed to probe video '{self.path}': {result.stderr}")
+        try:
+            info = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise RuntimeError(f"Invalid video file or ffprobe failed: {self.path}")
+
+        # Only the first video stream is considered.
+        for stream in info.get("streams", []):
+            if stream.get("codec_type") == "video":
+                self._frame_size = (stream.get("width", 720), stream.get("height", 720))
+                # Try direct duration field first
+                if "duration" in stream:
+                    self._duration = float(stream["duration"])
+                # Fall back to tags.DURATION (webm format: "00:01:00.124000000")
+                elif "tags" in stream and "DURATION" in stream["tags"]:
+                    dur_str = stream["tags"]["DURATION"]
+                    parts = dur_str.split(":")
+                    if len(parts) == 3:
+                        h, m, s = parts
+                        self._duration = int(h) * 3600 + int(m) * 60 + float(s)
+                break
+
+        # Fallback: check format duration if stream duration not found
+        if self._duration is None and "format" in info and "duration" in info["format"]:
+            self._duration = float(info["format"]["duration"])
+
+        if not self._frame_size:
+            self._frame_size = (720, 720)
+
+        import sys
+        print(f"VideoSource: {self.path.name} duration={self._duration} size={self._frame_size}", file=sys.stderr)
+
+    def _stop_proc(self):
+        """Kill the current ffmpeg process, if any, and release its resources."""
+        proc, self._proc = self._proc, None
+        if proc is None:
+            return
+        proc.kill()
+        # Close our ends of the pipes; the stream is restarted on every seek,
+        # so leaving them open would leak file descriptors.
+        for pipe in (proc.stdout, proc.stderr):
+            if pipe is not None:
+                try:
+                    pipe.close()
+                except OSError:
+                    pass
+        # Reap the child so killed decoders do not pile up as zombies.
+        try:
+            proc.wait(timeout=1.0)
+        except subprocess.TimeoutExpired:
+            pass
+
+    def _read_stderr(self, timeout: float) -> str:
+        """Return ffmpeg stderr text that arrives within `timeout` seconds.
+
+        Best effort and diagnostic only: returns "" when there is no process,
+        nothing to read, or the pipe cannot be polled.
+        """
+        import select
+        proc = self._proc
+        if proc is None or proc.stderr is None:
+            return ""
+        try:
+            readable, _, _ = select.select([proc.stderr], [], [], timeout)
+            if not readable:
+                return ""
+            # read1() returns whatever is available. read(4096) on the
+            # buffered pipe would block until 4096 bytes or EOF, stalling the
+            # reader while ffmpeg is still running.
+            return proc.stderr.read1(4096).decode('utf-8', errors='ignore')
+        except (OSError, ValueError):
+            return ""
+
+    def _start_stream(self, seek_time: float = 0):
+        """Start or restart the ffmpeg streaming process.
+
+        Uses NVIDIA hardware decoding (NVDEC) when available for better performance.
+        Any previous process is stopped first. After launch, up to 0.5 s is
+        spent waiting for early ffmpeg error output, which is echoed to stderr.
+
+        Raises:
+            FileNotFoundError: if the video file no longer exists.
+        """
+        self._stop_proc()
+
+        # Check file exists before trying to open
+        if not self.path.exists():
+            raise FileNotFoundError(f"Video file not found: {self.path}")
+
+        w, h = self._frame_size
+
+        # Build ffmpeg command with optional hardware decode
+        cmd = ["ffmpeg", "-v", "error"]
+
+        # Use hardware decode if available (significantly faster)
+        if _check_hwdec():
+            cmd.extend(["-hwaccel", "cuda"])
+
+        cmd.extend([
+            "-ss", f"{seek_time:.3f}",
+            "-i", str(self.path),
+            "-f", "rawvideo", "-pix_fmt", "rgb24",
+            "-s", f"{w}x{h}",
+            "-r", str(self.fps),  # Output at specified fps
+            "-"
+        ])
+
+        self._proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        self._stream_time = seek_time
+
+        # Check if process started successfully by reading first bit of stderr
+        err = self._read_stderr(0.5)
+        if err:
+            import sys
+            print(f"ffmpeg error for {self.path.name}: {err}", file=sys.stderr)
+
+    def _read_frame_from_stream(self):
+        """Read one frame from the stream.
+
+        Returns CuPy array if GPU_PERSIST is enabled, numpy array otherwise.
+        Returns None when there is no live process or the pipe yields less
+        than a full frame.
+        """
+        w, h = self._frame_size
+        frame_size = w * h * 3
+
+        if not self._proc or self._proc.poll() is not None:
+            return None
+
+        data = self._proc.stdout.read(frame_size)
+        if len(data) < frame_size:
+            return None
+
+        # copy() makes the frame writable and independent of the bytes buffer.
+        frame = np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
+
+        # Transfer to GPU if persistence mode enabled
+        if GPU_PERSIST:
+            return cp.asarray(frame)
+        return frame
+
+    def read(self) -> np.ndarray:
+        """Read frame (uses last cached or t=0)."""
+        if self._cached_frame is not None:
+            return self._cached_frame
+        return self.read_at(0)
+
+    def read_at(self, t: float) -> np.ndarray:
+        """Read frame at specific time using streaming with smart seeking.
+
+        Args:
+            t: Timeline position in seconds; wrapped modulo the duration
+                when the duration is known.
+
+        Returns:
+            The decoded frame; the same object is returned again when the
+            same `t` is requested twice in a row.
+
+        Raises:
+            RuntimeError: if no frame could be read after three attempts.
+        """
+        import sys
+        import time
+
+        # Cache check - return same frame for same time
+        if t == self._last_read_time and self._cached_frame is not None:
+            return self._cached_frame
+
+        # Loop time if video is shorter
+        seek_time = t
+        if self._duration and self._duration > 0:
+            seek_time = t % self._duration
+            # If we're within 0.1s of the end, wrap to beginning to avoid EOF issues
+            if seek_time > self._duration - 0.1:
+                seek_time = 0.0
+
+        # Decide whether to seek or continue streaming
+        # Seek if: no stream, going backwards (more than 1 frame), or jumping more than 2 seconds ahead
+        # Allow small backward tolerance to handle floating point and timing jitter
+        need_seek = (
+            self._proc is None or
+            self._proc.poll() is not None or
+            seek_time < self._stream_time - self._frame_time or  # More than 1 frame backward
+            seek_time > self._stream_time + 2.0
+        )
+
+        if need_seek:
+            reason = "no proc" if self._proc is None else "proc dead" if self._proc.poll() is not None else "backward" if seek_time < self._stream_time else "jump"
+            print(f"SEEK {self.path.name}: t={t:.4f} seek={seek_time:.4f} stream={self._stream_time:.4f} ({reason})", file=sys.stderr)
+            self._start_stream(seek_time)
+
+        # Skip frames to reach target time
+        while self._stream_time + self._frame_time <= seek_time:
+            frame = self._read_frame_from_stream()
+            if frame is None:
+                # Stream ended or failed - restart from seek point. The
+                # restart puts the stream position at seek_time, so there is
+                # nothing left to skip.
+                self._start_stream(seek_time)
+                time.sleep(0.05)
+                break
+            self._stream_time += self._frame_time
+
+        # Read the target frame with retry logic
+        frame = None
+        max_retries = 3
+        for attempt in range(max_retries):
+            frame = self._read_frame_from_stream()
+            if frame is not None:
+                break
+
+            # Stream failed - try restarting
+            print(f"RETRY {self.path.name}: attempt {attempt+1}/{max_retries} at t={t:.2f}", file=sys.stderr)
+
+            # Check for ffmpeg errors
+            err = self._read_stderr(0.1)
+            if err:
+                print(f"ffmpeg error: {err}", file=sys.stderr)
+
+            # Wait a bit and restart
+            time.sleep(0.1)
+            self._start_stream(seek_time)
+
+            # Give ffmpeg time to start
+            time.sleep(0.1)
+
+        if frame is None:
+            raise RuntimeError(f"Failed to read video frame from {self.path.name} at t={t:.2f} after {max_retries} retries")
+
+        self._stream_time += self._frame_time
+        self._last_read_time = t
+        self._cached_frame = frame
+        return frame
+
+    def skip(self):
+        """No-op for seek-based reading."""
+        pass
+
+    @property
+    def size(self):
+        """Frame size as (width, height)."""
+        return self._frame_size
+
+    def close(self):
+        """Stop the ffmpeg process, if any. Safe to call more than once."""
+        self._stop_proc()
+
+
+class PrefetchingVideoSource:
+    """
+    Video source with background prefetching for improved performance.
+
+    Wraps VideoSource and adds a background thread that pre-decodes
+    upcoming frames while the main thread processes the current frame.
+
+    The wrapped VideoSource is driven by both the prefetch thread and the
+    caller of read_at(); every access to it is serialised with
+    ``_source_lock`` because it owns a single ffmpeg pipe and stream position.
+    Lock order is always ``_source_lock`` then ``_buffer_lock``.
+    """
+
+    def __init__(self, path: str, fps: float = 30, buffer_size: int = None):
+        """Open the video and start the prefetch thread.
+
+        Args:
+            path: Path to the video file.
+            fps: Output frame rate passed to VideoSource.
+            buffer_size: Frames to keep buffered ahead; defaults to
+                PREFETCH_BUFFER_SIZE when None or 0.
+        """
+        self._source = VideoSource(path, fps)
+        self._source_lock = threading.Lock()  # Serialises use of self._source
+        self._buffer_size = buffer_size or PREFETCH_BUFFER_SIZE
+        self._buffer = {}  # time -> frame
+        self._buffer_lock = threading.Lock()
+        self._prefetch_time = 0.0
+        self._frame_time = 1.0 / fps
+        self._stop_event = threading.Event()
+        self._request_event = threading.Event()
+        self._target_time = 0.0
+
+        # Start prefetch thread
+        self._thread = threading.Thread(target=self._prefetch_loop, daemon=True)
+        self._thread.start()
+
+        import sys
+        print(f"PrefetchingVideoSource: {path} buffer_size={self._buffer_size}", file=sys.stderr)
+
+    def _prefetch_loop(self):
+        """Background thread that pre-reads frames."""
+        while not self._stop_event.is_set():
+            # Wait for work or timeout
+            self._request_event.wait(timeout=0.01)
+            self._request_event.clear()
+
+            if self._stop_event.is_set():
+                break
+
+            # Prefetch frames ahead of target time
+            target = self._target_time
+            with self._buffer_lock:
+                # Clean old frames (more than 1 second behind)
+                old_times = [t for t in self._buffer.keys() if t < target - 1.0]
+                for t in old_times:
+                    del self._buffer[t]
+
+                # Count how many frames we have buffered ahead
+                buffered_ahead = sum(1 for t in self._buffer.keys() if t >= target)
+
+            # Prefetch if buffer not full
+            if buffered_ahead < self._buffer_size:
+                # Find next time to prefetch
+                prefetch_t = target
+                with self._buffer_lock:
+                    existing_times = set(self._buffer.keys())
+                for _ in range(self._buffer_size):
+                    if prefetch_t not in existing_times:
+                        break
+                    prefetch_t += self._frame_time
+
+                # Read the frame (this is the slow part)
+                try:
+                    with self._source_lock:
+                        frame = self._source.read_at(prefetch_t)
+                        # Publish before releasing the source lock so that a
+                        # caller waiting on it finds the frame in the buffer.
+                        with self._buffer_lock:
+                            self._buffer[prefetch_t] = frame
+                except Exception as e:
+                    import sys
+                    print(f"Prefetch error at t={prefetch_t}: {e}", file=sys.stderr)
+
+    def _lookup(self, t: float, t_key: float):
+        """Return the buffered frame for time t, or None if there is none."""
+        with self._buffer_lock:
+            if t_key in self._buffer:
+                return self._buffer[t_key]
+            # Also check for close matches (within half frame time)
+            for buf_t, frame in self._buffer.items():
+                if abs(buf_t - t) < self._frame_time * 0.5:
+                    return frame
+        return None
+
+    def read_at(self, t: float) -> np.ndarray:
+        """Read frame at specific time, using prefetch buffer if available.
+
+        Falls back to a blocking read from the wrapped source on a buffer
+        miss and stores the result under the frame-rounded time key.
+        """
+        self._target_time = t
+        self._request_event.set()  # Wake up prefetch thread
+
+        # Round to frame time for buffer lookup
+        t_key = round(t / self._frame_time) * self._frame_time
+
+        # Check buffer first
+        frame = self._lookup(t, t_key)
+        if frame is not None:
+            return frame
+
+        # Not in buffer - read directly (blocking)
+        with self._source_lock:
+            # The prefetch thread may have decoded this frame while we were
+            # waiting for the lock; re-check to avoid a needless seek.
+            frame = self._lookup(t, t_key)
+            if frame is None:
+                frame = self._source.read_at(t)
+
+        # Store in buffer
+        with self._buffer_lock:
+            self._buffer[t_key] = frame
+
+        return frame
+
+    def read(self) -> np.ndarray:
+        """Read the frame at t=0 (equivalent to read_at(0))."""
+        return self.read_at(0)
+
+    def skip(self):
+        """No-op for seek-based reading."""
+        pass
+
+    @property
+    def size(self):
+        """Frame size of the wrapped source."""
+        return self._source.size
+
+    @property
+    def path(self):
+        """Path of the wrapped source."""
+        return self._source.path
+
+    def close(self):
+        """Stop the prefetch thread (waiting up to 1 s) and close the source."""
+        self._stop_event.set()
+        self._request_event.set()  # Wake up thread to exit
+        self._thread.join(timeout=1.0)
+        # Deliberately not taken under _source_lock: if the thread is stuck in
+        # a read, killing the decoder is what unblocks it.
+        self._source.close()
+
+
+class AudioAnalyzer:
+    """Audio analyzer for energy and beat detection.
+
+    The whole file is decoded to mono float32 samples at construction time.
+    Beat detection is stateful: it keeps a history of spectral-flux values
+    from earlier get_beat() calls and compares each new value against it.
+    """
+
+    def __init__(self, path: str, sample_rate: int = 22050):
+        """Decode the audio with ffmpeg and read its duration with ffprobe.
+
+        Args:
+            path: Path to the audio (or video) file.
+            sample_rate: Sample rate the audio is resampled to.
+
+        Raises:
+            FileNotFoundError: if the file does not exist.
+            RuntimeError: if ffmpeg/ffprobe fail or no samples are decoded.
+        """
+        self.path = Path(path)
+        self.sample_rate = sample_rate
+
+        # Check if file exists
+        if not self.path.exists():
+            raise FileNotFoundError(f"Audio file not found: {self.path}")
+
+        # Load audio via ffmpeg
+        cmd = ["ffmpeg", "-v", "error", "-i", str(self.path),
+               "-f", "f32le", "-ac", "1", "-ar", str(sample_rate), "-"]
+        result = subprocess.run(cmd, capture_output=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Failed to load audio '{self.path}': {result.stderr.decode()}")
+        self._audio = np.frombuffer(result.stdout, dtype=np.float32)
+        if len(self._audio) == 0:
+            raise RuntimeError(f"Audio file is empty or invalid: {self.path}")
+
+        # Get duration
+        cmd = ["ffprobe", "-v", "quiet", "-print_format", "json",
+               "-show_format", str(self.path)]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise RuntimeError(f"Failed to probe audio '{self.path}': {result.stderr}")
+        info = json.loads(result.stdout)
+        # If ffprobe reports no duration, derive it from the decoded samples
+        # instead of guessing a fixed value.
+        decoded_duration = len(self._audio) / sample_rate
+        self.duration = float(info.get("format", {}).get("duration", decoded_duration))
+
+        # Beat detection state
+        self._flux_history = []
+        self._last_beat_time = -1
+        self._beat_count = 0
+        self._last_beat_check_time = -1
+        # Cache beat result for current time (so multiple scans see same result)
+        self._beat_cache_time = -1
+        self._beat_cache_result = False
+
+    def get_energy(self, t: float) -> float:
+        """Get energy level at time t (0-1).
+
+        Computed as 3x the RMS of up to 1024 samples centred on t, capped at
+        1.0. Returns 0.0 when t lies outside the audio.
+        """
+        idx = int(t * self.sample_rate)
+        start = max(0, idx - 512)
+        end = min(len(self._audio), idx + 512)
+        if start >= end:
+            return 0.0
+        rms = np.sqrt(np.mean(self._audio[start:end] ** 2))
+        # float() so callers get a native Python float, as annotated, rather
+        # than a NumPy scalar.
+        return float(min(1.0, rms * 3.0))
+
+    def get_beat(self, t: float) -> bool:
+        """Check if there's a beat at time t.
+
+        The result for the most recently queried time is cached, so repeated
+        queries for the same t neither re-run detection nor change its state.
+        """
+        # Return cached result if same time (multiple scans query same frame)
+        if t == self._beat_cache_time:
+            return self._beat_cache_result
+
+        is_beat = self._detect_beat(t)
+
+        # Cache result for this time
+        self._beat_cache_time = t
+        self._beat_cache_result = is_beat
+        return is_beat
+
+    def _detect_beat(self, t: float) -> bool:
+        """Run spectral-flux beat detection at time t and update the state."""
+        idx = int(t * self.sample_rate)
+        size = 2048
+
+        start, end = max(0, idx - size//2), min(len(self._audio), idx + size//2)
+        if end - start < size/2:
+            return False
+        curr = self._audio[start:end]
+
+        # Comparison window: the same span shifted 512 samples earlier.
+        pstart, pend = max(0, start - 512), max(0, end - 512)
+        if pend <= pstart:
+            return False
+        prev = self._audio[pstart:pend]
+
+        curr_spec = np.abs(np.fft.rfft(curr * np.hanning(len(curr))))
+        prev_spec = np.abs(np.fft.rfft(prev * np.hanning(len(prev))))
+
+        # Flux: mean positive increase in magnitude across shared bins.
+        n = min(len(curr_spec), len(prev_spec))
+        flux = np.sum(np.maximum(0, curr_spec[:n] - prev_spec[:n])) / (n + 1)
+
+        self._flux_history.append((t, flux))
+        if len(self._flux_history) > 50:
+            self._flux_history = self._flux_history[-50:]
+
+        # Too little history to form a threshold yet.
+        if len(self._flux_history) < 5:
+            return False
+
+        recent = [f for _, f in self._flux_history[-20:]]
+        threshold = np.mean(recent) + 1.5 * np.std(recent)
+
+        # bool() so the result is a native bool, as annotated, not np.bool_.
+        # Beats closer than 0.1 s to the previous one are suppressed.
+        is_beat = bool(flux > threshold and (t - self._last_beat_time) > 0.1)
+        if is_beat:
+            self._last_beat_time = t
+            if t > self._last_beat_check_time:
+                self._beat_count += 1
+                self._last_beat_check_time = t
+        return is_beat
+
+    def get_beat_count(self, t: float) -> int:
+        """Return the number of beats detected so far, after checking time t.
+
+        The count only reflects times that have actually been passed to
+        get_beat()/get_beat_count(); it is not recomputed from the audio.
+        """
+        # Ensure beat detection has run up to this time
+        self.get_beat(t)
+        return self._beat_count
+
+
+# === Primitives ===
+
+def prim_make_video_source(path: str, fps: float = 30):
+    """Create a video source for the file at ``path``.
+
+    Returns a PrefetchingVideoSource when PREFETCH_ENABLED is true
+    (STREAMING_PREFETCH=1, the default), otherwise a plain VideoSource.
+    """
+    source_cls = PrefetchingVideoSource if PREFETCH_ENABLED else VideoSource
+    return source_cls(path, fps)
+
+
+def prim_source_read(source: "VideoSource", t: float = None):
+    """Read a frame from a video source.
+
+    Args:
+        source: A VideoSource or PrefetchingVideoSource.
+        t: Time in seconds; when None, ``source.read()`` is used instead.
+
+    Returns:
+        The frame returned by the source.
+    """
+    if t is None:
+        return source.read()
+
+    frame = source.read_at(t)
+    # Debug: show source and time during the first tenth of each second.
+    if int(t * 10) % 10 == 0:
+        import sys
+        # Only plain VideoSource tracks a stream position; the prefetching
+        # wrapper has no _stream_time, so do not assume the attribute exists.
+        stream_time = getattr(source, "_stream_time", None)
+        position = f"{stream_time:.2f}" if stream_time is not None else "n/a"
+        print(f"READ {source.path.name}: t={t:.2f} stream={position}", file=sys.stderr)
+    return frame
+
+
+def prim_source_skip(source: VideoSource):
+    """Call ``source.skip()``; nothing is returned."""
+    source.skip()
+    return None
+
+
+def prim_source_size(source: VideoSource):
+    """Return the source's ``size`` property, a (width, height) pair."""
+    dimensions = source.size
+    return dimensions
+
+
+def prim_make_audio_analyzer(path: str):
+    """Build an AudioAnalyzer for the file at ``path`` (default sample rate)."""
+    analyzer = AudioAnalyzer(path)
+    return analyzer
+
+
+def prim_audio_energy(analyzer: AudioAnalyzer, t: float) -> float:
+    """Return ``analyzer.get_energy(t)``, the energy level (0-1) at time t."""
+    energy = analyzer.get_energy(t)
+    return energy
+
+
+def prim_audio_beat(analyzer: AudioAnalyzer, t: float) -> bool:
+    """Return ``analyzer.get_beat(t)``: whether a beat is detected at time t."""
+    beat = analyzer.get_beat(t)
+    return beat
+
+
+def prim_audio_beat_count(analyzer: AudioAnalyzer, t: float) -> int:
+    """Return ``analyzer.get_beat_count(t)``, the cumulative beat count."""
+    count = analyzer.get_beat_count(t)
+    return count
+
+
+def prim_audio_duration(analyzer: AudioAnalyzer) -> float:
+    """Return the analyzer's ``duration`` attribute, in seconds."""
+    seconds = analyzer.duration
+    return seconds
+
+
+# Export primitives
+# Maps each primitive name to the function in this module that implements it.
+PRIMITIVES = {
+    # Video source
+    'make-video-source': prim_make_video_source,
+    'source-read': prim_source_read,
+    'source-skip': prim_source_skip,
+    'source-size': prim_source_size,
+
+    # Audio analyzer
+    'make-audio-analyzer': prim_make_audio_analyzer,
+    'audio-energy': prim_audio_energy,
+    'audio-beat': prim_audio_beat,
+    'audio-beat-count': prim_audio_beat_count,
+    'audio-duration': prim_audio_duration,
+}
--- a/l1/sexp_effects/primitive_libs/streaming_gpu.py
+++ b/l1/sexp_effects/primitive_libs/streaming_gpu.py
--- a/l1/sexp_effects/primitive_libs/xector.py
+++ b/l1/sexp_effects/primitive_libs/xector.py