Add fused-pipeline primitive and test for compiled CUDA kernels
This commit is contained in:
@@ -842,5 +842,101 @@ def _get_cpu_primitives():
|
||||
|
||||
# Primitive table for this module: starts as a copy of whatever
# _get_cpu_primitives() returns, so entries added to PRIMITIVES later do not
# modify that returned object's top-level contents.
PRIMITIVES = _get_cpu_primitives().copy()
|
||||
|
||||
# Optional fused CUDA kernel compiler.
# Both names keep their "unavailable" defaults unless a GPU is present AND the
# compiler module imports cleanly; a failed import leaves them untouched.
_FUSED_KERNELS_AVAILABLE = False
_compile_frame_pipeline = None
if GPU_AVAILABLE:
    try:
        from streaming.sexp_to_cuda import compile_frame_pipeline as _compile_frame_pipeline
    except ImportError as exc:
        print(f"[streaming_gpu] Fused kernels not available: {exc}", file=sys.stderr)
    else:
        _FUSED_KERNELS_AVAILABLE = True
        print("[streaming_gpu] Fused CUDA kernel compiler loaded", file=sys.stderr)
|
||||
|
||||
|
||||
# Fused pipeline cache
|
||||
# Keys are "<width>x<height>_<md5 hex of the effect spec>" strings; values are
# the objects returned by _compile_frame_pipeline. Nothing in the code shown
# here evicts entries.
_FUSED_PIPELINE_CACHE = {}
|
||||
|
||||
|
||||
def _apply_effects_unfused(img, effects_list, dynamic_params):
    """Apply each effect in order using the standalone gpu_* primitives."""
    result = img
    for effect in effects_list:
        op = effect['op']
        if op == 'rotate':
            angle = dynamic_params.get('rotate_angle', effect.get('angle', 0))
            result = gpu_rotate(result, angle)
        elif op == 'zoom':
            amount = dynamic_params.get('zoom_amount', effect.get('amount', 1.0))
            result = gpu_zoom(result, amount)
        elif op == 'hue_shift':
            # Only the static value from the effect dict is used here.
            result = gpu_hue_shift(result, effect.get('degrees', 0))
        elif op == 'ripple':
            result = gpu_ripple(
                result,
                amplitude=effect.get('amplitude', 10),
                frequency=effect.get('frequency', 8),
                decay=effect.get('decay', 2),
                phase=dynamic_params.get('ripple_phase', effect.get('phase', 0)),
                center_x=effect.get('center_x'),
                center_y=effect.get('center_y'),
            )
        elif op == 'brightness':
            # Brightness is expressed through gpu_contrast with a third
            # argument of 0; only the static factor is used here.
            result = gpu_contrast(result, effect.get('factor', 1.0), 0)
        elif op == 'invert':
            result = gpu_invert(result)
        # Any other op is skipped without error, as before.
    return result


def _fused_pipeline_cache_key(effects_list, w, h):
    """Build the cache key for a compiled pipeline of the given size."""
    import hashlib

    # Sort each effect's items by key so two equal specs whose dict keys were
    # inserted in a different order share one compiled pipeline. 'src2' is
    # left out of the key, exactly as in the previous implementation.
    spec = [
        (
            effect['op'],
            sorted(
                ((k, v) for k, v in effect.items() if k != 'src2'),
                key=lambda item: str(item[0]),
            ),
        )
        for effect in effects_list
    ]
    digest = hashlib.md5(str(spec).encode()).hexdigest()
    return f"{w}x{h}_{digest}"


def prim_fused_pipeline(img, effects_list, **dynamic_params):
    """
    Apply a list of effects to an image, fused into one compiled pipeline.

    When the fused kernel compiler was imported successfully, the effect list
    is compiled once per (width, height, effect spec) via
    _compile_frame_pipeline, cached in _FUSED_PIPELINE_CACHE, and invoked as
    ``pipeline(gpu_img, **dynamic_params)``.

    When it is not available, the effects are applied one at a time with the
    standalone gpu_* primitives. In that fallback only ``rotate_angle``,
    ``zoom_amount`` and ``ripple_phase`` are read from ``dynamic_params``;
    every other value comes from the effect dict, and ops other than those
    listed below are skipped.

    Args:
        img: Input image. The fused path requires a ``shape`` attribute whose
            first two entries are (height, width).
        effects_list: List of effect dicts, each with an 'op' key, e.g.:
            [{'op': 'rotate', 'angle': 45.0},
             {'op': 'hue_shift', 'degrees': 90.0},
             {'op': 'ripple', 'amplitude': 10, ...}]
        **dynamic_params: Per-frame parameters such as
            rotate_angle=45, ripple_phase=0.5

    Returns:
        Whatever the compiled pipeline (or, in the fallback, the last gpu_*
        primitive) returns; ``img`` itself if the fallback runs with an empty
        effect list.

    Raises:
        ValueError: On the fused path, if ``img`` has no ``shape`` attribute.
        KeyError: If an effect dict has no 'op' key.

    Supported ops: rotate, zoom, ripple, invert, hue_shift, brightness
    """
    if not _FUSED_KERNELS_AVAILABLE:
        return _apply_effects_unfused(img, effects_list, dynamic_params)

    if not hasattr(img, 'shape'):
        raise ValueError("Image must have shape attribute")
    h, w = img.shape[:2]

    # Compile on first use for this size/spec, then reuse.
    cache_key = _fused_pipeline_cache_key(effects_list, w, h)
    if cache_key not in _FUSED_PIPELINE_CACHE:
        _FUSED_PIPELINE_CACHE[cache_key] = _compile_frame_pipeline(effects_list, w, h)
    pipeline = _FUSED_PIPELINE_CACHE[cache_key]

    # Move host arrays to the GPU; arrays already exposing the CUDA array
    # interface are passed through untouched.
    # NOTE(review): an earlier comment here said "and uint8", but no dtype
    # conversion is (or was) performed - confirm the compiled pipeline copes
    # with non-uint8 input.
    if GPU_AVAILABLE and not hasattr(img, '__cuda_array_interface__'):
        gpu_img = cp.asarray(img)
    else:
        gpu_img = img

    return pipeline(gpu_img, **dynamic_params)
|
||||
|
||||
|
||||
# Add GPU-specific primitives
|
||||
# Register the fused pipeline under the primitive name 'fused-pipeline'.
PRIMITIVES['fused-pipeline'] = prim_fused_pipeline
|
||||
# (The GPU video source will be added by create_cid_primitives in the task)
|
||||
|
||||
Reference in New Issue
Block a user