Add IPFS HLS streaming and GPU optimizations

- Add IPFSHLSOutput class that uploads segments to IPFS as they're created
- Update streaming task to use IPFS HLS output for distributed streaming
- Add /ipfs-stream endpoint to get IPFS playlist URL
- Update /stream endpoint to redirect to IPFS when available
- Add GPU persistence mode (STREAMING_GPU_PERSIST=1) to keep frames on GPU
- Add hardware video decoding (NVDEC) support for faster video processing
- Add GPU-accelerated primitive libraries: blending_gpu, color_ops_gpu, geometry_gpu
- Add streaming_gpu module with GPUFrame class for tracking CPU/GPU data location
- Add Dockerfile.gpu for building GPU-enabled worker image

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-03 20:23:16 +00:00
parent 5bc655f8c8
commit 86830019ad
24 changed files with 4025 additions and 96 deletions

44
Dockerfile.gpu Normal file
View File

@@ -0,0 +1,44 @@
# GPU-enabled worker image
# Based on NVIDIA CUDA with Python for CuPy support
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04

WORKDIR /app

# Install Python 3.11 and system dependencies
# (git is needed to clone the effects repo below; ffmpeg for video encode/decode)
RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.11 \
    python3.11-venv \
    python3-pip \
    git \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/* \
    && ln -sf /usr/bin/python3.11 /usr/bin/python3 \
    && ln -sf /usr/bin/python3 /usr/bin/python

# Upgrade pip
RUN python3 -m pip install --upgrade pip

# Install CPU dependencies first
# (separate layer so application-code changes don't invalidate the dependency cache)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install GPU-specific dependencies (CuPy for CUDA 12.x)
RUN pip install --no-cache-dir cupy-cuda12x

# Copy application
COPY . .

# Clone effects repo
# NOTE(review): this pins nothing - the image picks up whatever HEAD is at build time.
RUN git clone https://git.rose-ash.com/art-dag/effects.git /app/artdag-effects

# Create cache directory
RUN mkdir -p /data/cache

ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV EFFECTS_PATH=/app/artdag-effects
ENV PYTHONPATH=/app

# Default command runs celery worker
# Consumes both the dedicated "gpu" queue and the default "celery" queue.
CMD ["celery", "-A", "celery_app", "worker", "--loglevel=info", "-E", "-Q", "gpu,celery"]

View File

@@ -227,8 +227,9 @@ async def create_stream_run(
logger.warning(f"Failed to store recipe in cache: {e}")
# Continue anyway - run will still work, just won't appear in /recipes
# Submit Celery task
task = run_stream.delay(
# Submit Celery task to GPU queue for hardware-accelerated rendering
task = run_stream.apply_async(
kwargs=dict(
run_id=run_id,
recipe_sexp=request.recipe_sexp,
output_name=request.output_name,
@@ -237,6 +238,8 @@ async def create_stream_run(
actor_id=actor_id,
sources_sexp=request.sources_sexp,
audio_sexp=request.audio_sexp,
),
queue='gpu',
)
# Store in database for durability
@@ -396,7 +399,7 @@ async def get_run(
artifacts = []
output_media_type = None
if run.get("output_cid"):
# Detect media type using magic bytes
# Detect media type using magic bytes, fall back to database item_type
output_cid = run["output_cid"]
media_type = None
try:
@@ -408,6 +411,16 @@ async def get_run(
output_media_type = media_type
except Exception:
pass
# Fall back to database item_type if local detection failed
if not media_type:
try:
import database
item_types = await database.get_item_types(output_cid, run.get("actor_id"))
if item_types:
media_type = type_to_mime(item_types[0].get("type"))
output_media_type = media_type
except Exception:
pass
artifacts.append({
"cid": output_cid,
"step_name": "Output",
@@ -964,17 +977,43 @@ async def stream_run_output(
):
"""Stream the video output of a running render.
Returns the partial video file as it's being written,
allowing live preview of the render progress.
For IPFS HLS streams, redirects to the IPFS gateway playlist.
For local HLS streams, redirects to the m3u8 playlist.
For legacy MP4 streams, returns the file directly.
"""
from fastapi.responses import StreamingResponse, FileResponse
from fastapi.responses import StreamingResponse, FileResponse, RedirectResponse
from pathlib import Path
import os
import database
from celery_app import app as celery_app
await database.init_db()
# Check for IPFS HLS streaming first (distributed P2P streaming)
pending = await database.get_pending_run(run_id)
if pending and pending.get("celery_task_id"):
task_id = pending["celery_task_id"]
result = celery_app.AsyncResult(task_id)
if result.ready() and isinstance(result.result, dict):
ipfs_playlist_url = result.result.get("ipfs_playlist_url")
if ipfs_playlist_url:
logger.info(f"Redirecting to IPFS stream: {ipfs_playlist_url}")
return RedirectResponse(url=ipfs_playlist_url, status_code=302)
# Check for the streaming output file in the shared cache
cache_dir = os.environ.get("CACHE_DIR", "/data/cache")
stream_path = Path(cache_dir) / "streaming" / run_id / "output.mp4"
stream_dir = Path(cache_dir) / "streaming" / run_id
# Check for local HLS output
hls_playlist = stream_dir / "stream.m3u8"
if hls_playlist.exists():
# Redirect to the HLS playlist endpoint
return RedirectResponse(
url=f"/runs/{run_id}/hls/stream.m3u8",
status_code=302
)
# Fall back to legacy MP4 streaming
stream_path = stream_dir / "output.mp4"
if not stream_path.exists():
raise HTTPException(404, "Stream not available yet")
@@ -982,7 +1021,6 @@ async def stream_run_output(
if file_size == 0:
raise HTTPException(404, "Stream not ready")
# Return the file with headers that allow streaming of growing file
return FileResponse(
path=str(stream_path),
media_type="video/mp4",
@@ -992,3 +1030,139 @@ async def stream_run_output(
"X-Content-Size": str(file_size),
}
)
@router.get("/{run_id}/hls/{filename:path}")
async def serve_hls_content(
    run_id: str,
    filename: str,
    request: Request,
):
    """Serve HLS playlist and segments for live streaming.

    Serves stream.m3u8 (playlist) and segment_*.ts files.
    The playlist updates as new segments are rendered.
    If files aren't found locally, proxies to the GPU worker (if configured).

    Raises:
        HTTPException: 403 for path traversal, 400 for non-HLS filenames,
            404 when the file exists neither locally nor on the GPU worker.
    """
    from fastapi.responses import FileResponse, StreamingResponse
    from pathlib import Path
    import os
    import httpx

    cache_dir = os.environ.get("CACHE_DIR", "/data/cache")
    stream_dir = Path(cache_dir) / "streaming" / run_id
    file_path = stream_dir / filename

    # Security: ensure we're only serving files within stream_dir.
    # Reject ".." components outright, even on the proxy path, so a crafted
    # filename can never escape the per-run directory. (The previous version
    # raised the 403 inside a `try` whose `except Exception: pass` swallowed
    # it, disabling the check entirely.)
    if ".." in Path(filename).parts:
        raise HTTPException(403, "Invalid path")
    if stream_dir.exists():
        file_path_resolved = file_path.resolve()
        stream_dir_resolved = stream_dir.resolve()
        # is_relative_to avoids the startswith() prefix bug where
        # "/a/bc" would pass a check against "/a/b".
        if not file_path_resolved.is_relative_to(stream_dir_resolved):
            raise HTTPException(403, "Invalid path")

    # Determine content type; only HLS playlist/segment files are served.
    if filename.endswith(".m3u8"):
        media_type = "application/vnd.apple.mpegurl"
        headers = {
            # The playlist mutates as segments land - never cache it.
            "Cache-Control": "no-cache, no-store, must-revalidate",
            "Access-Control-Allow-Origin": "*",
        }
    elif filename.endswith(".ts"):
        media_type = "video/mp2t"
        headers = {
            # Segments are immutable once written - safe to cache for an hour.
            "Cache-Control": "public, max-age=3600",
            "Access-Control-Allow-Origin": "*",
        }
    else:
        raise HTTPException(400, "Invalid file type")

    # Try local file first
    if file_path.exists():
        return FileResponse(
            path=str(file_path),
            media_type=media_type,
            headers=headers,
        )

    # Fallback: proxy to GPU worker if configured
    gpu_worker_url = os.environ.get("GPU_WORKER_STREAM_URL")
    if gpu_worker_url:
        # Proxy request to GPU worker (reconstructed: the requested filename
        # must be part of the upstream URL).
        proxy_url = f"{gpu_worker_url.rstrip('/')}/{run_id}/{filename}"
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                resp = await client.get(proxy_url)
                if resp.status_code == 200:
                    return StreamingResponse(
                        content=iter([resp.content]),
                        media_type=media_type,
                        headers=headers,
                    )
        except Exception as e:
            logger.warning(f"GPU worker proxy failed: {e}")

    raise HTTPException(404, f"File not found: {filename}")
@router.get("/{run_id}/ipfs-stream")
async def get_ipfs_stream_info(run_id: str, request: Request) -> dict:
    """Get IPFS streaming info for a run.

    Returns the IPFS playlist URL and segment info if available.
    This allows clients to stream directly from IPFS gateways.

    Lookup order:
      1. Pending run -> inspect its Celery task result for playlist info.
      2. Completed run cache -> return the stored output CID as a video URL.

    Raises:
        HTTPException: 404 when the run is unknown, has no task ID, or a
            completed run has no stored IPFS CID.
    """
    # NOTE(review): `request` is unused; presumably kept for route-signature
    # consistency with the other handlers - confirm before removing.
    from celery_app import app as celery_app
    import database
    import os

    await database.init_db()

    # Try to get pending run to find the Celery task ID
    pending = await database.get_pending_run(run_id)
    if not pending:
        # Try completed runs
        run = await database.get_run_cache(run_id)
        if not run:
            raise HTTPException(404, "Run not found")
        # For completed runs, check if we have IPFS info stored
        ipfs_cid = run.get("ipfs_cid")
        if ipfs_cid:
            # Gateway base URL is expected WITHOUT a trailing slash.
            gateway = os.environ.get("IPFS_GATEWAY_URL", "https://ipfs.io/ipfs")
            return {
                "run_id": run_id,
                "status": "completed",
                "ipfs_video_url": f"{gateway}/{ipfs_cid}",
            }
        raise HTTPException(404, "No IPFS stream info available")

    task_id = pending.get("celery_task_id")
    if not task_id:
        raise HTTPException(404, "No task ID for this run")

    # Get the Celery task result
    result = celery_app.AsyncResult(task_id)
    if result.ready():
        # Task is complete - check the result for IPFS playlist info
        task_result = result.result
        # Guard: a failed task stores an exception, not a dict.
        if isinstance(task_result, dict):
            ipfs_playlist_cid = task_result.get("ipfs_playlist_cid")
            ipfs_playlist_url = task_result.get("ipfs_playlist_url")
            if ipfs_playlist_url:
                return {
                    "run_id": run_id,
                    "status": "completed",
                    "ipfs_playlist_cid": ipfs_playlist_cid,
                    "ipfs_playlist_url": ipfs_playlist_url,
                    "segment_count": task_result.get("ipfs_segment_count", 0),
                }

    # Task is still running or no IPFS info available
    return {
        "run_id": run_id,
        "status": pending.get("status", "pending"),
        "message": "IPFS streaming info not yet available"
    }

View File

@@ -100,30 +100,52 @@ class CacheService:
async def get_cache_item(self, cid: str, actor_id: str = None) -> Optional[Dict[str, Any]]:
"""Get cached item with full metadata for display."""
# Check if content exists
if not self.cache.has_content(cid):
return None
path = self.cache.get_by_cid(cid)
if not path or not path.exists():
return None
# Get metadata from database
# Get metadata from database first
meta = await self.db.load_item_metadata(cid, actor_id)
cache_item = await self.db.get_cache_item(cid)
# Check if content exists locally
path = self.cache.get_by_cid(cid) if self.cache.has_content(cid) else None
if path and path.exists():
# Local file exists - detect type from file
media_type = detect_media_type(path)
mime_type = get_mime_type(path)
size = path.stat().st_size
else:
# File not local - check database for type info
# Try to get type from item_types table
media_type = "unknown"
mime_type = "application/octet-stream"
size = 0
if actor_id:
try:
item_types = await self.db.get_item_types(cid, actor_id)
if item_types:
media_type = item_types[0].get("type", "unknown")
if media_type == "video":
mime_type = "video/mp4"
elif media_type == "image":
mime_type = "image/png"
elif media_type == "audio":
mime_type = "audio/mpeg"
except Exception:
pass
# If no local path but we have IPFS CID, content is available remotely
if not cache_item:
return None
result = {
"cid": cid,
"path": str(path),
"path": str(path) if path else None,
"media_type": media_type,
"mime_type": mime_type,
"size": size,
"ipfs_cid": cache_item.get("ipfs_cid") if cache_item else None,
"meta": meta,
"remote_only": path is None or not path.exists(),
}
# Unpack meta fields to top level for template convenience

View File

@@ -13,17 +13,32 @@
<!-- Preview -->
<div class="bg-gray-800 rounded-lg border border-gray-700 mb-6 overflow-hidden">
{% if cache.mime_type and cache.mime_type.startswith('image/') %}
{% if cache.remote_only and cache.ipfs_cid %}
<img src="https://ipfs.io/ipfs/{{ cache.ipfs_cid }}" alt=""
class="w-full max-h-96 object-contain bg-gray-900">
{% else %}
<img src="/cache/{{ cache.cid }}/raw" alt=""
class="w-full max-h-96 object-contain bg-gray-900">
{% endif %}
{% elif cache.mime_type and cache.mime_type.startswith('video/') %}
{% if cache.remote_only and cache.ipfs_cid %}
<video src="https://ipfs.io/ipfs/{{ cache.ipfs_cid }}" controls
class="w-full max-h-96 bg-gray-900">
</video>
{% else %}
<video src="/cache/{{ cache.cid }}/raw" controls
class="w-full max-h-96 bg-gray-900">
</video>
{% endif %}
{% elif cache.mime_type and cache.mime_type.startswith('audio/') %}
<div class="p-8 bg-gray-900">
{% if cache.remote_only and cache.ipfs_cid %}
<audio src="https://ipfs.io/ipfs/{{ cache.ipfs_cid }}" controls class="w-full"></audio>
{% else %}
<audio src="/cache/{{ cache.cid }}/raw" controls class="w-full"></audio>
{% endif %}
</div>
{% elif cache.mime_type == 'application/json' %}

View File

@@ -7,6 +7,7 @@
<script src="https://cdnjs.cloudflare.com/ajax/libs/cytoscape/3.23.0/cytoscape.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/dagre/0.8.5/dagre.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/cytoscape-dagre@2.5.0/cytoscape-dagre.min.js"></script>
<script src="https://cdn.jsdelivr.net/npm/hls.js@1.4.12/dist/hls.min.js"></script>
{% endblock %}
{% block content %}
@@ -73,6 +74,174 @@
</div>
</div>
<!-- Live Stream Player (shown during rendering) -->
{% if run.status == 'rendering' %}
<div id="live-stream-container" class="mb-6 bg-gray-800 rounded-lg p-4">
<div class="flex items-center justify-between mb-4">
<h3 class="text-lg font-semibold flex items-center">
<span class="w-3 h-3 bg-red-500 rounded-full mr-2 animate-pulse"></span>
Live Preview
</h3>
<div id="stream-status" class="text-sm text-gray-400">Connecting...</div>
</div>
<div class="relative bg-black rounded-lg overflow-hidden" style="aspect-ratio: 16/9;">
<video id="live-video" class="w-full h-full" controls autoplay muted playsinline></video>
<div id="stream-loading" class="absolute inset-0 flex items-center justify-center bg-gray-900/80">
<div class="text-center">
<div class="animate-spin w-8 h-8 border-2 border-blue-500 border-t-transparent rounded-full mx-auto mb-2"></div>
<div class="text-gray-400">Waiting for stream...</div>
</div>
</div>
</div>
<div class="mt-2 text-xs text-gray-500">
Stream URL: <code class="bg-gray-900 px-1 rounded">/runs/{{ run.run_id }}/hls/stream.m3u8</code>
</div>
</div>
<script>
(function() {
const video = document.getElementById('live-video');
const statusEl = document.getElementById('stream-status');
const loadingEl = document.getElementById('stream-loading');
const hlsUrl = '/runs/{{ run.run_id }}/hls/stream.m3u8';
let hls = null;
let retryCount = 0;
const maxRetries = 120; // Try for up to 4 minutes
let segmentsLoaded = 0;
function initHls() {
if (Hls.isSupported()) {
hls = new Hls({
// Stability over low latency - buffer more for smoother playback
liveSyncDurationCount: 4, // Stay 4 segments behind live edge
liveMaxLatencyDurationCount: 8, // Max 8 segments behind
liveDurationInfinity: true, // Treat as infinite live stream
// Large buffers to absorb rendering speed variations
maxBufferLength: 60, // Buffer up to 60s ahead
maxMaxBufferLength: 120, // Allow even more if needed
maxBufferSize: 60 * 1024 * 1024, // 60MB buffer
maxBufferHole: 0.5, // Tolerate small gaps
// Back buffer for smooth seeking
backBufferLength: 30,
// Playlist reload settings
manifestLoadingTimeOut: 10000,
manifestLoadingMaxRetry: 4,
levelLoadingTimeOut: 10000,
levelLoadingMaxRetry: 4,
fragLoadingTimeOut: 20000,
fragLoadingMaxRetry: 6,
// Start at lowest quality for faster start
startLevel: 0,
// Enable smooth level switching
abrEwmaDefaultEstimate: 500000,
});
hls.on(Hls.Events.MANIFEST_PARSED, function(event, data) {
loadingEl.classList.add('hidden');
statusEl.textContent = 'Buffering...';
statusEl.classList.remove('text-gray-400');
statusEl.classList.add('text-yellow-400');
video.play().catch(() => {});
});
hls.on(Hls.Events.FRAG_LOADED, function(event, data) {
retryCount = 0;
segmentsLoaded++;
statusEl.textContent = `Playing (${segmentsLoaded} segments)`;
statusEl.classList.remove('text-yellow-400', 'text-gray-400');
statusEl.classList.add('text-green-400');
});
hls.on(Hls.Events.BUFFER_APPENDED, function() {
// Hide loading once we have buffered content
loadingEl.classList.add('hidden');
});
hls.on(Hls.Events.ERROR, function(event, data) {
console.log('HLS error:', data.type, data.details, data.fatal);
if (data.fatal) {
switch (data.type) {
case Hls.ErrorTypes.NETWORK_ERROR:
if (retryCount < maxRetries) {
retryCount++;
statusEl.textContent = `Waiting for stream... (${retryCount})`;
statusEl.classList.remove('text-green-400');
statusEl.classList.add('text-yellow-400');
// Exponential backoff with jitter
const delay = Math.min(1000 * Math.pow(1.5, Math.min(retryCount, 6)), 10000);
setTimeout(() => {
hls.loadSource(hlsUrl);
}, delay + Math.random() * 1000);
} else {
statusEl.textContent = 'Stream unavailable';
statusEl.classList.add('text-red-400');
}
break;
case Hls.ErrorTypes.MEDIA_ERROR:
console.log('Media error, attempting recovery');
hls.recoverMediaError();
break;
default:
statusEl.textContent = 'Stream error';
statusEl.classList.add('text-red-400');
break;
}
} else {
// Non-fatal error - just log it
if (data.details === 'bufferStalledError') {
statusEl.textContent = 'Buffering...';
statusEl.classList.remove('text-green-400');
statusEl.classList.add('text-yellow-400');
}
}
});
// Handle video stalls
video.addEventListener('waiting', function() {
statusEl.textContent = 'Buffering...';
statusEl.classList.remove('text-green-400');
statusEl.classList.add('text-yellow-400');
});
video.addEventListener('playing', function() {
statusEl.textContent = `Playing (${segmentsLoaded} segments)`;
statusEl.classList.remove('text-yellow-400');
statusEl.classList.add('text-green-400');
});
hls.loadSource(hlsUrl);
hls.attachMedia(video);
} else if (video.canPlayType('application/vnd.apple.mpegurl')) {
// Native HLS support (Safari)
video.src = hlsUrl;
video.addEventListener('loadedmetadata', function() {
loadingEl.classList.add('hidden');
statusEl.textContent = 'Playing';
video.play().catch(() => {});
});
} else {
statusEl.textContent = 'HLS not supported';
statusEl.classList.add('text-red-400');
}
}
// Start trying to connect
initHls();
// Cleanup on page unload
window.addEventListener('beforeunload', function() {
if (hls) hls.destroy();
});
})();
</script>
{% endif %}
<!-- Tabs -->
<div class="border-b border-gray-700 mb-6">
<nav class="flex space-x-8">

View File

@@ -1811,3 +1811,18 @@ async def delete_friendly_name(actor_id: str, cid: str) -> bool:
actor_id, cid
)
return "DELETE 1" in result
async def update_friendly_name_cid(actor_id: str, old_cid: str, new_cid: str) -> bool:
    """
    Update a friendly name's CID (used when IPFS upload completes).

    This updates the CID from a local SHA256 hash to an IPFS CID,
    ensuring assets can be fetched by remote workers via IPFS.
    """
    query = "UPDATE friendly_names SET cid = $3 WHERE actor_id = $1 AND cid = $2"
    async with pool.acquire() as conn:
        status = await conn.execute(query, actor_id, old_cid, new_cid)
    # asyncpg returns a command tag such as "UPDATE 1" - success means
    # exactly one row was rewritten for this (actor_id, old_cid) pair.
    return "UPDATE 1" in status

View File

@@ -3,6 +3,10 @@ version: "3.8"
services:
redis:
image: redis:7-alpine
ports:
- target: 6379
published: 16379
mode: host # Bypass swarm routing mesh
volumes:
- redis_data:/data
networks:
@@ -11,6 +15,9 @@ services:
replicas: 1
restart_policy:
condition: on-failure
placement:
constraints:
- node.labels.gpu != true
postgres:
image: postgres:16-alpine
@@ -18,6 +25,10 @@ services:
- POSTGRES_USER=artdag
- POSTGRES_PASSWORD=artdag
- POSTGRES_DB=artdag
ports:
- target: 5432
published: 15432
mode: host # Expose for GPU worker on different VPC
volumes:
- postgres_data:/var/lib/postgresql/data
networks:
@@ -26,12 +37,18 @@ services:
replicas: 1
restart_policy:
condition: on-failure
placement:
constraints:
- node.labels.gpu != true
ipfs:
image: ipfs/kubo:latest
ports:
- "4001:4001" # Swarm TCP
- "4001:4001/udp" # Swarm UDP
- target: 5001
published: 15001
mode: host # API port for GPU worker on different VPC
volumes:
- ipfs_data:/data/ipfs
- l1_cache:/data/cache:ro # Read-only access to cache for adding files
@@ -42,6 +59,9 @@ services:
replicas: 1
restart_policy:
condition: on-failure
placement:
constraints:
- node.labels.gpu != true
l1-server:
image: git.rose-ash.com/art-dag/l1-server:latest
@@ -75,6 +95,9 @@ services:
replicas: 1
restart_policy:
condition: on-failure
placement:
constraints:
- node.labels.gpu != true
l1-worker:
image: git.rose-ash.com/art-dag/l1-server:latest
@@ -102,6 +125,9 @@ services:
replicas: 2
restart_policy:
condition: on-failure
placement:
constraints:
- node.labels.gpu != true
flower:
image: mher/flower:2.0
@@ -118,12 +144,60 @@ services:
replicas: 1
restart_policy:
condition: on-failure
placement:
constraints:
- node.labels.gpu != true
# GPU worker for streaming/rendering tasks
# Build: docker build -f Dockerfile.gpu -t git.rose-ash.com/art-dag/l1-gpu-server:latest .
# Requires: docker node update --label-add gpu=true <gpu-node-name>
l1-gpu-worker:
image: git.rose-ash.com/art-dag/l1-gpu-server:latest
# For local dev, uncomment to build from Dockerfile.gpu:
# build:
# context: .
# dockerfile: Dockerfile.gpu
command: sh -c "cd /app && celery -A celery_app worker --loglevel=info -E -Q gpu,celery"
environment:
# GPU node is on different VPC - use public IPs for cross-node communication
- REDIS_URL=redis://138.68.142.139:16379/5
- DATABASE_URL=postgresql://artdag:artdag@138.68.142.139:15432/artdag
# Connect to shared IPFS node on CPU (via public IP)
- IPFS_API=/ip4/138.68.142.139/tcp/15001
# Gateway fallback for resilience
- IPFS_GATEWAYS=https://ipfs.io,https://cloudflare-ipfs.com,https://dweb.link
# Local cache is ephemeral (tmpfs or local volume)
- CACHE_DIR=/data/cache
- C_FORCE_ROOT=true
- ARTDAG_CLUSTER_KEY=${ARTDAG_CLUSTER_KEY:-}
# GPU acceleration settings
- NVIDIA_VISIBLE_DEVICES=all
# Keep frames on GPU between operations for maximum performance
- STREAMING_GPU_PERSIST=1
volumes:
# Local cache - ephemeral, just for working files
- gpu_cache:/data/cache
# Note: No source mount - GPU worker uses code from image
depends_on:
- redis
- postgres
- ipfs
networks:
- celery
deploy:
replicas: 1
restart_policy:
condition: on-failure
placement:
constraints:
- node.labels.gpu == true
volumes:
redis_data:
postgres_data:
ipfs_data:
l1_cache:
gpu_cache: # Ephemeral cache for GPU workers
networks:
celery:

View File

@@ -22,6 +22,16 @@ IPFS_API = os.getenv("IPFS_API", "/ip4/127.0.0.1/tcp/5001")
# Connection timeout in seconds (increased for large files)
IPFS_TIMEOUT = int(os.getenv("IPFS_TIMEOUT", "120"))
# IPFS gateway URLs for fallback when local node doesn't have content
# Comma-separated list of gateway URLs (without /ipfs/ suffix)
IPFS_GATEWAYS = [g.strip() for g in os.getenv(
"IPFS_GATEWAYS",
"https://ipfs.io,https://cloudflare-ipfs.com,https://dweb.link"
).split(",") if g.strip()]
# Gateway timeout (shorter than API timeout for faster fallback)
GATEWAY_TIMEOUT = int(os.getenv("GATEWAY_TIMEOUT", "30"))
def _multiaddr_to_url(multiaddr: str) -> str:
"""Convert IPFS multiaddr to HTTP URL."""
@@ -165,16 +175,50 @@ def get_file(cid: str, dest_path: Union[Path, str]) -> bool:
return False
def get_bytes(cid: str) -> Optional[bytes]:
def get_bytes_from_gateway(cid: str) -> Optional[bytes]:
    """
    Retrieve bytes from IPFS via public gateways (fallback).

    Tries each configured gateway in order until one succeeds.

    Args:
        cid: IPFS CID to retrieve

    Returns:
        File content as bytes or None if all gateways fail
    """
    for gateway in IPFS_GATEWAYS:
        url = f"{gateway}/ipfs/{cid}"
        try:
            logger.info(f"Trying gateway: {url}")
            response = requests.get(url, timeout=GATEWAY_TIMEOUT)
            response.raise_for_status()
            data = response.content
        except Exception as e:
            # Any failure (connect, timeout, HTTP error) just moves us on
            # to the next gateway in the list.
            logger.warning(f"Gateway {gateway} failed for {cid}: {e}")
            continue
        logger.info(f"Retrieved from gateway {gateway}: {cid} ({len(data)} bytes)")
        return data
    logger.error(f"All gateways failed for {cid}")
    return None
def get_bytes(cid: str, use_gateway_fallback: bool = True) -> Optional[bytes]:
"""
Retrieve bytes data from IPFS.
Tries local IPFS node first, then falls back to public gateways
if configured and use_gateway_fallback is True.
Args:
cid: IPFS CID to retrieve
use_gateway_fallback: If True, try public gateways on local failure
Returns:
File content as bytes or None on failure
"""
# Try local IPFS node first
try:
url = f"{IPFS_BASE_URL}/api/v0/cat"
params = {"arg": cid}
@@ -186,6 +230,13 @@ def get_bytes(cid: str) -> Optional[bytes]:
logger.info(f"Retrieved from IPFS: {cid} ({len(data)} bytes)")
return data
except Exception as e:
logger.warning(f"Local IPFS failed for {cid}: {e}")
# Try gateway fallback
if use_gateway_fallback and IPFS_GATEWAYS:
logger.info(f"Trying gateway fallback for {cid}")
return get_bytes_from_gateway(cid)
logger.error(f"Failed to get bytes from IPFS: {e}")
return None

77
scripts/cloud-init-gpu.sh Normal file
View File

@@ -0,0 +1,77 @@
#!/bin/bash
# Cloud-init startup script for GPU droplet (RTX 6000 Ada, etc.)
# Paste this into DigitalOcean "User data" field when creating droplet
set -e
export DEBIAN_FRONTEND=noninteractive
# All output goes to a log file so first-boot progress can be inspected.
exec > /var/log/artdag-setup.log 2>&1

echo "=== ArtDAG GPU Setup Started $(date) ==="

# Update system (non-interactive, keep existing configs)
apt-get update
apt-get -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" upgrade

# Install essentials
# (vulkan-tools provides vulkaninfo for the wgpu backend check below)
apt-get install -y \
    python3 python3-venv python3-pip \
    git curl wget \
    ffmpeg \
    vulkan-tools \
    build-essential

# Create venv
VENV_DIR="/opt/artdag-gpu"
python3 -m venv "$VENV_DIR"
source "$VENV_DIR/bin/activate"

# Install Python packages
pip install --upgrade pip
pip install \
    numpy \
    opencv-python-headless \
    wgpu \
    httpx \
    pyyaml \
    celery[redis] \
    fastapi \
    uvicorn \
    asyncpg

# Create code directory
# (code itself is pushed later by scripts/deploy-to-gpu.sh)
mkdir -p "$VENV_DIR/celery/sexp_effects/effects"
mkdir -p "$VENV_DIR/celery/sexp_effects/primitive_libs"
mkdir -p "$VENV_DIR/celery/streaming"

# Add SSH key for easier access (optional - add your key here)
# echo "ssh-ed25519 AAAA... your-key" >> /root/.ssh/authorized_keys

# Test GPU
# These checks are informational only - failures don't abort setup.
echo "=== GPU Info ==="
nvidia-smi || echo "nvidia-smi not available yet"
echo "=== NVENC Check ==="
ffmpeg -encoders 2>/dev/null | grep -E "nvenc|cuda" || echo "NVENC not detected"
echo "=== wgpu Check ==="
"$VENV_DIR/bin/python3" -c "
import wgpu
try:
    adapter = wgpu.gpu.request_adapter_sync(power_preference='high-performance')
    print(f'GPU: {adapter.info}')
except Exception as e:
    print(f'wgpu error: {e}')
" || echo "wgpu test failed"

# Add environment setup
# Vulkan is forced as the wgpu backend for headless GPU compute.
cat >> /etc/profile.d/artdag-gpu.sh << 'ENVEOF'
export WGPU_BACKEND_TYPE=Vulkan
export PATH="/opt/artdag-gpu/bin:$PATH"
ENVEOF

# Mark setup complete
touch /opt/artdag-gpu/.setup-complete
echo "=== Setup Complete $(date) ==="
echo "Venv: /opt/artdag-gpu"
echo "Activate: source /opt/artdag-gpu/bin/activate"
echo "Vulkan: export WGPU_BACKEND_TYPE=Vulkan"

51
scripts/deploy-to-gpu.sh Executable file
View File

@@ -0,0 +1,51 @@
#!/bin/bash
# Deploy art-dag GPU code to a remote droplet
# Usage: ./deploy-to-gpu.sh <droplet-ip>
set -e

if [ -z "$1" ]; then
    echo "Usage: $0 <droplet-ip>"
    echo "Example: $0 159.223.7.100"
    exit 1
fi

DROPLET_IP="$1"
REMOTE_DIR="/opt/artdag-gpu/celery"
# Repo root relative to this script's location.
LOCAL_DIR="$(dirname "$0")/.."

echo "=== Deploying to $DROPLET_IP ==="

# Create remote directory
echo "[1/4] Creating remote directory..."
ssh "root@$DROPLET_IP" "mkdir -p $REMOTE_DIR/sexp_effects $REMOTE_DIR/streaming $REMOTE_DIR/scripts"

# Copy core files
echo "[2/4] Copying core files..."
scp "$LOCAL_DIR/sexp_effects/wgsl_compiler.py" "root@$DROPLET_IP:$REMOTE_DIR/sexp_effects/"
scp "$LOCAL_DIR/sexp_effects/parser.py" "root@$DROPLET_IP:$REMOTE_DIR/sexp_effects/"
scp "$LOCAL_DIR/sexp_effects/interpreter.py" "root@$DROPLET_IP:$REMOTE_DIR/sexp_effects/"
scp "$LOCAL_DIR/sexp_effects/__init__.py" "root@$DROPLET_IP:$REMOTE_DIR/sexp_effects/"
scp "$LOCAL_DIR/streaming/backends.py" "root@$DROPLET_IP:$REMOTE_DIR/streaming/"

# Copy effects
echo "[3/4] Copying effects..."
ssh "root@$DROPLET_IP" "mkdir -p $REMOTE_DIR/sexp_effects/effects $REMOTE_DIR/sexp_effects/primitive_libs"
# `|| true` keeps `set -e` from aborting when a glob matches no files.
scp -r "$LOCAL_DIR/sexp_effects/effects/"*.sexp "root@$DROPLET_IP:$REMOTE_DIR/sexp_effects/effects/" 2>/dev/null || true
scp -r "$LOCAL_DIR/sexp_effects/primitive_libs/"*.py "root@$DROPLET_IP:$REMOTE_DIR/sexp_effects/primitive_libs/" 2>/dev/null || true

# Test
# Smoke-test: compile one effect remotely using the droplet's venv python.
echo "[4/4] Testing deployment..."
ssh "root@$DROPLET_IP" "cd $REMOTE_DIR && /opt/artdag-gpu/bin/python3 -c '
import sys
sys.path.insert(0, \".\")
from sexp_effects.wgsl_compiler import compile_effect_file
result = compile_effect_file(\"sexp_effects/effects/invert.sexp\")
print(f\"Compiled effect: {result.name}\")
print(\"Deployment OK\")
'" || echo "Test failed - may need to run setup script first"

echo ""
echo "=== Deployment complete ==="
echo "SSH: ssh root@$DROPLET_IP"
echo "Test: ssh root@$DROPLET_IP 'cd $REMOTE_DIR && /opt/artdag-gpu/bin/python3 -c \"from streaming.backends import get_backend; b=get_backend(\\\"wgpu\\\"); print(b)\"'"

108
scripts/setup-gpu-droplet.sh Executable file
View File

@@ -0,0 +1,108 @@
#!/bin/bash
# Setup script for GPU droplet with NVENC support
# Run as root on a fresh Ubuntu droplet with NVIDIA GPU
set -e

echo "=== ArtDAG GPU Droplet Setup ==="

# 1. System updates
echo "[1/7] Updating system..."
apt-get update
apt-get upgrade -y

# 2. Install NVIDIA drivers (if not already installed)
# A reboot is required after fresh driver install, so the script exits
# cleanly and expects to be re-run afterwards.
echo "[2/7] Checking NVIDIA drivers..."
if ! command -v nvidia-smi &> /dev/null; then
    echo "Installing NVIDIA drivers..."
    apt-get install -y nvidia-driver-535 nvidia-utils-535
    echo "NVIDIA drivers installed. Reboot required."
    echo "After reboot, run this script again."
    exit 0
fi
nvidia-smi
echo "NVIDIA drivers OK"

# 3. Install FFmpeg with NVENC support
echo "[3/7] Installing FFmpeg with NVENC..."
apt-get install -y ffmpeg
# Verify NVENC
if ffmpeg -encoders 2>/dev/null | grep -q nvenc; then
    echo "NVENC available:"
    ffmpeg -encoders 2>/dev/null | grep nvenc
else
    echo "WARNING: NVENC not available. GPU may not support hardware encoding."
fi

# 4. Install Python and create venv
echo "[4/7] Setting up Python environment..."
apt-get install -y python3 python3-venv python3-pip git
VENV_DIR="/opt/artdag-gpu"
python3 -m venv "$VENV_DIR"
source "$VENV_DIR/bin/activate"

# 5. Install Python dependencies
echo "[5/7] Installing Python packages..."
pip install --upgrade pip
pip install \
    numpy \
    opencv-python-headless \
    wgpu \
    httpx \
    pyyaml \
    celery[redis] \
    fastapi \
    uvicorn

# 6. Clone/update art-dag code
echo "[6/7] Setting up art-dag code..."
ARTDAG_DIR="$VENV_DIR/celery"
if [ -d "$ARTDAG_DIR" ]; then
    echo "Updating existing code..."
    cd "$ARTDAG_DIR"
    # Best-effort: a dirty checkout shouldn't abort the whole setup.
    git pull || true
else
    echo "Cloning art-dag..."
    git clone https://git.rose-ash.com/art-dag/celery.git "$ARTDAG_DIR" || {
        echo "Git clone failed. You may need to copy code manually."
    }
fi

# 7. Test GPU compute
# Verifies both wgpu device creation and NVENC encoder availability.
echo "[7/7] Testing GPU compute..."
"$VENV_DIR/bin/python3" << 'PYTEST'
import sys
try:
    import wgpu
    adapter = wgpu.gpu.request_adapter_sync(power_preference="high-performance")
    print(f"GPU Adapter: {adapter.info.get('device', 'unknown')}")
    device = adapter.request_device_sync()
    print("wgpu device created successfully")
    # Check for NVENC via FFmpeg
    import subprocess
    result = subprocess.run(['ffmpeg', '-encoders'], capture_output=True, text=True)
    if 'h264_nvenc' in result.stdout:
        print("NVENC H.264 encoder: AVAILABLE")
    else:
        print("NVENC H.264 encoder: NOT AVAILABLE")
    if 'hevc_nvenc' in result.stdout:
        print("NVENC HEVC encoder: AVAILABLE")
    else:
        print("NVENC HEVC encoder: NOT AVAILABLE")
except Exception as e:
    print(f"Error: {e}")
    sys.exit(1)
PYTEST

echo ""
echo "=== Setup Complete ==="
echo "Venv: $VENV_DIR"
echo "Code: $ARTDAG_DIR"
echo ""
echo "To activate: source $VENV_DIR/bin/activate"
echo "To test: cd $ARTDAG_DIR && python -c 'from streaming.backends import get_backend; print(get_backend(\"wgpu\"))'"

View File

@@ -385,9 +385,9 @@ def _serialize_pretty(expr: List, indent: int) -> str:
def parse_file(path: str) -> Any:
"""Parse an S-expression file."""
"""Parse an S-expression file (supports multiple top-level expressions)."""
with open(path, 'r') as f:
return parse(f.read())
return parse_all(f.read())
def to_sexp(obj: Any) -> str:

View File

@@ -0,0 +1,220 @@
"""
GPU-Accelerated Blending Primitives Library

Uses CuPy for CUDA-accelerated image blending and compositing.
Keeps frames on GPU when STREAMING_GPU_PERSIST=1 for maximum performance.
"""
import os

import numpy as np

# Try to import CuPy for GPU acceleration
try:
    import cupy as cp
    GPU_AVAILABLE = True
    print("[blending_gpu] CuPy GPU acceleration enabled")
except ImportError:
    # Alias cupy to numpy so later `cp.*` calls still work on CPU-only hosts.
    cp = np
    GPU_AVAILABLE = False
    print("[blending_gpu] CuPy not available, using CPU fallback")

# GPU persistence mode - keep frames on GPU between operations
# Defaults ON; set STREAMING_GPU_PERSIST=0 to download results after each op.
GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "1") == "1"
if GPU_AVAILABLE and GPU_PERSIST:
    print("[blending_gpu] GPU persistence enabled - frames stay on GPU")
def _to_gpu(img):
    """Upload an array to GPU memory when CuPy is present; no-op otherwise."""
    if not GPU_AVAILABLE or isinstance(img, cp.ndarray):
        return img
    return cp.asarray(img)
def _to_cpu(img):
    """Download a CuPy array to host memory unless GPU persistence keeps it on GPU."""
    must_download = GPU_AVAILABLE and not GPU_PERSIST
    if must_download and isinstance(img, cp.ndarray):
        return cp.asnumpy(img)
    return img
def _get_xp(img):
    """Return the array namespace (cupy or numpy) matching where *img* lives."""
    on_gpu = GPU_AVAILABLE and isinstance(img, cp.ndarray)
    return cp if on_gpu else np
def prim_blend_images(a, b, alpha):
    """Linear crossfade of two images: a * (1 - alpha) + b * alpha.

    *alpha* is clamped to [0, 1] before blending.
    """
    weight = min(1.0, max(0.0, float(alpha)))
    if not GPU_AVAILABLE:
        mixed = a.astype(float) * (1 - weight) + b.astype(float) * weight
        return mixed.astype(np.uint8)
    a_f = _to_gpu(a).astype(cp.float32)
    b_f = _to_gpu(b).astype(cp.float32)
    mixed = (a_f * (1 - weight) + b_f * weight).astype(cp.uint8)
    return _to_cpu(mixed)
def prim_blend_mode(a, b, mode):
    """Blend using Photoshop-style blend modes.

    Args:
        a: Base image (uint8, CPU or GPU array).
        b: Blend layer (uint8, same shape as a).
        mode: Mode name ("multiply", "screen", "overlay", "soft-light",
            "hard-light", "color-dodge", "color-burn", "difference",
            "exclusion", "add", "subtract", "darken", "lighten").
            Unknown modes act as "normal" and simply return b.

    Returns:
        Blended uint8 image; stays on GPU when persistence mode is on.
    """
    # Normalize both inputs to float in [0, 1] on the appropriate device.
    if GPU_AVAILABLE:
        a_gpu = _to_gpu(a).astype(cp.float32) / 255
        b_gpu = _to_gpu(b).astype(cp.float32) / 255
        xp = cp
    else:
        a_gpu = a.astype(float) / 255
        b_gpu = b.astype(float) / 255
        xp = np
    if mode == "multiply":
        result = a_gpu * b_gpu
    elif mode == "screen":
        result = 1 - (1 - a_gpu) * (1 - b_gpu)
    elif mode == "overlay":
        # Multiply where the base is dark, screen where it is bright.
        mask = a_gpu < 0.5
        result = xp.where(mask, 2 * a_gpu * b_gpu, 1 - 2 * (1 - a_gpu) * (1 - b_gpu))
    elif mode == "soft-light":
        mask = b_gpu < 0.5
        result = xp.where(mask,
                          a_gpu - (1 - 2 * b_gpu) * a_gpu * (1 - a_gpu),
                          a_gpu + (2 * b_gpu - 1) * (xp.sqrt(a_gpu) - a_gpu))
    elif mode == "hard-light":
        # Overlay with the roles of base and blend layer swapped.
        mask = b_gpu < 0.5
        result = xp.where(mask, 2 * a_gpu * b_gpu, 1 - 2 * (1 - a_gpu) * (1 - b_gpu))
    elif mode == "color-dodge":
        # +0.001 guards against division by zero when b is pure white.
        result = xp.clip(a_gpu / (1 - b_gpu + 0.001), 0, 1)
    elif mode == "color-burn":
        result = 1 - xp.clip((1 - a_gpu) / (b_gpu + 0.001), 0, 1)
    elif mode == "difference":
        result = xp.abs(a_gpu - b_gpu)
    elif mode == "exclusion":
        result = a_gpu + b_gpu - 2 * a_gpu * b_gpu
    elif mode == "add":
        result = xp.clip(a_gpu + b_gpu, 0, 1)
    elif mode == "subtract":
        result = xp.clip(a_gpu - b_gpu, 0, 1)
    elif mode == "darken":
        result = xp.minimum(a_gpu, b_gpu)
    elif mode == "lighten":
        result = xp.maximum(a_gpu, b_gpu)
    else:
        # Default to normal (just return b)
        result = b_gpu
    result = (result * 255).astype(xp.uint8)
    return _to_cpu(result)
def prim_mask(img, mask_img):
    """Apply grayscale mask to image (white=opaque, black=transparent).

    Accepts a 2-D mask or a 3-D one (only the first channel is used).
    """
    if GPU_AVAILABLE:
        img_gpu = _to_gpu(img)
        mask_gpu = _to_gpu(mask_img)
        if len(mask_gpu.shape) == 3:
            mask = mask_gpu[:, :, 0].astype(cp.float32) / 255
        else:
            mask = mask_gpu.astype(cp.float32) / 255
        # Broadcast the 2-D mask across the image's color channels.
        mask = mask[:, :, cp.newaxis]
        result = (img_gpu.astype(cp.float32) * mask).astype(cp.uint8)
        return _to_cpu(result)
    # CPU fallback: identical math with numpy.
    if len(mask_img.shape) == 3:
        mask = mask_img[:, :, 0].astype(float) / 255
    else:
        mask = mask_img.astype(float) / 255
    mask = mask[:, :, np.newaxis]
    return (img.astype(float) * mask).astype(np.uint8)
def prim_alpha_composite(base, overlay, alpha_channel):
    """Composite overlay onto base using alpha channel.

    Args:
        base: Background image.
        overlay: Foreground image, same shape as base.
        alpha_channel: Per-pixel opacity, 2-D or 3-D (first channel used);
            0 keeps the base, 255 shows the overlay fully.
    """
    if GPU_AVAILABLE:
        base_gpu = _to_gpu(base)
        overlay_gpu = _to_gpu(overlay)
        alpha_gpu = _to_gpu(alpha_channel)
        if len(alpha_gpu.shape) == 3:
            alpha = alpha_gpu[:, :, 0].astype(cp.float32) / 255
        else:
            alpha = alpha_gpu.astype(cp.float32) / 255
        # Broadcast the 2-D alpha over the color channels.
        alpha = alpha[:, :, cp.newaxis]
        result = base_gpu.astype(cp.float32) * (1 - alpha) + overlay_gpu.astype(cp.float32) * alpha
        return _to_cpu(result.astype(cp.uint8))
    # CPU fallback: identical math with numpy.
    if len(alpha_channel.shape) == 3:
        alpha = alpha_channel[:, :, 0].astype(float) / 255
    else:
        alpha = alpha_channel.astype(float) / 255
    alpha = alpha[:, :, np.newaxis]
    result = base.astype(float) * (1 - alpha) + overlay.astype(float) * alpha
    return result.astype(np.uint8)
def prim_overlay(base, overlay, x, y, alpha=1.0):
    """Overlay image at position (x, y) with optional alpha.

    The overlay is clipped against the base's bounds, so (x, y) may be
    negative or place the overlay partially off-screen; out-of-bounds
    regions are simply not drawn.
    """
    if GPU_AVAILABLE:
        base_gpu = _to_gpu(base)
        overlay_gpu = _to_gpu(overlay)
        result = base_gpu.copy()
        x, y = int(x), int(y)
        oh, ow = overlay_gpu.shape[:2]
        bh, bw = base_gpu.shape[:2]
        # Clip to bounds: (sx*, sy*) index into the overlay, (dx*, dy*) into the base.
        sx1 = max(0, -x)
        sy1 = max(0, -y)
        dx1 = max(0, x)
        dy1 = max(0, y)
        sx2 = min(ow, bw - x)
        sy2 = min(oh, bh - y)
        if sx2 > sx1 and sy2 > sy1:
            src = overlay_gpu[sy1:sy2, sx1:sx2]
            dst = result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)]
            blended = (dst.astype(cp.float32) * (1 - alpha) + src.astype(cp.float32) * alpha)
            result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)] = blended.astype(cp.uint8)
        return _to_cpu(result)
    # CPU fallback: same clipping and blend with numpy.
    result = base.copy()
    x, y = int(x), int(y)
    oh, ow = overlay.shape[:2]
    bh, bw = base.shape[:2]
    # Clip to bounds
    sx1 = max(0, -x)
    sy1 = max(0, -y)
    dx1 = max(0, x)
    dy1 = max(0, y)
    sx2 = min(ow, bw - x)
    sy2 = min(oh, bh - y)
    if sx2 > sx1 and sy2 > sy1:
        src = overlay[sy1:sy2, sx1:sx2]
        dst = result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)]
        blended = (dst.astype(float) * (1 - alpha) + src.astype(float) * alpha)
        result[dy1:dy1+(sy2-sy1), dx1:dx1+(sx2-sx1)] = blended.astype(np.uint8)
    return result
# Registry mapping s-expression primitive names to their implementations.
PRIMITIVES = {
    # Basic blending
    'blend-images': prim_blend_images,
    'blend-mode': prim_blend_mode,
    # Masking
    'mask': prim_mask,
    'alpha-composite': prim_alpha_composite,
    # Overlay
    'overlay': prim_overlay,
}

View File

@@ -0,0 +1,280 @@
"""
GPU-Accelerated Color Operations Library
Uses CuPy for CUDA-accelerated color transforms.
Performance Mode:
- Set STREAMING_GPU_PERSIST=1 to keep frames on GPU between operations
- This dramatically improves performance by avoiding CPU<->GPU transfers
"""
import os
import numpy as np
# Try to import CuPy for GPU acceleration
try:
import cupy as cp
GPU_AVAILABLE = True
print("[color_ops_gpu] CuPy GPU acceleration enabled")
except ImportError:
cp = np
GPU_AVAILABLE = False
print("[color_ops_gpu] CuPy not available, using CPU fallback")
# GPU persistence mode - keep frames on GPU between operations
GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "1") == "1"
if GPU_AVAILABLE and GPU_PERSIST:
print("[color_ops_gpu] GPU persistence enabled - frames stay on GPU")
def _to_gpu(img):
    """Ensure *img* lives in GPU memory when CuPy is available."""
    if GPU_AVAILABLE:
        return img if isinstance(img, cp.ndarray) else cp.asarray(img)
    return img
def _to_cpu(img):
    """Bring a GPU array back to host memory, unless persistence keeps it on GPU."""
    if GPU_PERSIST or not GPU_AVAILABLE:
        return img
    return cp.asnumpy(img) if isinstance(img, cp.ndarray) else img
def prim_invert(img):
    """Invert image colors (255 - value per channel)."""
    if not GPU_AVAILABLE:
        return 255 - img
    return _to_cpu(255 - _to_gpu(img))
def prim_grayscale(img):
    """Convert to grayscale.

    Uses BT.601 luminance weights (0.299/0.587/0.114) and replicates the
    result into 3 channels so the output stays shape-compatible with color
    frames. Non-3D inputs pass through unchanged.
    """
    if img.ndim != 3:
        return img
    if GPU_AVAILABLE:
        img_gpu = _to_gpu(img.astype(np.float32))
        # Standard luminance weights
        gray = 0.299 * img_gpu[:, :, 0] + 0.587 * img_gpu[:, :, 1] + 0.114 * img_gpu[:, :, 2]
        gray = cp.clip(gray, 0, 255).astype(cp.uint8)
        # Stack to 3 channels
        result = cp.stack([gray, gray, gray], axis=2)
        return _to_cpu(result)
    gray = 0.299 * img[:, :, 0] + 0.587 * img[:, :, 1] + 0.114 * img[:, :, 2]
    gray = np.clip(gray, 0, 255).astype(np.uint8)
    return np.stack([gray, gray, gray], axis=2)
def prim_brightness(img, factor=1.0):
    """Scale pixel values by *factor*, clamped to the uint8 range."""
    if not GPU_AVAILABLE:
        return np.clip(img.astype(np.float32) * factor, 0, 255).astype(np.uint8)
    scaled = _to_gpu(img.astype(np.float32)) * factor
    return _to_cpu(cp.clip(scaled, 0, 255).astype(cp.uint8))
def prim_contrast(img, factor=1.0):
    """Stretch contrast around the 128 midpoint by *factor*."""
    if not GPU_AVAILABLE:
        return np.clip((img.astype(np.float32) - 128) * factor + 128, 0, 255).astype(np.uint8)
    adjusted = (_to_gpu(img.astype(np.float32)) - 128) * factor + 128
    return _to_cpu(cp.clip(adjusted, 0, 255).astype(cp.uint8))
# CUDA kernel for HSV hue shift
# Compiled once at import time (GPU builds only). Each thread processes one
# pixel in place: RGB -> HSV, rotate hue by `shift` degrees, HSV -> RGB.
if GPU_AVAILABLE:
    _hue_shift_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void hue_shift(unsigned char* img, int width, int height, float shift) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;
        if (x >= width || y >= height) return;
        int idx = (y * width + x) * 3;
        // Get RGB
        float r = img[idx] / 255.0f;
        float g = img[idx + 1] / 255.0f;
        float b = img[idx + 2] / 255.0f;
        // RGB to HSV
        float max_c = fmaxf(r, fmaxf(g, b));
        float min_c = fminf(r, fminf(g, b));
        float delta = max_c - min_c;
        float h = 0.0f, s = 0.0f, v = max_c;
        if (delta > 0.00001f) {
            s = delta / max_c;
            if (max_c == r) {
                h = 60.0f * fmodf((g - b) / delta, 6.0f);
            } else if (max_c == g) {
                h = 60.0f * ((b - r) / delta + 2.0f);
            } else {
                h = 60.0f * ((r - g) / delta + 4.0f);
            }
            if (h < 0) h += 360.0f;
        }
        // Shift hue
        h = fmodf(h + shift, 360.0f);
        if (h < 0) h += 360.0f;
        // HSV to RGB
        float c = v * s;
        float x_val = c * (1.0f - fabsf(fmodf(h / 60.0f, 2.0f) - 1.0f));
        float m = v - c;
        float r_out, g_out, b_out;
        if (h < 60) {
            r_out = c; g_out = x_val; b_out = 0;
        } else if (h < 120) {
            r_out = x_val; g_out = c; b_out = 0;
        } else if (h < 180) {
            r_out = 0; g_out = c; b_out = x_val;
        } else if (h < 240) {
            r_out = 0; g_out = x_val; b_out = c;
        } else if (h < 300) {
            r_out = x_val; g_out = 0; b_out = c;
        } else {
            r_out = c; g_out = 0; b_out = x_val;
        }
        img[idx] = (unsigned char)fminf(255.0f, (r_out + m) * 255.0f);
        img[idx + 1] = (unsigned char)fminf(255.0f, (g_out + m) * 255.0f);
        img[idx + 2] = (unsigned char)fminf(255.0f, (b_out + m) * 255.0f);
    }
    ''', 'hue_shift')
def prim_hue_shift(img, shift=0.0):
    """Shift hue by degrees.

    The CPU fallback uses OpenCV's HSV space, where H spans [0, 180), hence
    the shift / 2. The GPU path runs the custom CUDA kernel on a copy so the
    caller's frame is not mutated. Non-RGB inputs pass through unchanged.
    """
    if img.ndim != 3 or img.shape[2] != 3:
        return img
    if not GPU_AVAILABLE:
        import cv2
        hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
        hsv[:, :, 0] = (hsv[:, :, 0].astype(np.float32) + shift / 2) % 180
        return cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)
    h, w = img.shape[:2]
    # Copy: the kernel mutates its input buffer in place.
    img_gpu = _to_gpu(img.astype(np.uint8)).copy()
    block = (16, 16)
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
    _hue_shift_kernel(grid, block, (img_gpu, np.int32(w), np.int32(h), np.float32(shift)))
    return _to_cpu(img_gpu)
def prim_saturate(img, factor=1.0):
    """Adjust saturation by factor.

    CPU path scales the HSV S channel via OpenCV. The GPU path instead
    blends toward (factor < 1) or extrapolates away from (factor >= 1) the
    luminance image — an approximation that skips the HSV round trip, so
    GPU and CPU results are close but not bit-identical.
    """
    if img.ndim != 3:
        return img
    if not GPU_AVAILABLE:
        import cv2
        hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV).astype(np.float32)
        hsv[:, :, 1] = np.clip(hsv[:, :, 1] * factor, 0, 255)
        return cv2.cvtColor(hsv.astype(np.uint8), cv2.COLOR_HSV2RGB)
    # GPU version - simple desaturation blend
    img_gpu = _to_gpu(img.astype(np.float32))
    gray = 0.299 * img_gpu[:, :, 0] + 0.587 * img_gpu[:, :, 1] + 0.114 * img_gpu[:, :, 2]
    gray = gray[:, :, cp.newaxis]
    if factor < 1.0:
        # Desaturate: blend toward gray
        result = img_gpu * factor + gray * (1 - factor)
    else:
        # Oversaturate: extrapolate away from gray
        result = gray + (img_gpu - gray) * factor
    result = cp.clip(result, 0, 255).astype(cp.uint8)
    return _to_cpu(result)
def prim_blend(img1, img2, alpha=0.5):
    """Crossfade two images: img1 * (1 - alpha) + img2 * alpha."""
    if not GPU_AVAILABLE:
        mixed = img1.astype(np.float32) * (1 - alpha) + img2.astype(np.float32) * alpha
        return np.clip(mixed, 0, 255).astype(np.uint8)
    a = _to_gpu(img1.astype(np.float32))
    b = _to_gpu(img2.astype(np.float32))
    mixed = cp.clip(a * (1 - alpha) + b * alpha, 0, 255).astype(cp.uint8)
    return _to_cpu(mixed)
def prim_add(img1, img2):
    """Add two images, clamped to uint8 range (int16 math avoids overflow)."""
    if not GPU_AVAILABLE:
        return np.clip(img1.astype(np.int16) + img2.astype(np.int16), 0, 255).astype(np.uint8)
    total = _to_gpu(img1).astype(np.int16) + _to_gpu(img2).astype(np.int16)
    return _to_cpu(cp.clip(total, 0, 255).astype(cp.uint8))
def prim_multiply(img1, img2):
    """Multiply blend: pixelwise product of the two images normalized by 255."""
    if not GPU_AVAILABLE:
        product = (img1.astype(np.float32) * img2.astype(np.float32)) / 255.0
        return np.clip(product, 0, 255).astype(np.uint8)
    product = (_to_gpu(img1).astype(np.float32) * _to_gpu(img2).astype(np.float32)) / 255.0
    return _to_cpu(cp.clip(product, 0, 255).astype(cp.uint8))
def prim_screen(img1, img2):
    """Screen blend mode: 1 - (1 - a)(1 - b) in normalized [0, 1] space."""
    if not GPU_AVAILABLE:
        a = img1.astype(np.float32) / 255.0
        b = img2.astype(np.float32) / 255.0
        return np.clip((1.0 - (1.0 - a) * (1.0 - b)) * 255, 0, 255).astype(np.uint8)
    a = _to_gpu(img1).astype(np.float32) / 255.0
    b = _to_gpu(img2).astype(np.float32) / 255.0
    screened = cp.clip((1.0 - (1.0 - a) * (1.0 - b)) * 255, 0, 255).astype(cp.uint8)
    return _to_cpu(screened)
# Import CPU primitives as fallbacks
def _get_cpu_primitives():
    """Return the CPU color_ops primitive table used as fallback implementations."""
    from sexp_effects.primitive_libs import color_ops as _cpu_color_ops
    return _cpu_color_ops.PRIMITIVES
# Export functions - start with CPU primitives, then override with GPU versions
# (so any primitive not listed below transparently falls back to the CPU impl).
PRIMITIVES = _get_cpu_primitives().copy()
# Override specific primitives with GPU-accelerated versions
PRIMITIVES.update({
    'invert': prim_invert,
    'grayscale': prim_grayscale,
    'brightness': prim_brightness,
    'contrast': prim_contrast,
    'hue-shift': prim_hue_shift,
    'saturate': prim_saturate,
    'blend': prim_blend,
    'add': prim_add,
    'multiply': prim_multiply,
    'screen': prim_screen,
})

View File

@@ -0,0 +1,409 @@
"""
GPU-Accelerated Geometry Primitives Library
Uses CuPy for CUDA-accelerated image transforms.
Falls back to CPU if GPU unavailable.
Performance Mode:
- Set STREAMING_GPU_PERSIST=1 to keep frames on GPU between operations
- This dramatically improves performance by avoiding CPU<->GPU transfers
- Frames only transfer to CPU at final output
"""
import os
import numpy as np
# Try to import CuPy for GPU acceleration
try:
import cupy as cp
from cupyx.scipy import ndimage as cpndimage
GPU_AVAILABLE = True
print("[geometry_gpu] CuPy GPU acceleration enabled")
except ImportError:
cp = np
GPU_AVAILABLE = False
print("[geometry_gpu] CuPy not available, using CPU fallback")
# GPU persistence mode - keep frames on GPU between operations
# Set STREAMING_GPU_PERSIST=1 for maximum performance
GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "1") == "1"
if GPU_AVAILABLE and GPU_PERSIST:
print("[geometry_gpu] GPU persistence enabled - frames stay on GPU")
def _to_gpu(img):
    """Move image to GPU if available.

    No-op when CuPy is missing or the array is already a cupy.ndarray.
    """
    if GPU_AVAILABLE and not isinstance(img, cp.ndarray):
        return cp.asarray(img)
    return img
def _to_cpu(img):
    """Move image back to CPU (only if GPU_PERSIST is disabled).

    With persistence enabled, CuPy arrays are returned unchanged so the
    next primitive can consume them without a device round trip.
    """
    if not GPU_PERSIST and GPU_AVAILABLE and isinstance(img, cp.ndarray):
        return cp.asnumpy(img)
    return img
def _ensure_output_format(img):
    """Ensure output is in correct format based on GPU_PERSIST setting.

    Thin alias for _to_cpu(); presumably kept as a distinct name so
    final-output call sites read explicitly — verify against callers.
    """
    return _to_cpu(img)
def prim_rotate(img, angle, cx=None, cy=None):
    """Rotate image by angle degrees around center (cx, cy).

    Args:
        img: Image array (numpy or cupy).
        angle: Rotation angle in degrees.
        cx, cy: Rotation center; defaults to the image center.

    Returns:
        Rotated image with the original frame size.
    """
    if not GPU_AVAILABLE:
        # Fallback to OpenCV
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = cv2.getRotationMatrix2D((cx, cy), angle, 1.0)
        return cv2.warpAffine(img, M, (w, h))
    img_gpu = _to_gpu(img)
    h, w = img_gpu.shape[:2]
    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2
    # Use cupyx.scipy.ndimage.rotate
    # Note: scipy uses different angle convention
    # NOTE(review): cpndimage.rotate is called without a center, so
    # non-default cx/cy only affect the CPU path — confirm acceptable.
    rotated = cpndimage.rotate(img_gpu, angle, reshape=False, order=1)
    return _to_cpu(rotated)
def prim_scale(img, sx, sy, cx=None, cy=None):
    """Scale image by (sx, sy) around center (cx, cy).

    Output keeps the original frame size: on the GPU path the zoomed image
    is center-cropped (upscale) or zero-padded (downscale).
    """
    if not GPU_AVAILABLE:
        import cv2
        h, w = img.shape[:2]
        if cx is None:
            cx = w / 2
        if cy is None:
            cy = h / 2
        M = np.float32([
            [sx, 0, cx * (1 - sx)],
            [0, sy, cy * (1 - sy)]
        ])
        return cv2.warpAffine(img, M, (w, h))
    img_gpu = _to_gpu(img)
    h, w = img_gpu.shape[:2]
    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2
    # NOTE(review): the GPU path always scales about the image center; the
    # cx/cy computed above are unused here — confirm acceptable.
    # Use cupyx.scipy.ndimage.zoom
    if img_gpu.ndim == 3:
        zoom_factors = (sy, sx, 1)  # Don't zoom color channels
    else:
        zoom_factors = (sy, sx)
    zoomed = cpndimage.zoom(img_gpu, zoom_factors, order=1)
    # Crop/pad to original size
    zh, zw = zoomed.shape[:2]
    result = cp.zeros_like(img_gpu)
    # Calculate offsets
    src_y = max(0, (zh - h) // 2)
    src_x = max(0, (zw - w) // 2)
    dst_y = max(0, (h - zh) // 2)
    dst_x = max(0, (w - zw) // 2)
    copy_h = min(h - dst_y, zh - src_y)
    copy_w = min(w - dst_x, zw - src_x)
    result[dst_y:dst_y+copy_h, dst_x:dst_x+copy_w] = zoomed[src_y:src_y+copy_h, src_x:src_x+copy_w]
    return _to_cpu(result)
def prim_translate(img, dx, dy):
    """Shift the image by (dx, dy) pixels within the same frame size."""
    if not GPU_AVAILABLE:
        import cv2
        h, w = img.shape[:2]
        M = np.float32([[1, 0, dx], [0, 1, dy]])
        return cv2.warpAffine(img, M, (w, h))
    img_gpu = _to_gpu(img)
    # ndimage shift order is (rows, cols[, channels]); leave channels untouched.
    offsets = (dy, dx, 0) if img_gpu.ndim == 3 else (dy, dx)
    return _to_cpu(cpndimage.shift(img_gpu, offsets, order=1))
def prim_flip_h(img):
    """Mirror the image left-right."""
    if not GPU_AVAILABLE:
        return np.flip(img, axis=1)
    return _to_cpu(cp.flip(_to_gpu(img), axis=1))
def prim_flip_v(img):
    """Mirror the image top-bottom."""
    if not GPU_AVAILABLE:
        return np.flip(img, axis=0)
    return _to_cpu(cp.flip(_to_gpu(img), axis=0))
def prim_flip(img, direction="horizontal"):
    """Flip in the given direction: horizontal ('h'), vertical ('v'), or both."""
    if direction in ("horizontal", "h"):
        return prim_flip_h(img)
    if direction in ("vertical", "v"):
        return prim_flip_v(img)
    if direction in ("both", "hv", "vh"):
        if not GPU_AVAILABLE:
            return np.flip(np.flip(img, axis=0), axis=1)
        flipped = cp.flip(cp.flip(_to_gpu(img), axis=0), axis=1)
        return _to_cpu(flipped)
    # Unknown direction: pass through unchanged.
    return img
# CUDA kernel for ripple effect
# Compiled once at import time (GPU builds only). One thread per output pixel:
# displaces the sample position radially by a decaying sine wave and samples
# the source with bilinear interpolation.
if GPU_AVAILABLE:
    _ripple_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void ripple(const unsigned char* src, unsigned char* dst,
                int width, int height, int channels,
                float amplitude, float frequency, float decay,
                float speed, float time, float cx, float cy) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;
        if (x >= width || y >= height) return;
        // Distance from center
        float dx = x - cx;
        float dy = y - cy;
        float dist = sqrtf(dx * dx + dy * dy);
        // Ripple displacement
        float wave = sinf(dist * frequency * 0.1f - time * speed) * amplitude;
        float falloff = expf(-dist * decay * 0.01f);
        float displacement = wave * falloff;
        // Direction from center
        float len = dist + 0.0001f; // Avoid division by zero
        float dir_x = dx / len;
        float dir_y = dy / len;
        // Source coordinates
        float src_x = x - dir_x * displacement;
        float src_y = y - dir_y * displacement;
        // Clamp to bounds
        src_x = fmaxf(0.0f, fminf(width - 1.0f, src_x));
        src_y = fmaxf(0.0f, fminf(height - 1.0f, src_y));
        // Bilinear interpolation
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        int x1 = min(x0 + 1, width - 1);
        int y1 = min(y0 + 1, height - 1);
        float fx = src_x - x0;
        float fy = src_y - y0;
        for (int c = 0; c < channels; c++) {
            float v00 = src[(y0 * width + x0) * channels + c];
            float v10 = src[(y0 * width + x1) * channels + c];
            float v01 = src[(y1 * width + x0) * channels + c];
            float v11 = src[(y1 * width + x1) * channels + c];
            float v0 = v00 * (1 - fx) + v10 * fx;
            float v1 = v01 * (1 - fx) + v11 * fx;
            float val = v0 * (1 - fy) + v1 * fy;
            dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
        }
    }
    ''', 'ripple')
def prim_ripple(img, amplitude=10.0, frequency=8.0, decay=2.0, speed=5.0,
                time=0.0, center_x=None, center_y=None):
    """Apply ripple distortion effect.

    Args:
        img: Input image (2-D grayscale or 3-D color).
        amplitude: Maximum displacement in pixels.
        frequency: Spatial frequency of the wave.
        decay: Exponential falloff with distance from the ripple origin.
        speed: Temporal multiplier combined with *time* to animate the wave.
        time: Animation time.
        center_x, center_y: Ripple origin; defaults to the image center.
    """
    h, w = img.shape[:2]
    channels = img.shape[2] if img.ndim == 3 else 1
    if center_x is None:
        center_x = w / 2
    if center_y is None:
        center_y = h / 2
    if not GPU_AVAILABLE:
        # CPU fallback using coordinate mapping (same math as the CUDA kernel).
        import cv2
        y_coords, x_coords = np.mgrid[0:h, 0:w].astype(np.float32)
        dx = x_coords - center_x
        dy = y_coords - center_y
        dist = np.sqrt(dx**2 + dy**2)
        wave = np.sin(dist * frequency * 0.1 - time * speed) * amplitude
        falloff = np.exp(-dist * decay * 0.01)
        displacement = wave * falloff
        length = dist + 0.0001
        dir_x = dx / length
        dir_y = dy / length
        map_x = (x_coords - dir_x * displacement).astype(np.float32)
        map_y = (y_coords - dir_y * displacement).astype(np.float32)
        return cv2.remap(img, map_x, map_y, cv2.INTER_LINEAR)
    # GPU implementation
    img_gpu = _to_gpu(img.astype(np.uint8))
    if img_gpu.ndim == 2:
        # Kernel expects a channel axis; add one and strip it afterwards.
        img_gpu = img_gpu[:, :, cp.newaxis]
        channels = 1
    dst = cp.zeros_like(img_gpu)
    block = (16, 16)
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
    _ripple_kernel(grid, block, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(amplitude), np.float32(frequency), np.float32(decay),
        np.float32(speed), np.float32(time),
        np.float32(center_x), np.float32(center_y)
    ))
    result = _to_cpu(dst)
    if channels == 1:
        result = result[:, :, 0]
    return result
# CUDA kernel for fast rotation with bilinear interpolation
# Compiled once at import time (GPU builds only). One thread per output pixel:
# applies the inverse rotation about (cx, cy) and bilinearly samples the
# source; pixels mapping outside the source are written as black.
if GPU_AVAILABLE:
    _rotate_kernel = cp.RawKernel(r'''
    extern "C" __global__
    void rotate_img(const unsigned char* src, unsigned char* dst,
                    int width, int height, int channels,
                    float cos_a, float sin_a, float cx, float cy) {
        int x = blockDim.x * blockIdx.x + threadIdx.x;
        int y = blockDim.y * blockIdx.y + threadIdx.y;
        if (x >= width || y >= height) return;
        // Translate to center, rotate, translate back
        float dx = x - cx;
        float dy = y - cy;
        float src_x = cos_a * dx + sin_a * dy + cx;
        float src_y = -sin_a * dx + cos_a * dy + cy;
        // Check bounds
        if (src_x < 0 || src_x >= width - 1 || src_y < 0 || src_y >= height - 1) {
            for (int c = 0; c < channels; c++) {
                dst[(y * width + x) * channels + c] = 0;
            }
            return;
        }
        // Bilinear interpolation
        int x0 = (int)src_x;
        int y0 = (int)src_y;
        int x1 = x0 + 1;
        int y1 = y0 + 1;
        float fx = src_x - x0;
        float fy = src_y - y0;
        for (int c = 0; c < channels; c++) {
            float v00 = src[(y0 * width + x0) * channels + c];
            float v10 = src[(y0 * width + x1) * channels + c];
            float v01 = src[(y1 * width + x0) * channels + c];
            float v11 = src[(y1 * width + x1) * channels + c];
            float v0 = v00 * (1 - fx) + v10 * fx;
            float v1 = v01 * (1 - fx) + v11 * fx;
            float val = v0 * (1 - fy) + v1 * fy;
            dst[(y * width + x) * channels + c] = (unsigned char)fminf(255.0f, fmaxf(0.0f, val));
        }
    }
    ''', 'rotate_img')
def prim_rotate_gpu(img, angle, cx=None, cy=None):
    """Fast GPU rotation using custom CUDA kernel.

    Rotates about (cx, cy) (default: image center) with bilinear sampling;
    pixels sampled from outside the frame become black. Falls back to
    prim_rotate when no GPU is available.
    """
    if not GPU_AVAILABLE:
        return prim_rotate(img, angle, cx, cy)
    h, w = img.shape[:2]
    channels = img.shape[2] if img.ndim == 3 else 1
    if cx is None:
        cx = w / 2
    if cy is None:
        cy = h / 2
    img_gpu = _to_gpu(img.astype(np.uint8))
    if img_gpu.ndim == 2:
        # Kernel expects a channel axis; add one and strip it afterwards.
        img_gpu = img_gpu[:, :, cp.newaxis]
        channels = 1
    dst = cp.zeros_like(img_gpu)
    # Convert angle to radians
    rad = np.radians(angle)
    cos_a = np.cos(rad)
    sin_a = np.sin(rad)
    block = (16, 16)
    grid = ((w + block[0] - 1) // block[0], (h + block[1] - 1) // block[1])
    _rotate_kernel(grid, block, (
        img_gpu, dst,
        np.int32(w), np.int32(h), np.int32(channels),
        np.float32(cos_a), np.float32(sin_a),
        np.float32(cx), np.float32(cy)
    ))
    result = _to_cpu(dst)
    if channels == 1:
        result = result[:, :, 0]
    return result
# Import CPU primitives as fallbacks for functions we don't GPU-accelerate
def _get_cpu_primitives():
    """Get all primitives from CPU geometry module as fallbacks.

    Imported inside the function rather than at module top level; called
    once below when building PRIMITIVES.
    """
    from sexp_effects.primitive_libs import geometry
    return geometry.PRIMITIVES
# Export functions - start with CPU primitives, then override with GPU versions
# (unlisted primitives transparently fall back to the CPU implementations).
PRIMITIVES = _get_cpu_primitives().copy()
# Override specific primitives with GPU-accelerated versions
PRIMITIVES.update({
    'translate': prim_translate,
    'rotate-img': prim_rotate_gpu if GPU_AVAILABLE else prim_rotate,
    'scale-img': prim_scale,
    'flip-h': prim_flip_h,
    'flip-v': prim_flip_v,
    'flip': prim_flip,
    # Note: ripple-displace uses CPU version (different API - returns coords, not image)
})

View File

@@ -8,10 +8,16 @@ import cv2
def prim_width(img):
    """Return the image width in pixels (shape[1]).

    Rejects list/tuple inputs with a descriptive TypeError to surface
    upstream evaluation mistakes early.
    """
    if not isinstance(img, (list, tuple)):
        return img.shape[1]
    raise TypeError(
        f"image:width expects an image array, got {type(img).__name__} with {len(img)} elements"
    )
def prim_height(img):
    """Return the image height in pixels (shape[0]).

    Raises:
        TypeError: If given a list/tuple instead of an image array; the
            message includes the offending value to aid diagnosis.
    """
    if isinstance(img, (list, tuple)):
        # Removed leftover DEBUG print to stderr; the exception message
        # already carries the element types/values needed for diagnosis.
        raise TypeError(f"image:height expects an image array, got {type(img).__name__} with {len(img)} elements: {img}")
    return img.shape[0]

View File

@@ -3,13 +3,52 @@ Streaming primitives for video/audio processing.
These primitives handle video source reading and audio analysis,
keeping the interpreter completely generic.
GPU Acceleration:
- Set STREAMING_GPU_PERSIST=1 to output CuPy arrays (frames stay on GPU)
- Hardware video decoding (NVDEC) is used when available
- Dramatically improves performance on GPU nodes
"""
import os
import numpy as np
import subprocess
import json
from pathlib import Path
# Try to import CuPy for GPU acceleration
try:
import cupy as cp
CUPY_AVAILABLE = True
except ImportError:
cp = None
CUPY_AVAILABLE = False
# GPU persistence mode - output CuPy arrays instead of numpy
GPU_PERSIST = os.environ.get("STREAMING_GPU_PERSIST", "1") == "1" and CUPY_AVAILABLE
# Check for hardware decode support (cached)
_HWDEC_AVAILABLE = None
def _check_hwdec():
    """Check if NVIDIA hardware decode is available.

    Requires both a working nvidia-smi (driver/GPU present) and ffmpeg
    listing "cuda" among its hwaccels. The result is cached in the
    module-level _HWDEC_AVAILABLE so the subprocess probes run only once.
    """
    global _HWDEC_AVAILABLE
    if _HWDEC_AVAILABLE is not None:
        return _HWDEC_AVAILABLE
    try:
        result = subprocess.run(["nvidia-smi"], capture_output=True, timeout=2)
        if result.returncode != 0:
            _HWDEC_AVAILABLE = False
            return False
        result = subprocess.run(["ffmpeg", "-hwaccels"], capture_output=True, text=True, timeout=5)
        _HWDEC_AVAILABLE = "cuda" in result.stdout
    except Exception:
        # Missing binaries or timeout: treat as "no hardware decode".
        _HWDEC_AVAILABLE = False
    return _HWDEC_AVAILABLE
class VideoSource:
"""Video source with persistent streaming pipe for fast sequential reads."""
@@ -57,7 +96,10 @@ class VideoSource:
print(f"VideoSource: {self.path.name} duration={self._duration} size={self._frame_size}", file=sys.stderr)
def _start_stream(self, seek_time: float = 0):
"""Start or restart the ffmpeg streaming process."""
"""Start or restart the ffmpeg streaming process.
Uses NVIDIA hardware decoding (NVDEC) when available for better performance.
"""
if self._proc:
self._proc.kill()
self._proc = None
@@ -67,15 +109,23 @@ class VideoSource:
raise FileNotFoundError(f"Video file not found: {self.path}")
w, h = self._frame_size
cmd = [
"ffmpeg", "-v", "error", # Show errors instead of quiet
# Build ffmpeg command with optional hardware decode
cmd = ["ffmpeg", "-v", "error"]
# Use hardware decode if available (significantly faster)
if _check_hwdec():
cmd.extend(["-hwaccel", "cuda"])
cmd.extend([
"-ss", f"{seek_time:.3f}",
"-i", str(self.path),
"-f", "rawvideo", "-pix_fmt", "rgb24",
"-s", f"{w}x{h}",
"-r", str(self.fps), # Output at specified fps
"-"
]
])
self._proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
self._stream_time = seek_time
@@ -88,8 +138,11 @@ class VideoSource:
if err:
print(f"ffmpeg error for {self.path.name}: {err}", file=sys.stderr)
def _read_frame_from_stream(self) -> np.ndarray:
"""Read one frame from the stream."""
def _read_frame_from_stream(self):
"""Read one frame from the stream.
Returns CuPy array if GPU_PERSIST is enabled, numpy array otherwise.
"""
w, h = self._frame_size
frame_size = w * h * 3
@@ -100,7 +153,12 @@ class VideoSource:
if len(data) < frame_size:
return None
return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
frame = np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
# Transfer to GPU if persistence mode enabled
if GPU_PERSIST:
return cp.asarray(frame)
return frame
def read(self) -> np.ndarray:
"""Read frame (uses last cached or t=0)."""
@@ -120,6 +178,9 @@ class VideoSource:
seek_time = t
if self._duration and self._duration > 0:
seek_time = t % self._duration
# If we're within 0.1s of the end, wrap to beginning to avoid EOF issues
if seek_time > self._duration - 0.1:
seek_time = 0.0
# Decide whether to seek or continue streaming
# Seek if: no stream, going backwards (more than 1 frame), or jumping more than 2 seconds ahead
@@ -138,24 +199,59 @@ class VideoSource:
self._start_stream(seek_time)
# Skip frames to reach target time
skip_retries = 0
while self._stream_time + self._frame_time <= seek_time:
frame = self._read_frame_from_stream()
if frame is None:
# Stream ended, restart from seek point
# Stream ended or failed - restart from seek point
import time
skip_retries += 1
if skip_retries > 3:
# Give up skipping, just start fresh at seek_time
self._start_stream(seek_time)
time.sleep(0.1)
break
self._start_stream(seek_time)
time.sleep(0.05)
continue
self._stream_time += self._frame_time
skip_retries = 0 # Reset on successful read
# Read the target frame
# Read the target frame with retry logic
frame = None
max_retries = 3
for attempt in range(max_retries):
frame = self._read_frame_from_stream()
if frame is None:
if frame is not None:
break
# Stream failed - try restarting
import sys
import time
print(f"RETRY {self.path.name}: attempt {attempt+1}/{max_retries} at t={t:.2f}", file=sys.stderr)
# Check for ffmpeg errors
if self._proc and self._proc.stderr:
try:
import select
readable, _, _ = select.select([self._proc.stderr], [], [], 0.1)
if readable:
err = self._proc.stderr.read(4096).decode('utf-8', errors='ignore')
if err:
raise RuntimeError(f"Failed to read video frame from {self.path.name}: {err}")
raise RuntimeError(f"Failed to read video frame from {self.path.name} at t={t:.2f} - file may be corrupted or inaccessible")
print(f"ffmpeg error: {err}", file=sys.stderr)
except:
pass
# Wait a bit and restart
time.sleep(0.1)
self._start_stream(seek_time)
# Give ffmpeg time to start
time.sleep(0.1)
if frame is None:
import sys
raise RuntimeError(f"Failed to read video frame from {self.path.name} at t={t:.2f} after {max_retries} retries")
else:
self._stream_time += self._frame_time

View File

@@ -0,0 +1,502 @@
"""
GPU-Accelerated Streaming Primitives
Provides GPU-native video source and frame processing.
Frames stay on GPU memory throughout the pipeline for maximum performance.
Architecture:
- GPUFrame: Wrapper that tracks whether data is on CPU or GPU
- GPUVideoSource: Hardware-accelerated decode to GPU memory
- GPU primitives operate directly on GPU frames
- Transfer to CPU only at final output
Requirements:
- CuPy for CUDA support
- FFmpeg with NVDEC support (for hardware decode)
- NVIDIA GPU with CUDA capability
"""
import os
import sys
import json
import subprocess
import numpy as np
from pathlib import Path
from typing import Optional, Tuple, Union
# Try to import CuPy
try:
import cupy as cp
GPU_AVAILABLE = True
except ImportError:
cp = None
GPU_AVAILABLE = False
# Check for hardware decode support
_HWDEC_AVAILABLE: Optional[bool] = None
def check_hwdec_available() -> bool:
    """Check if NVIDIA hardware decode is available.

    Probes nvidia-smi and ffmpeg's hwaccel list once; the result is cached
    in the module-level _HWDEC_AVAILABLE flag.
    """
    global _HWDEC_AVAILABLE
    if _HWDEC_AVAILABLE is not None:
        return _HWDEC_AVAILABLE
    available = False
    try:
        # GPU driver present?
        smi = subprocess.run(["nvidia-smi"], capture_output=True, timeout=2)
        if smi.returncode == 0:
            # ffmpeg built with CUDA hwaccel?
            hwaccels = subprocess.run(
                ["ffmpeg", "-hwaccels"],
                capture_output=True,
                text=True,
                timeout=5,
            )
            available = "cuda" in hwaccels.stdout
    except Exception:
        available = False
    _HWDEC_AVAILABLE = available
    return _HWDEC_AVAILABLE
class GPUFrame:
    """
    Frame container that tracks data location (CPU/GPU).

    Enables zero-copy operations when data is already on the right device.
    Lazy transfer - only moves data when actually needed.
    """

    def __init__(self, data: Union[np.ndarray, 'cp.ndarray'], on_gpu: bool = None):
        """Wrap *data*, optionally forcing its home device.

        Args:
            data: numpy array, cupy array, or any array-like.
            on_gpu: True to place on GPU (requires CuPy), False to force
                CPU, None to auto-detect from the array type.
        """
        self._cpu_data: Optional[np.ndarray] = None
        self._gpu_data = None  # Optional[cp.ndarray]
        if on_gpu is None:
            # Auto-detect based on type
            if GPU_AVAILABLE and isinstance(data, cp.ndarray):
                self._gpu_data = data
            else:
                self._cpu_data = np.asarray(data)
        elif on_gpu and GPU_AVAILABLE:
            self._gpu_data = cp.asarray(data) if not isinstance(data, cp.ndarray) else data
        else:
            # Force CPU residency. Only route actual CuPy arrays through
            # cp.asnumpy; np.asarray handles every other array-like. (The
            # previous version called cp.asnumpy for any non-numpy input,
            # which crashed when CuPy was absent and data was a list/tuple.)
            if GPU_AVAILABLE and isinstance(data, cp.ndarray):
                self._cpu_data = cp.asnumpy(data)
            else:
                self._cpu_data = np.asarray(data)

    @property
    def cpu(self) -> np.ndarray:
        """Get frame as numpy array (transfers from GPU if needed)."""
        if self._cpu_data is None:
            if self._gpu_data is not None and GPU_AVAILABLE:
                self._cpu_data = cp.asnumpy(self._gpu_data)
            else:
                raise ValueError("No frame data available")
        return self._cpu_data

    @property
    def gpu(self):
        """Get frame as CuPy array (transfers to GPU if needed)."""
        if not GPU_AVAILABLE:
            raise RuntimeError("GPU not available")
        if self._gpu_data is None:
            if self._cpu_data is not None:
                self._gpu_data = cp.asarray(self._cpu_data)
            else:
                raise ValueError("No frame data available")
        return self._gpu_data

    @property
    def is_on_gpu(self) -> bool:
        """Check if data is currently on GPU."""
        return self._gpu_data is not None

    @property
    def shape(self) -> Tuple[int, ...]:
        """Get frame shape (from whichever copy is resident)."""
        if self._gpu_data is not None:
            return self._gpu_data.shape
        return self._cpu_data.shape

    @property
    def dtype(self):
        """Get frame dtype (from whichever copy is resident)."""
        if self._gpu_data is not None:
            return self._gpu_data.dtype
        return self._cpu_data.dtype

    def numpy(self) -> np.ndarray:
        """Alias for cpu property."""
        return self.cpu

    def cupy(self):
        """Alias for gpu property."""
        return self.gpu

    def free_cpu(self):
        """Free CPU memory (keep GPU only). No-op unless a GPU copy exists."""
        if self._gpu_data is not None:
            self._cpu_data = None

    def free_gpu(self):
        """Free GPU memory (keep CPU only). No-op unless a CPU copy exists."""
        if self._cpu_data is not None:
            self._gpu_data = None
class GPUVideoSource:
    """
    GPU-accelerated video source using hardware decode.
    Uses NVDEC for hardware video decoding when available,
    keeping decoded frames in GPU memory for zero-copy processing.
    Falls back to CPU decode if hardware decode unavailable.

    NOTE(review): even with -hwaccel cuda, frames still cross the ffmpeg
    pipe as raw rgb24 CPU bytes (see _start_stream/_read_frame_raw); GPU
    residency actually begins at the GPUFrame upload in read_at(). Confirm
    whether a true zero-copy decode path is intended.
    """
    def __init__(self, path: str, fps: float = 30, prefer_gpu: bool = True):
        """Open ``path`` and probe its metadata.

        Args:
            path: Video file path.
            fps: Output frame rate requested from ffmpeg; also defines the
                per-frame time step used for seek bookkeeping.
            prefer_gpu: Use NVDEC + GPU-resident frames when available.
        """
        self.path = Path(path)
        self.fps = fps
        # GPU mode only when requested AND CuPy AND NVDEC are all present.
        self.prefer_gpu = prefer_gpu and GPU_AVAILABLE and check_hwdec_available()
        self._frame_size: Optional[Tuple[int, int]] = None  # (width, height)
        self._duration: Optional[float] = None  # seconds
        self._proc = None  # running ffmpeg decode process, if any
        self._stream_time = 0.0  # timestamp of the next frame on the pipe
        self._frame_time = 1.0 / fps
        self._last_read_time = -1  # last t passed to read_at (cache key)
        self._cached_frame: Optional[GPUFrame] = None
        # Get video info
        self._probe_video()
        print(f"[GPUVideoSource] {self.path.name}: {self._frame_size}, "
              f"duration={self._duration:.1f}s, gpu={self.prefer_gpu}", file=sys.stderr)
    def _probe_video(self):
        """Probe video file for metadata (frame size and duration) via ffprobe."""
        cmd = ["ffprobe", "-v", "quiet", "-print_format", "json",
               "-show_streams", "-show_format", str(self.path)]
        result = subprocess.run(cmd, capture_output=True, text=True)
        # NOTE(review): json.loads raises if ffprobe is missing or the file
        # is unreadable (empty stdout) — confirm callers tolerate that.
        info = json.loads(result.stdout)
        for stream in info.get("streams", []):
            if stream.get("codec_type") == "video":
                self._frame_size = (stream.get("width", 720), stream.get("height", 720))
                if "duration" in stream:
                    self._duration = float(stream["duration"])
                elif "tags" in stream and "DURATION" in stream["tags"]:
                    # Matroska-style duration tag "HH:MM:SS.fff".
                    dur_str = stream["tags"]["DURATION"]
                    parts = dur_str.split(":")
                    if len(parts) == 3:
                        h, m, s = parts
                        self._duration = int(h) * 3600 + int(m) * 60 + float(s)
                break
        if self._duration is None and "format" in info:
            if "duration" in info["format"]:
                self._duration = float(info["format"]["duration"])
        # Conservative defaults when probing yields nothing.
        if not self._frame_size:
            self._frame_size = (720, 720)
        if not self._duration:
            self._duration = 60.0
    def _start_stream(self, seek_time: float = 0):
        """(Re)start the ffmpeg decode process at ``seek_time`` seconds."""
        if self._proc:
            self._proc.kill()
            # NOTE(review): no wait() after kill() — may leave zombie
            # processes until the interpreter reaps them.
            self._proc = None
        if not self.path.exists():
            raise FileNotFoundError(f"Video file not found: {self.path}")
        w, h = self._frame_size
        # Build ffmpeg command: raw rgb24 frames at the requested size/fps
        # written to stdout.
        cmd = ["ffmpeg", "-v", "error"]
        # Hardware decode if available
        if self.prefer_gpu:
            cmd.extend(["-hwaccel", "cuda"])
        cmd.extend([
            "-ss", f"{seek_time:.3f}",
            "-i", str(self.path),
            "-f", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{w}x{h}",
            "-r", str(self.fps),
            "-"
        ])
        # NOTE(review): stderr=PIPE is never drained; a very chatty ffmpeg
        # could block on a full stderr pipe. Consider DEVNULL.
        self._proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        self._stream_time = seek_time
    def _read_frame_raw(self) -> Optional[np.ndarray]:
        """Read one raw rgb24 frame from the ffmpeg pipe (None on EOF/exit)."""
        w, h = self._frame_size
        frame_size = w * h * 3  # bytes per rgb24 frame
        if not self._proc or self._proc.poll() is not None:
            return None
        data = self._proc.stdout.read(frame_size)
        if len(data) < frame_size:
            return None
        # copy() detaches the array from the transient read buffer.
        return np.frombuffer(data, dtype=np.uint8).reshape((h, w, 3)).copy()
    def read_at(self, t: float) -> Optional[GPUFrame]:
        """
        Read frame at specific time.
        Returns GPUFrame with data on GPU if GPU mode enabled.
        Falls back to the previously cached frame (or None) when decoding
        yields nothing.
        """
        # Cache check
        if t == self._last_read_time and self._cached_frame is not None:
            return self._cached_frame
        # Loop time for shorter videos
        seek_time = t
        if self._duration and self._duration > 0:
            seek_time = t % self._duration
            # Avoid seeking into the final 100ms, where decode may produce
            # no frame at all.
            if seek_time > self._duration - 0.1:
                seek_time = 0.0
        # Determine if we need to seek: no live process, target in the past,
        # or more than 2s ahead (cheaper to restart than to skip that far).
        need_seek = (
            self._proc is None or
            self._proc.poll() is not None or
            seek_time < self._stream_time - self._frame_time or
            seek_time > self._stream_time + 2.0
        )
        if need_seek:
            self._start_stream(seek_time)
        # Skip frames to reach target
        while self._stream_time + self._frame_time <= seek_time:
            frame = self._read_frame_raw()
            if frame is None:
                self._start_stream(seek_time)
                break
            self._stream_time += self._frame_time
        # Read target frame
        frame_np = self._read_frame_raw()
        if frame_np is None:
            return self._cached_frame
        self._stream_time += self._frame_time
        self._last_read_time = t
        # Create GPUFrame - transfer to GPU if in GPU mode
        self._cached_frame = GPUFrame(frame_np, on_gpu=self.prefer_gpu)
        # Free CPU copy if on GPU (saves memory)
        if self.prefer_gpu and self._cached_frame.is_on_gpu:
            self._cached_frame.free_cpu()
        return self._cached_frame
    def read(self) -> Optional[GPUFrame]:
        """Read current frame (cached frame, or the frame at t=0)."""
        if self._cached_frame is not None:
            return self._cached_frame
        return self.read_at(0)
    @property
    def size(self) -> Tuple[int, int]:
        # (width, height) of decoded frames.
        return self._frame_size
    @property
    def duration(self) -> float:
        # Probed (or defaulted) duration in seconds.
        return self._duration
    def close(self):
        """Close the video source (kills the decode process)."""
        if self._proc:
            self._proc.kill()
            self._proc = None
# GPU-aware primitive functions
def gpu_blend(frame_a: GPUFrame, frame_b: GPUFrame, alpha: float = 0.5) -> GPUFrame:
    """Alpha-blend two frames: ``a * alpha + b * (1 - alpha)``.

    With CuPy available the arithmetic runs entirely on the GPU and the
    result stays GPU-resident; otherwise it falls back to numpy on the CPU.
    """
    if GPU_AVAILABLE:
        mixed = frame_a.gpu.astype(cp.float32) * alpha + frame_b.gpu.astype(cp.float32) * (1 - alpha)
        return GPUFrame(mixed.astype(cp.uint8), on_gpu=True)
    mixed = frame_a.cpu.astype(np.float32) * alpha + frame_b.cpu.astype(np.float32) * (1 - alpha)
    return GPUFrame(mixed.astype(np.uint8), on_gpu=False)
def gpu_resize(frame: GPUFrame, size: Tuple[int, int]) -> GPUFrame:
    """Resize a frame to ``size`` ((width, height)).

    GPU path uses cupyx.scipy.ndimage.zoom (bilinear, order=1); CPU path
    uses OpenCV. FIX: ``import cv2`` was executed unconditionally, so the
    GPU path needlessly required OpenCV to be installed — the import now
    lives only in the CPU fallback branch.
    """
    if not GPU_AVAILABLE or not frame.is_on_gpu:
        import cv2  # only needed on the CPU fallback path
        resized = cv2.resize(frame.cpu, size)
        return GPUFrame(resized, on_gpu=False)
    # CuPy doesn't have built-in resize, use scipy zoom
    from cupyx.scipy import ndimage as cpndimage
    gpu_data = frame.gpu
    h, w = gpu_data.shape[:2]
    target_w, target_h = size
    zoom_y = target_h / h
    zoom_x = target_w / w
    if gpu_data.ndim == 3:
        # Zoom spatial axes only; leave the channel axis untouched.
        resized = cpndimage.zoom(gpu_data, (zoom_y, zoom_x, 1), order=1)
    else:
        resized = cpndimage.zoom(gpu_data, (zoom_y, zoom_x), order=1)
    return GPUFrame(resized, on_gpu=True)
def gpu_rotate(frame: GPUFrame, angle: float) -> GPUFrame:
    """Rotate a frame by ``angle`` degrees about its center (canvas size kept)."""
    if GPU_AVAILABLE and frame.is_on_gpu:
        from cupyx.scipy import ndimage as cpndimage
        rotated = cpndimage.rotate(frame.gpu, angle, reshape=False, order=1)
        return GPUFrame(rotated, on_gpu=True)
    import cv2
    image = frame.cpu
    height, width = image.shape[:2]
    matrix = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
    return GPUFrame(cv2.warpAffine(image, matrix, (width, height)), on_gpu=False)
def gpu_brightness(frame: GPUFrame, factor: float) -> GPUFrame:
    """Scale pixel intensities by ``factor``, clamped to [0, 255]."""
    if GPU_AVAILABLE and frame.is_on_gpu:
        scaled = cp.clip(frame.gpu.astype(cp.float32) * factor, 0, 255)
        return GPUFrame(scaled.astype(cp.uint8), on_gpu=True)
    scaled = np.clip(frame.cpu.astype(np.float32) * factor, 0, 255)
    return GPUFrame(scaled.astype(np.uint8), on_gpu=False)
def gpu_composite(frames: list, weights: list = None) -> GPUFrame:
    """
    Weighted sum of several frames into one output frame.

    Weights default to uniform and are normalized to sum to 1 (when their
    sum is positive). Frames whose shape differs from the first frame's are
    resized to match. Runs on the GPU whenever any input frame is already
    GPU-resident; otherwise composites on the CPU.

    Raises:
        ValueError: if ``frames`` is empty.
    """
    if not frames:
        raise ValueError("No frames to composite")
    if len(frames) == 1:
        return frames[0]
    if weights is None:
        weights = [1.0 / len(frames)] * len(frames)
    # Normalize weights so the composite stays in range.
    total = sum(weights)
    if total > 0:
        weights = [w / total for w in weights]
    if GPU_AVAILABLE and any(f.is_on_gpu for f in frames):
        # GPU accumulation path.
        target_shape = frames[0].gpu.shape
        acc = cp.zeros(target_shape, dtype=cp.float32)
        for frm, wt in zip(frames, weights):
            data = frm.gpu.astype(cp.float32)
            if data.shape != target_shape:
                # Resize mismatched frames to the target shape.
                from cupyx.scipy import ndimage as cpndimage
                th, tw = target_shape[:2]
                sh, sw = data.shape[:2]
                factors = (th / sh, tw / sw, 1) if data.ndim == 3 else (th / sh, tw / sw)
                data = cpndimage.zoom(data, factors, order=1)
            acc += data * wt
        return GPUFrame(cp.clip(acc, 0, 255).astype(cp.uint8), on_gpu=True)
    # CPU accumulation path.
    import cv2
    target_shape = frames[0].cpu.shape
    acc = np.zeros(target_shape, dtype=np.float32)
    for frm, wt in zip(frames, weights):
        data = frm.cpu.astype(np.float32)
        if data.shape != target_shape:
            data = cv2.resize(data, (target_shape[1], target_shape[0]))
        acc += data * wt
    return GPUFrame(np.clip(acc, 0, 255).astype(np.uint8), on_gpu=False)
# Primitive registration for streaming interpreter
def get_primitives():
    """
    Get GPU-aware primitives for registration with interpreter.
    Wraps this module's GPU helpers so the sexp interpreter can call them;
    image results are returned as numpy arrays for compatibility.
    """
    def _as_frame(value):
        # Coerce an arbitrary image value into a GPUFrame wrapper.
        return value if isinstance(value, GPUFrame) else GPUFrame(value)
    def prim_make_video_source_gpu(path: str, fps: float = 30):
        """Create GPU-accelerated video source."""
        return GPUVideoSource(path, fps, prefer_gpu=True)
    def prim_gpu_blend(a, b, alpha=0.5):
        """Blend two frames."""
        return gpu_blend(_as_frame(a), _as_frame(b), alpha).cpu
    def prim_gpu_rotate(img, angle):
        """Rotate image."""
        return gpu_rotate(_as_frame(img), angle).cpu
    def prim_gpu_brightness(img, factor):
        """Adjust brightness."""
        return gpu_brightness(_as_frame(img), factor).cpu
    return {
        'streaming-gpu:make-video-source': prim_make_video_source_gpu,
        'gpu:blend': prim_gpu_blend,
        'gpu:rotate': prim_gpu_rotate,
        'gpu:brightness': prim_gpu_brightness,
    }
# Export
# Public module API: the availability flag, frame/source wrappers, the
# GPU-aware primitive functions, and the interpreter registration hook.
__all__ = [
    'GPU_AVAILABLE',
    'GPUFrame',
    'GPUVideoSource',
    'gpu_blend',
    'gpu_resize',
    'gpu_rotate',
    'gpu_brightness',
    'gpu_composite',
    'get_primitives',
    'check_hwdec_available',
]

View File

@@ -0,0 +1,715 @@
"""
S-Expression to WGSL Compiler
Compiles sexp effect definitions to WGSL compute shaders for GPU execution.
The compilation happens at effect upload time (AOT), not at runtime.
Architecture:
- Parse sexp AST
- Analyze primitives used
- Generate WGSL compute shader
Shader Categories:
1. Per-pixel ops: brightness, invert, grayscale, sepia (1 thread per pixel)
2. Geometric transforms: rotate, scale, wave, ripple (coordinate remap + sample)
3. Neighborhood ops: blur, sharpen, edge detect (sample neighbors)
"""
from typing import Dict, List, Any, Optional, Tuple, Set
from dataclasses import dataclass, field
from pathlib import Path
import math
from .parser import parse, parse_all, Symbol, Keyword
@dataclass
class WGSLParam:
    """A shader parameter (uniform).

    Mirrors one user-tunable effect parameter as a field appended to the
    generated WGSL ``Params`` uniform struct.
    """
    name: str  # field name as it appears in the Params struct
    wgsl_type: str  # f32, i32, u32, vec2f, etc.
    default: Any  # default value from the effect's :params block
@dataclass
class CompiledEffect:
    """Result of compiling an sexp effect to WGSL."""
    name: str  # effect name from (define-effect name ...)
    wgsl_code: str  # complete WGSL compute-shader source
    params: List[WGSLParam]  # user params appended to the Params uniform
    workgroup_size: Tuple[int, int, int] = (16, 16, 1)  # matches @workgroup_size in the shader
    # Metadata for runtime
    uses_time: bool = False  # shader reads params.time
    uses_sampling: bool = False  # Needs texture sampler
    category: str = "per_pixel"  # per_pixel, geometric, neighborhood
@dataclass
class CompilerContext:
    """Mutable state threaded through one compilation run.

    NOTE: the compiler also attaches a ``body_expr`` attribute dynamically
    (the effect's body AST) in _compile_effect_def; _generate_wgsl reads it.
    """
    effect_name: str = ""  # from (define-effect <name> ...)
    params: Dict[str, WGSLParam] = field(default_factory=dict)
    locals: Dict[str, str] = field(default_factory=dict)  # local var -> wgsl expr
    required_libs: Set[str] = field(default_factory=set)  # libs from (require-primitives ...)
    uses_time: bool = False  # set when the effect references t / _time
    uses_sampling: bool = False  # set when bilinear sampling is needed
    temp_counter: int = 0  # monotonically increasing id for fresh_temp()
    def fresh_temp(self) -> str:
        """Generate a fresh temporary variable name."""
        self.temp_counter += 1
        return f"_t{self.temp_counter}"
class SexpToWGSLCompiler:
    """
    Compiles S-expression effect definitions to WGSL compute shaders.

    One instance may be reused: compile()/compile_string()/compile_file()
    reset the internal CompilerContext on every call.
    """
    # Map sexp types to WGSL types
    TYPE_MAP = {
        'int': 'i32',
        'float': 'f32',
        'bool': 'u32',  # WGSL doesn't have bool in storage
        'string': None,  # Strings handled specially
    }
    # Per-pixel primitives that can be compiled directly
    PER_PIXEL_PRIMITIVES = {
        'color_ops:invert-img',
        'color_ops:grayscale',
        'color_ops:sepia',
        'color_ops:adjust',
        'color_ops:adjust-brightness',
        'color_ops:shift-hsv',
        'color_ops:quantize',
    }
    # Geometric primitives (coordinate remapping)
    GEOMETRIC_PRIMITIVES = {
        'geometry:scale-img',
        'geometry:rotate-img',
        'geometry:translate',
        'geometry:flip-h',
        'geometry:flip-v',
        'geometry:remap',
    }
    def __init__(self):
        # Per-compilation state; created fresh in compile().
        self.ctx: Optional[CompilerContext] = None
    def compile_file(self, path: str) -> CompiledEffect:
        """Compile an effect from a .sexp file."""
        with open(path, 'r') as f:
            content = f.read()
        exprs = parse_all(content)
        return self.compile(exprs)
    def compile_string(self, sexp_code: str) -> CompiledEffect:
        """Compile an effect from an sexp string."""
        exprs = parse_all(sexp_code)
        return self.compile(exprs)
    def compile(self, expr: Any) -> CompiledEffect:
        """Compile a parsed sexp expression (or a list of top-level forms)."""
        self.ctx = CompilerContext()
        # Handle multiple top-level expressions (require-primitives, define-effect)
        if isinstance(expr, list) and expr and isinstance(expr[0], list):
            for e in expr:
                self._process_toplevel(e)
        else:
            self._process_toplevel(expr)
        # Generate the WGSL shader
        wgsl = self._generate_wgsl()
        # Determine category based on primitives used
        category = self._determine_category()
        return CompiledEffect(
            name=self.ctx.effect_name,
            wgsl_code=wgsl,
            params=list(self.ctx.params.values()),
            uses_time=self.ctx.uses_time,
            uses_sampling=self.ctx.uses_sampling,
            category=category,
        )
    def _process_toplevel(self, expr: Any):
        """Process a top-level expression (require-primitives or define-effect)."""
        if not isinstance(expr, list) or not expr:
            return
        head = expr[0]
        if isinstance(head, Symbol):
            if head.name == 'require-primitives':
                # Track required primitive libraries
                for lib in expr[1:]:
                    lib_name = lib.name if isinstance(lib, Symbol) else str(lib)
                    self.ctx.required_libs.add(lib_name)
            elif head.name == 'define-effect':
                self._compile_effect_def(expr)
    def _compile_effect_def(self, expr: list):
        """Compile a define-effect form."""
        # (define-effect name :params (...) body)
        self.ctx.effect_name = expr[1].name if isinstance(expr[1], Symbol) else str(expr[1])
        # Parse :params and body
        i = 2
        body = None
        while i < len(expr):
            item = expr[i]
            if isinstance(item, Keyword) and item.name == 'params':
                self._parse_params(expr[i + 1])
                i += 2
            elif isinstance(item, Keyword):
                i += 2  # Skip other keywords
            else:
                body = item
                i += 1
        if body:
            # NOTE(review): body_expr is attached dynamically to the context
            # dataclass; when the effect has no body, _generate_wgsl will
            # raise AttributeError on the missing attribute — confirm intent.
            self.ctx.body_expr = body
    def _parse_params(self, params_list: list):
        """Parse the :params block into WGSLParam entries."""
        for param_def in params_list:
            if not isinstance(param_def, list):
                continue
            name = param_def[0].name if isinstance(param_def[0], Symbol) else str(param_def[0])
            # Parse keyword args
            param_type = 'float'
            default = 0
            i = 1
            while i < len(param_def):
                item = param_def[i]
                if isinstance(item, Keyword):
                    if i + 1 < len(param_def):
                        val = param_def[i + 1]
                        if item.name == 'type':
                            param_type = val.name if isinstance(val, Symbol) else str(val)
                        elif item.name == 'default':
                            default = val
                    i += 2
                else:
                    i += 1
            # Unknown types fall back to f32; string-typed params are skipped.
            wgsl_type = self.TYPE_MAP.get(param_type, 'f32')
            if wgsl_type:
                self.ctx.params[name] = WGSLParam(name, wgsl_type, default)
    def _determine_category(self) -> str:
        """Determine shader category based on primitives used."""
        for lib in self.ctx.required_libs:
            if lib == 'geometry':
                return 'geometric'
            if lib == 'filters':
                return 'neighborhood'
        return 'per_pixel'
    def _generate_wgsl(self) -> str:
        """Generate the complete WGSL shader code.

        Layout: header comment, storage bindings, Params uniform struct,
        helper functions, then the main compute entry point whose body is
        the compiled effect.
        """
        lines = []
        # Header comment
        lines.append(f"// WGSL Shader: {self.ctx.effect_name}")
        lines.append(f"// Auto-generated from sexp effect definition")
        lines.append("")
        # Bindings: packed u32 RGB pixels in, same format out.
        lines.append("@group(0) @binding(0) var<storage, read> input: array<u32>;")
        lines.append("@group(0) @binding(1) var<storage, read_write> output: array<u32>;")
        lines.append("")
        # Params struct
        if self.ctx.params:
            lines.append("struct Params {")
            lines.append("    width: u32,")
            lines.append("    height: u32,")
            lines.append("    time: f32,")
            for param in self.ctx.params.values():
                lines.append(f"    {param.name}: {param.wgsl_type},")
            lines.append("}")
            lines.append("@group(0) @binding(2) var<uniform> params: Params;")
        else:
            lines.append("struct Params {")
            lines.append("    width: u32,")
            lines.append("    height: u32,")
            lines.append("    time: f32,")
            lines.append("}")
            lines.append("@group(0) @binding(2) var<uniform> params: Params;")
        lines.append("")
        # Helper functions
        lines.extend(self._generate_helpers())
        lines.append("")
        # Main compute shader: one thread per pixel, guarded against the
        # partial workgroups at the right/bottom edges.
        lines.append("@compute @workgroup_size(16, 16, 1)")
        lines.append("fn main(@builtin(global_invocation_id) gid: vec3<u32>) {")
        lines.append("    let x = gid.x;")
        lines.append("    let y = gid.y;")
        lines.append("    if (x >= params.width || y >= params.height) { return; }")
        lines.append("    let idx = y * params.width + x;")
        lines.append("")
        # Compile the effect body (body_expr is set by _compile_effect_def).
        body_code = self._compile_expr(self.ctx.body_expr)
        lines.append(f"    // Effect: {self.ctx.effect_name}")
        lines.append(body_code)
        lines.append("}")
        return "\n".join(lines)
    def _generate_helpers(self) -> List[str]:
        """Generate WGSL helper functions (emitted only when needed)."""
        helpers = []
        # Pack/unpack RGB from u32 (0xRRGGBB layout, normalized to [0,1]).
        helpers.append("fn unpack_rgb(packed: u32) -> vec3<f32> {")
        helpers.append("    let r = f32((packed >> 16u) & 0xFFu) / 255.0;")
        helpers.append("    let g = f32((packed >> 8u) & 0xFFu) / 255.0;")
        helpers.append("    let b = f32(packed & 0xFFu) / 255.0;")
        helpers.append("    return vec3<f32>(r, g, b);")
        helpers.append("}")
        helpers.append("")
        helpers.append("fn pack_rgb(rgb: vec3<f32>) -> u32 {")
        helpers.append("    let r = u32(clamp(rgb.r, 0.0, 1.0) * 255.0);")
        helpers.append("    let g = u32(clamp(rgb.g, 0.0, 1.0) * 255.0);")
        helpers.append("    let b = u32(clamp(rgb.b, 0.0, 1.0) * 255.0);")
        helpers.append("    return (r << 16u) | (g << 8u) | b;")
        helpers.append("}")
        helpers.append("")
        # Bilinear sampling for geometric transforms
        if self.ctx.uses_sampling or 'geometry' in self.ctx.required_libs:
            helpers.append("fn sample_bilinear(sx: f32, sy: f32) -> vec3<f32> {")
            helpers.append("    let w = f32(params.width);")
            helpers.append("    let h = f32(params.height);")
            helpers.append("    let cx = clamp(sx, 0.0, w - 1.001);")
            helpers.append("    let cy = clamp(sy, 0.0, h - 1.001);")
            helpers.append("    let x0 = u32(cx);")
            helpers.append("    let y0 = u32(cy);")
            helpers.append("    let x1 = min(x0 + 1u, params.width - 1u);")
            helpers.append("    let y1 = min(y0 + 1u, params.height - 1u);")
            helpers.append("    let fx = cx - f32(x0);")
            helpers.append("    let fy = cy - f32(y0);")
            helpers.append("    let c00 = unpack_rgb(input[y0 * params.width + x0]);")
            helpers.append("    let c10 = unpack_rgb(input[y0 * params.width + x1]);")
            helpers.append("    let c01 = unpack_rgb(input[y1 * params.width + x0]);")
            helpers.append("    let c11 = unpack_rgb(input[y1 * params.width + x1]);")
            helpers.append("    let top = mix(c00, c10, fx);")
            helpers.append("    let bot = mix(c01, c11, fx);")
            helpers.append("    return mix(top, bot, fy);")
            helpers.append("}")
            helpers.append("")
        # HSV conversion for color effects
        if 'color_ops' in self.ctx.required_libs or 'color' in self.ctx.required_libs:
            helpers.append("fn rgb_to_hsv(rgb: vec3<f32>) -> vec3<f32> {")
            helpers.append("    let mx = max(max(rgb.r, rgb.g), rgb.b);")
            helpers.append("    let mn = min(min(rgb.r, rgb.g), rgb.b);")
            helpers.append("    let d = mx - mn;")
            helpers.append("    var h = 0.0;")
            helpers.append("    if (d > 0.0) {")
            helpers.append("        if (mx == rgb.r) { h = (rgb.g - rgb.b) / d; }")
            helpers.append("        else if (mx == rgb.g) { h = 2.0 + (rgb.b - rgb.r) / d; }")
            helpers.append("        else { h = 4.0 + (rgb.r - rgb.g) / d; }")
            helpers.append("        h = h / 6.0;")
            helpers.append("        if (h < 0.0) { h = h + 1.0; }")
            helpers.append("    }")
            helpers.append("    let s = select(0.0, d / mx, mx > 0.0);")
            helpers.append("    return vec3<f32>(h, s, mx);")
            helpers.append("}")
            helpers.append("")
            helpers.append("fn hsv_to_rgb(hsv: vec3<f32>) -> vec3<f32> {")
            helpers.append("    let h = hsv.x * 6.0;")
            helpers.append("    let s = hsv.y;")
            helpers.append("    let v = hsv.z;")
            helpers.append("    let c = v * s;")
            helpers.append("    let x = c * (1.0 - abs(h % 2.0 - 1.0));")
            helpers.append("    let m = v - c;")
            helpers.append("    var rgb: vec3<f32>;")
            helpers.append("    if (h < 1.0) { rgb = vec3<f32>(c, x, 0.0); }")
            helpers.append("    else if (h < 2.0) { rgb = vec3<f32>(x, c, 0.0); }")
            helpers.append("    else if (h < 3.0) { rgb = vec3<f32>(0.0, c, x); }")
            helpers.append("    else if (h < 4.0) { rgb = vec3<f32>(0.0, x, c); }")
            helpers.append("    else if (h < 5.0) { rgb = vec3<f32>(x, 0.0, c); }")
            helpers.append("    else { rgb = vec3<f32>(c, 0.0, x); }")
            helpers.append("    return rgb + vec3<f32>(m, m, m);")
            helpers.append("}")
            helpers.append("")
        return helpers
    def _compile_expr(self, expr: Any, indent: int = 4) -> str:
        """Compile an sexp expression to WGSL statement code."""
        ind = " " * indent
        # Literals
        if isinstance(expr, (int, float)):
            return f"{ind}// literal: {expr}"
        if isinstance(expr, str):
            return f'{ind}// string: "{expr}"'
        # Symbol reference
        if isinstance(expr, Symbol):
            name = expr.name
            if name == 'frame':
                return f"{ind}let rgb = unpack_rgb(input[idx]);"
            if name == 't' or name == '_time':
                self.ctx.uses_time = True
                return f"{ind}let t = params.time;"
            if name in self.ctx.params:
                return f"{ind}let {name} = params.{name};"
            if name in self.ctx.locals:
                return f"{ind}// local: {name}"
            return f"{ind}// unknown symbol: {name}"
        # List (function call or special form)
        if isinstance(expr, list) and expr:
            head = expr[0]
            if isinstance(head, Symbol):
                form = head.name
                # Special forms
                if form == 'let' or form == 'let*':
                    return self._compile_let(expr, indent)
                if form == 'if':
                    return self._compile_if(expr, indent)
                if form == 'or':
                    # (or a b) - return a if truthy, else b
                    return self._compile_or(expr, indent)
                # Primitive calls (namespaced names like color_ops:sepia)
                if ':' in form:
                    return self._compile_primitive_call(expr, indent)
                # Arithmetic
                if form in ('+', '-', '*', '/'):
                    return self._compile_arithmetic(expr, indent)
                if form in ('>', '<', '>=', '<=', '='):
                    return self._compile_comparison(expr, indent)
                if form == 'max':
                    return self._compile_builtin('max', expr[1:], indent)
                if form == 'min':
                    return self._compile_builtin('min', expr[1:], indent)
        return f"{ind}// unhandled: {expr}"
    def _compile_let(self, expr: list, indent: int) -> str:
        """Compile let/let* binding form."""
        ind = " " * indent
        lines = []
        bindings = expr[1]
        body = expr[2]
        # Parse bindings (Clojure style: [x 1 y 2] or Scheme style: ((x 1) (y 2)))
        pairs = []
        if bindings and isinstance(bindings[0], Symbol):
            # Clojure style
            i = 0
            while i < len(bindings) - 1:
                name = bindings[i].name if isinstance(bindings[i], Symbol) else str(bindings[i])
                value = bindings[i + 1]
                pairs.append((name, value))
                i += 2
        else:
            # Scheme style
            for binding in bindings:
                name = binding[0].name if isinstance(binding[0], Symbol) else str(binding[0])
                value = binding[1]
                pairs.append((name, value))
        # Compile bindings as WGSL lets; remember each for later references.
        for name, value in pairs:
            val_code = self._expr_to_wgsl(value)
            lines.append(f"{ind}let {name} = {val_code};")
            self.ctx.locals[name] = val_code
        # Compile body
        body_lines = self._compile_body(body, indent)
        lines.append(body_lines)
        return "\n".join(lines)
    def _compile_body(self, body: Any, indent: int) -> str:
        """Compile the body of an effect (the final image expression)."""
        ind = " " * indent
        # Most effects end with a primitive call that produces the output
        if isinstance(body, list) and body:
            head = body[0]
            if isinstance(head, Symbol) and ':' in head.name:
                return self._compile_primitive_call(body, indent)
        # If body is just 'frame', pass through
        if isinstance(body, Symbol) and body.name == 'frame':
            return f"{ind}output[idx] = input[idx];"
        return f"{ind}// body: {body}"
    def _compile_primitive_call(self, expr: list, indent: int) -> str:
        """Compile a primitive function call to WGSL statements.

        Each recognized primitive expands to a small template that reads
        input[idx] and writes output[idx]; unknown primitives pass through.
        """
        ind = " " * indent
        head = expr[0]
        prim_name = head.name if isinstance(head, Symbol) else str(head)
        args = expr[1:]
        # Per-pixel color operations
        if prim_name == 'color_ops:invert-img':
            return f"""{ind}let rgb = unpack_rgb(input[idx]);
{ind}let result = vec3<f32>(1.0, 1.0, 1.0) - rgb;
{ind}output[idx] = pack_rgb(result);"""
        if prim_name == 'color_ops:grayscale':
            return f"""{ind}let rgb = unpack_rgb(input[idx]);
{ind}let gray = 0.299 * rgb.r + 0.587 * rgb.g + 0.114 * rgb.b;
{ind}let result = vec3<f32>(gray, gray, gray);
{ind}output[idx] = pack_rgb(result);"""
        if prim_name == 'color_ops:adjust-brightness':
            amount = self._expr_to_wgsl(args[1]) if len(args) > 1 else "0.0"
            return f"""{ind}let rgb = unpack_rgb(input[idx]);
{ind}let adj = f32({amount}) / 255.0;
{ind}let result = clamp(rgb + vec3<f32>(adj, adj, adj), vec3<f32>(0.0, 0.0, 0.0), vec3<f32>(1.0, 1.0, 1.0));
{ind}output[idx] = pack_rgb(result);"""
        if prim_name == 'color_ops:adjust':
            # (adjust img brightness contrast)
            brightness = self._expr_to_wgsl(args[1]) if len(args) > 1 else "0.0"
            contrast = self._expr_to_wgsl(args[2]) if len(args) > 2 else "1.0"
            return f"""{ind}let rgb = unpack_rgb(input[idx]);
{ind}let centered = rgb - vec3<f32>(0.5, 0.5, 0.5);
{ind}let contrasted = centered * {contrast};
{ind}let brightened = contrasted + vec3<f32>(0.5, 0.5, 0.5) + vec3<f32>({brightness}/255.0);
{ind}let result = clamp(brightened, vec3<f32>(0.0), vec3<f32>(1.0));
{ind}output[idx] = pack_rgb(result);"""
        if prim_name == 'color_ops:sepia':
            intensity = self._expr_to_wgsl(args[1]) if len(args) > 1 else "1.0"
            return f"""{ind}let rgb = unpack_rgb(input[idx]);
{ind}let sepia_r = 0.393 * rgb.r + 0.769 * rgb.g + 0.189 * rgb.b;
{ind}let sepia_g = 0.349 * rgb.r + 0.686 * rgb.g + 0.168 * rgb.b;
{ind}let sepia_b = 0.272 * rgb.r + 0.534 * rgb.g + 0.131 * rgb.b;
{ind}let sepia = vec3<f32>(sepia_r, sepia_g, sepia_b);
{ind}let result = mix(rgb, sepia, {intensity});
{ind}output[idx] = pack_rgb(clamp(result, vec3<f32>(0.0), vec3<f32>(1.0)));"""
        if prim_name == 'color_ops:shift-hsv':
            h_shift = self._expr_to_wgsl(args[1]) if len(args) > 1 else "0.0"
            s_mult = self._expr_to_wgsl(args[2]) if len(args) > 2 else "1.0"
            v_mult = self._expr_to_wgsl(args[3]) if len(args) > 3 else "1.0"
            return f"""{ind}let rgb = unpack_rgb(input[idx]);
{ind}var hsv = rgb_to_hsv(rgb);
{ind}hsv.x = fract(hsv.x + {h_shift} / 360.0);
{ind}hsv.y = clamp(hsv.y * {s_mult}, 0.0, 1.0);
{ind}hsv.z = clamp(hsv.z * {v_mult}, 0.0, 1.0);
{ind}let result = hsv_to_rgb(hsv);
{ind}output[idx] = pack_rgb(result);"""
        if prim_name == 'color_ops:quantize':
            levels = self._expr_to_wgsl(args[1]) if len(args) > 1 else "8.0"
            return f"""{ind}let rgb = unpack_rgb(input[idx]);
{ind}let lvl = max(2.0, {levels});
{ind}let result = floor(rgb * lvl) / lvl;
{ind}output[idx] = pack_rgb(result);"""
        # Geometric transforms
        if prim_name == 'geometry:scale-img':
            sx = self._expr_to_wgsl(args[1]) if len(args) > 1 else "1.0"
            sy = self._expr_to_wgsl(args[2]) if len(args) > 2 else sx
            self.ctx.uses_sampling = True
            return f"""{ind}let w = f32(params.width);
{ind}let h = f32(params.height);
{ind}let cx = w / 2.0;
{ind}let cy = h / 2.0;
{ind}let sx = f32(x) - cx;
{ind}let sy = f32(y) - cy;
{ind}let src_x = sx / {sx} + cx;
{ind}let src_y = sy / {sy} + cy;
{ind}let result = sample_bilinear(src_x, src_y);
{ind}output[idx] = pack_rgb(result);"""
        if prim_name == 'geometry:rotate-img':
            angle = self._expr_to_wgsl(args[1]) if len(args) > 1 else "0.0"
            self.ctx.uses_sampling = True
            return f"""{ind}let w = f32(params.width);
{ind}let h = f32(params.height);
{ind}let cx = w / 2.0;
{ind}let cy = h / 2.0;
{ind}let angle_rad = {angle} * 3.14159265 / 180.0;
{ind}let cos_a = cos(-angle_rad);
{ind}let sin_a = sin(-angle_rad);
{ind}let dx = f32(x) - cx;
{ind}let dy = f32(y) - cy;
{ind}let src_x = dx * cos_a - dy * sin_a + cx;
{ind}let src_y = dx * sin_a + dy * cos_a + cy;
{ind}let result = sample_bilinear(src_x, src_y);
{ind}output[idx] = pack_rgb(result);"""
        if prim_name == 'geometry:flip-h':
            return f"""{ind}let src_idx = y * params.width + (params.width - 1u - x);
{ind}output[idx] = input[src_idx];"""
        if prim_name == 'geometry:flip-v':
            return f"""{ind}let src_idx = (params.height - 1u - y) * params.width + x;
{ind}output[idx] = input[src_idx];"""
        # Image library
        if prim_name == 'image:blur':
            radius = self._expr_to_wgsl(args[1]) if len(args) > 1 else "5"
            # Box blur approximation (separable would be better)
            return f"""{ind}let radius = i32({radius});
{ind}var sum = vec3<f32>(0.0, 0.0, 0.0);
{ind}var count = 0.0;
{ind}for (var dy = -radius; dy <= radius; dy = dy + 1) {{
{ind}    for (var dx = -radius; dx <= radius; dx = dx + 1) {{
{ind}        let sx = i32(x) + dx;
{ind}        let sy = i32(y) + dy;
{ind}        if (sx >= 0 && sx < i32(params.width) && sy >= 0 && sy < i32(params.height)) {{
{ind}            let sidx = u32(sy) * params.width + u32(sx);
{ind}            sum = sum + unpack_rgb(input[sidx]);
{ind}            count = count + 1.0;
{ind}        }}
{ind}    }}
{ind}}}
{ind}let result = sum / count;
{ind}output[idx] = pack_rgb(result);"""
        # Fallback - passthrough
        return f"""{ind}// Unimplemented primitive: {prim_name}
{ind}output[idx] = input[idx];"""
    def _compile_if(self, expr: list, indent: int) -> str:
        """Compile if expression into a WGSL if/else statement."""
        ind = " " * indent
        cond = self._expr_to_wgsl(expr[1])
        then_expr = expr[2]
        else_expr = expr[3] if len(expr) > 3 else None
        lines = []
        lines.append(f"{ind}if ({cond}) {{")
        lines.append(self._compile_body(then_expr, indent + 4))
        if else_expr:
            lines.append(f"{ind}}} else {{")
            lines.append(self._compile_body(else_expr, indent + 4))
        lines.append(f"{ind}}}")
        return "\n".join(lines)
    def _compile_or(self, expr: list, indent: int) -> str:
        """Compile or expression - returns first truthy value."""
        # For numeric context, (or a b) means "a if a != 0 else b"
        a = self._expr_to_wgsl(expr[1])
        b = self._expr_to_wgsl(expr[2]) if len(expr) > 2 else "0.0"
        return f"select({b}, {a}, {a} != 0.0)"
    def _compile_arithmetic(self, expr: list, indent: int) -> str:
        """Compile arithmetic expression to inline WGSL."""
        op = expr[0].name
        operands = [self._expr_to_wgsl(arg) for arg in expr[1:]]
        if len(operands) == 1:
            # Unary minus is negation; any other unary op is the identity.
            if op == '-':
                return f"(-{operands[0]})"
            return operands[0]
        return f"({f' {op} '.join(operands)})"
    def _compile_comparison(self, expr: list, indent: int) -> str:
        """Compile comparison expression (sexp '=' becomes WGSL '==')."""
        op = expr[0].name
        if op == '=':
            op = '=='
        a = self._expr_to_wgsl(expr[1])
        b = self._expr_to_wgsl(expr[2])
        return f"({a} {op} {b})"
    def _compile_builtin(self, fn: str, args: list, indent: int) -> str:
        """Compile builtin function call (max, min, ...)."""
        compiled_args = [self._expr_to_wgsl(arg) for arg in args]
        return f"{fn}({', '.join(compiled_args)})"
    def _expr_to_wgsl(self, expr: Any) -> str:
        """Convert an expression to inline WGSL code (an expression string)."""
        if isinstance(expr, (int, float)):
            # Ensure floats have decimal point.
            # NOTE(review): integer literals are also emitted as floats
            # (e.g. 5 -> "5.0"), which suits f32 contexts but would mistype
            # genuine i32 expressions — confirm this is intended.
            if isinstance(expr, float) or '.' not in str(expr):
                return f"{float(expr)}"
            return str(expr)
        if isinstance(expr, str):
            return f'"{expr}"'
        if isinstance(expr, Symbol):
            name = expr.name
            if name == 'frame':
                return "rgb"  # Assume rgb is already loaded
            if name == 't' or name == '_time':
                self.ctx.uses_time = True
                return "params.time"
            if name == 'pi':
                return "3.14159265"
            if name in self.ctx.params:
                return f"params.{name}"
            if name in self.ctx.locals:
                return name
            return name
        if isinstance(expr, list) and expr:
            head = expr[0]
            if isinstance(head, Symbol):
                form = head.name
                # Arithmetic
                if form in ('+', '-', '*', '/'):
                    return self._compile_arithmetic(expr, 0)
                # Comparison
                if form in ('>', '<', '>=', '<=', '='):
                    return self._compile_comparison(expr, 0)
                # Builtins
                if form in ('max', 'min', 'abs', 'floor', 'ceil', 'sin', 'cos', 'sqrt'):
                    args = [self._expr_to_wgsl(a) for a in expr[1:]]
                    return f"{form}({', '.join(args)})"
                if form == 'or':
                    return self._compile_or(expr, 0)
                # Image dimension queries
                if form == 'image:width':
                    return "f32(params.width)"
                if form == 'image:height':
                    return "f32(params.height)"
        return f"/* unknown: {expr} */"
def compile_effect(sexp_code: str) -> CompiledEffect:
    """Compile an sexp effect source string into a CompiledEffect."""
    wgsl_compiler = SexpToWGSLCompiler()
    return wgsl_compiler.compile_string(sexp_code)
def compile_effect_file(path: str) -> CompiledEffect:
    """Compile an sexp effect file into a CompiledEffect."""
    wgsl_compiler = SexpToWGSLCompiler()
    return wgsl_compiler.compile_file(path)

View File

@@ -68,6 +68,8 @@ class NumpyBackend(Backend):
def load_effect(self, effect_path: Path) -> Any:
"""Load an effect from sexp file."""
if isinstance(effect_path, str):
effect_path = Path(effect_path)
effect_key = str(effect_path)
if effect_key not in self._loaded_effects:
interp = self._get_interpreter()
@@ -260,23 +262,258 @@ class NumpyBackend(Backend):
return np.clip(result, 0, 255).astype(np.uint8)
class GLSLBackend(Backend):
class WGPUBackend(Backend):
"""
GPU-based effect processing using OpenGL/GLSL.
GPU-based effect processing using wgpu/WebGPU compute shaders.
Requires GPU with OpenGL 3.3+ support (or Mesa software renderer).
Achieves 30+ fps real-time processing.
Compiles sexp effects to WGSL at load time, executes on GPU.
Achieves 30+ fps real-time processing on supported hardware.
TODO: Implement when ready for GPU acceleration.
Requirements:
- wgpu-py library
- Vulkan-capable GPU (or software renderer)
"""
def __init__(self):
raise NotImplementedError(
"GLSL backend not yet implemented. Use NumpyBackend for now."
)
def __init__(self, recipe_dir: Path = None):
self.recipe_dir = recipe_dir or Path(".")
self._device = None
self._loaded_effects: Dict[str, Any] = {} # name -> compiled shader info
self._numpy_fallback = NumpyBackend(recipe_dir)
# Buffer pool for reuse - keyed by (width, height)
self._buffer_pool: Dict[tuple, Dict] = {}
def _ensure_device(self):
    """Lazy-initialize the wgpu device.

    Idempotent: returns immediately if a device already exists. A failed
    attempt leaves ``self._device`` as None (the CPU-fallback sentinel)
    and will be retried on the next call.
    """
    if self._device is not None:
        return
    try:
        import wgpu
        # Prefer the discrete/high-performance adapter when several exist.
        adapter = wgpu.gpu.request_adapter_sync(power_preference="high-performance")
        self._device = adapter.request_device_sync()
        print(f"[WGPUBackend] Using GPU: {adapter.info.get('device', 'unknown')}")
    except Exception as e:
        # Any failure (wgpu missing, no Vulkan adapter, driver error)
        # downgrades to CPU; callers test self._device for None.
        print(f"[WGPUBackend] GPU init failed: {e}, falling back to CPU")
        self._device = None
def load_effect(self, effect_path: Path) -> Any:
pass
"""Load and compile an effect from sexp file to WGSL."""
effect_key = str(effect_path)
if effect_key in self._loaded_effects:
return self._loaded_effects[effect_key]
try:
from sexp_effects.wgsl_compiler import compile_effect_file
compiled = compile_effect_file(str(effect_path))
self._ensure_device()
if self._device is None:
# Fall back to numpy
return self._numpy_fallback.load_effect(effect_path)
# Create shader module
import wgpu
shader_module = self._device.create_shader_module(code=compiled.wgsl_code)
# Create compute pipeline
pipeline = self._device.create_compute_pipeline(
layout="auto",
compute={"module": shader_module, "entry_point": "main"}
)
self._loaded_effects[effect_key] = {
'compiled': compiled,
'pipeline': pipeline,
'name': compiled.name,
}
return compiled.name
except Exception as e:
print(f"[WGPUBackend] Failed to compile {effect_path}: {e}")
# Fall back to numpy for this effect
return self._numpy_fallback.load_effect(effect_path)
def _resolve_binding(self, value: Any, t: float, analysis_data: Dict) -> Any:
    """Resolve a parameter binding to its concrete value at time *t*.

    Delegates to the numpy backend's implementation so both backends
    resolve bindings identically.
    """
    return self._numpy_fallback._resolve_binding(value, t, analysis_data)
def _get_or_create_buffers(self, w: int, h: int):
    """Get or create reusable GPU buffers for the given frame dimensions.

    Buffers are pooled per (width, height) so consecutive frames of the
    same size reuse the same allocations instead of re-creating them on
    every call. Returns the pool entry dict with keys 'staging',
    'input', 'output', 'params', 'size'.
    """
    import wgpu
    key = (w, h)
    if key in self._buffer_pool:
        return self._buffer_pool[key]
    size = w * h * 4  # one packed u32 per pixel
    # Create staging buffer for uploads (MAP_WRITE).
    # NOTE(review): _apply_effect_gpu uploads via queue.write_buffer and
    # never touches this buffer — confirm it is needed, or drop it.
    staging_buffer = self._device.create_buffer(
        size=size,
        usage=wgpu.BufferUsage.MAP_WRITE | wgpu.BufferUsage.COPY_SRC,
        mapped_at_creation=False,
    )
    # Create input buffer (STORAGE, receives data from staging)
    input_buffer = self._device.create_buffer(
        size=size,
        usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_DST,
    )
    # Create output buffer (STORAGE + COPY_SRC for readback)
    output_buffer = self._device.create_buffer(
        size=size,
        usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_SRC,
    )
    # Params buffer (uniform, 256 bytes should be enough)
    params_buffer = self._device.create_buffer(
        size=256,
        usage=wgpu.BufferUsage.UNIFORM | wgpu.BufferUsage.COPY_DST,
    )
    self._buffer_pool[key] = {
        'staging': staging_buffer,
        'input': input_buffer,
        'output': output_buffer,
        'params': params_buffer,
        'size': size,
    }
    return self._buffer_pool[key]
def _apply_effect_gpu(
    self,
    frame: np.ndarray,
    effect_name: str,
    params: Dict,
    t: float,
) -> Optional[np.ndarray]:
    """Apply a compiled effect on the GPU.

    Packs the RGB frame into a u32 storage buffer, uploads the params
    uniform, dispatches the effect's compute pipeline, and reads the
    result back. Returns None when the effect was not GPU-compiled or
    no device is available (caller falls back to the numpy path).
    """
    import wgpu
    # Find the loaded effect by its compiled name.
    effect_info = None
    for key, info in self._loaded_effects.items():
        if info.get('name') == effect_name:
            effect_info = info
            break
    if effect_info is None or self._device is None:
        return None
    compiled = effect_info['compiled']
    pipeline = effect_info['pipeline']
    h, w = frame.shape[:2]
    # Get reusable buffers from the (w, h) pool.
    buffers = self._get_or_create_buffers(w, h)
    # Pack frame as u32 array (RGB -> 0x00RRGGBB per pixel).
    r = frame[:, :, 0].astype(np.uint32)
    g = frame[:, :, 1].astype(np.uint32)
    b = frame[:, :, 2].astype(np.uint32)
    packed = (r << 16) | (g << 8) | b
    input_data = packed.flatten().astype(np.uint32)
    # Upload input data via queue.write_buffer (more efficient than recreation)
    self._device.queue.write_buffer(buffers['input'], 0, input_data.tobytes())
    # Build params struct: width/height as u32, time as f32, then the
    # effect's own params in declaration order.
    # NOTE(review): assumes the WGSL uniform struct has exactly this field
    # order with no internal padding, and native struct byte order matches
    # the GPU's expectation — confirm against the compiler's emitted struct.
    import struct
    param_values = [w, h]  # width, height as u32
    param_format = "II"  # two u32
    # Add time as f32
    param_values.append(t)
    param_format += "f"
    # Add effect-specific params
    for param in compiled.params:
        val = params.get(param.name, param.default)
        if val is None:
            val = 0
        if param.wgsl_type == 'f32':
            param_values.append(float(val))
            param_format += "f"
        elif param.wgsl_type == 'i32':
            param_values.append(int(val))
            param_format += "i"
        elif param.wgsl_type == 'u32':
            param_values.append(int(val))
            param_format += "I"
    # Pad to 16-byte alignment (WGSL uniform buffer size requirement).
    param_bytes = struct.pack(param_format, *param_values)
    while len(param_bytes) % 16 != 0:
        param_bytes += b'\x00'
    self._device.queue.write_buffer(buffers['params'], 0, param_bytes)
    # Create bind group (unfortunately this can't be easily reused with different effects)
    bind_group = self._device.create_bind_group(
        layout=pipeline.get_bind_group_layout(0),
        entries=[
            {"binding": 0, "resource": {"buffer": buffers['input']}},
            {"binding": 1, "resource": {"buffer": buffers['output']}},
            {"binding": 2, "resource": {"buffer": buffers['params']}},
        ]
    )
    # Dispatch compute
    encoder = self._device.create_command_encoder()
    compute_pass = encoder.begin_compute_pass()
    compute_pass.set_pipeline(pipeline)
    compute_pass.set_bind_group(0, bind_group)
    # Workgroups: ceil(w/16) x ceil(h/16) — matches a 16x16 workgroup size.
    wg_x = (w + 15) // 16
    wg_y = (h + 15) // 16
    compute_pass.dispatch_workgroups(wg_x, wg_y, 1)
    compute_pass.end()
    self._device.queue.submit([encoder.finish()])
    # Read back result (synchronous; blocks until the dispatch completes).
    result_data = self._device.queue.read_buffer(buffers['output'])
    result_packed = np.frombuffer(result_data, dtype=np.uint32).reshape(h, w)
    # Unpack u32 -> RGB channels.
    result = np.zeros((h, w, 3), dtype=np.uint8)
    result[:, :, 0] = ((result_packed >> 16) & 0xFF).astype(np.uint8)
    result[:, :, 1] = ((result_packed >> 8) & 0xFF).astype(np.uint8)
    result[:, :, 2] = (result_packed & 0xFF).astype(np.uint8)
    return result
def _apply_effect(
    self,
    frame: np.ndarray,
    effect_name: str,
    params: Dict,
    t: float,
    analysis_data: Dict,
) -> np.ndarray:
    """Apply a single effect to a frame.

    Resolves parameter bindings at time *t*, then tries the GPU path,
    falling back to the numpy backend when no device is available or the
    effect was not GPU-compiled.
    """
    # Resolve bindings in params (skip bookkeeping keys that are not
    # effect parameters).
    resolved_params = {"_time": t}
    for key, value in params.items():
        if key in ("effect", "effect_path", "cid", "analysis_refs"):
            continue
        resolved_params[key] = self._resolve_binding(value, t, analysis_data)
    # Try GPU first
    self._ensure_device()
    if self._device is not None:
        result = self._apply_effect_gpu(frame, effect_name, resolved_params, t)
        if result is not None:
            return result
    # Fall back to numpy.
    # NOTE(review): the fallback receives the raw `params`, so bindings are
    # resolved a second time inside the numpy backend rather than reusing
    # resolved_params — harmless but redundant; confirm intentional.
    return self._numpy_fallback._apply_effect(
        frame, effect_name, params, t, analysis_data
    )
def process_frame(
self,
@@ -286,7 +523,34 @@ class GLSLBackend(Backend):
t: float,
analysis_data: Dict,
) -> np.ndarray:
pass
"""Process frames through effects and composite."""
if not frames:
return np.zeros((720, 1280, 3), dtype=np.uint8)
processed = []
# Apply effects to each input frame
for i, (frame, effects) in enumerate(zip(frames, effects_per_frame)):
result = frame.copy()
for effect_config in effects:
effect_name = effect_config.get("effect", "")
if effect_name:
result = self._apply_effect(
result, effect_name, effect_config, t, analysis_data
)
processed.append(result)
# Composite layers (use numpy backend for now)
if len(processed) == 1:
return processed[0]
return self._numpy_fallback._composite(
processed, compositor_config, t, analysis_data
)
# Keep GLSLBackend as alias for backwards compatibility
GLSLBackend = WGPUBackend
def get_backend(name: str = "numpy", **kwargs) -> Backend:
@@ -294,7 +558,7 @@ def get_backend(name: str = "numpy", **kwargs) -> Backend:
Get a backend by name.
Args:
name: "numpy" or "glsl"
name: "numpy", "wgpu", or "glsl" (alias for wgpu)
**kwargs: Backend-specific options
Returns:
@@ -302,7 +566,7 @@ def get_backend(name: str = "numpy", **kwargs) -> Backend:
"""
if name == "numpy":
return NumpyBackend(**kwargs)
elif name == "glsl":
return GLSLBackend(**kwargs)
elif name in ("wgpu", "glsl", "gpu"):
return WGPUBackend(**kwargs)
else:
raise ValueError(f"Unknown backend: {name}")

View File

@@ -5,14 +5,99 @@ Supports:
- Display window (preview)
- File output (recording)
- Stream output (RTMP, etc.) - future
- NVENC hardware encoding (auto-detected)
- CuPy GPU arrays (auto-converted to numpy for output)
"""
import numpy as np
import subprocess
from abc import ABC, abstractmethod
from typing import Tuple, Optional
from typing import Tuple, Optional, List, Union
from pathlib import Path
# Optional CuPy support: GPU arrays are accepted anywhere a numpy frame is.
try:
    import cupy as cp
except ImportError:
    cp = None
CUPY_AVAILABLE = cp is not None


def ensure_numpy(frame: Union[np.ndarray, 'cp.ndarray']) -> np.ndarray:
    """Return *frame* as a numpy array, copying CuPy arrays off the GPU."""
    on_gpu = CUPY_AVAILABLE and isinstance(frame, cp.ndarray)
    return cp.asnumpy(frame) if on_gpu else frame
# Module-level cache so the ffmpeg probe runs at most once per process.
_nvenc_available: Optional[bool] = None


def check_nvenc_available() -> bool:
    """Return True when ffmpeg reports the h264_nvenc hardware encoder.

    The probe result is cached for the life of the process. Any failure
    (ffmpeg missing, probe timeout) is treated as "not available".
    """
    global _nvenc_available
    if _nvenc_available is None:
        try:
            probe = subprocess.run(
                ["ffmpeg", "-encoders"],
                capture_output=True,
                text=True,
                timeout=5,
            )
        except Exception:
            _nvenc_available = False
        else:
            _nvenc_available = "h264_nvenc" in probe.stdout
    return _nvenc_available
def get_encoder_params(codec: str, preset: str, crf: int) -> List[str]:
    """
    Build encoder-specific FFmpeg arguments for *codec*.

    NVENC encoders (h264_nvenc / hevc_nvenc) use ``-cq`` (constant
    quality, 0 best .. 51 worst, comparable to CRF) with VBR rate
    control, and their p1..p7 presets are mapped from the libx264
    preset names; unknown names fall back to p4. Any other codec gets
    the standard libx264-style ``-preset``/``-crf`` pair.
    """
    if codec not in ("h264_nvenc", "hevc_nvenc"):
        # Standard libx264-style parameters.
        return ["-c:v", codec, "-preset", preset, "-crf", str(crf)]
    # libx264 preset name -> NVENC p-level (p1 fastest .. p7 best quality).
    nvenc_presets = {
        "ultrafast": "p1",
        "superfast": "p2",
        "veryfast": "p3",
        "faster": "p3",
        "fast": "p4",
        "medium": "p5",
        "slow": "p6",
        "slower": "p6",
        "veryslow": "p7",
    }
    return [
        "-c:v", codec,
        "-preset", nvenc_presets.get(preset, "p4"),
        "-cq", str(crf),   # Constant quality mode
        "-rc", "vbr",      # Variable bitrate with quality target
    ]
class Output(ABC):
"""Abstract base class for output targets."""
@@ -91,6 +176,9 @@ class DisplayOutput(Output):
if not self._is_open:
return
# Convert GPU array to numpy if needed
frame = ensure_numpy(frame)
# Ensure frame is correct format
if frame.dtype != np.uint8:
frame = np.clip(frame, 0, 255).astype(np.uint8)
@@ -136,6 +224,9 @@ class DisplayOutput(Output):
class FileOutput(Output):
"""
Write frames to a video file using ffmpeg.
Automatically uses NVENC hardware encoding when available,
falling back to libx264 CPU encoding otherwise.
"""
def __init__(
@@ -143,7 +234,7 @@ class FileOutput(Output):
path: str,
size: Tuple[int, int],
fps: float = 30,
codec: str = "libx264",
codec: str = "auto", # "auto", "h264_nvenc", "libx264"
crf: int = 18,
preset: str = "fast",
audio_source: str = None,
@@ -153,6 +244,11 @@ class FileOutput(Output):
self.fps = fps
self._is_open = True
# Auto-detect NVENC
if codec == "auto":
codec = "h264_nvenc" if check_nvenc_available() else "libx264"
self.codec = codec
# Build ffmpeg command
cmd = [
"ffmpeg", "-y",
@@ -170,12 +266,9 @@ class FileOutput(Output):
# Explicitly map: video from input 0 (rawvideo), audio from input 1
cmd.extend(["-map", "0:v", "-map", "1:a"])
cmd.extend([
"-c:v", codec,
"-preset", preset,
"-crf", str(crf),
"-pix_fmt", "yuv420p",
])
# Get encoder-specific params
cmd.extend(get_encoder_params(codec, preset, crf))
cmd.extend(["-pix_fmt", "yuv420p"])
# Add audio codec if we have audio
if audio_source:
@@ -201,11 +294,20 @@ class FileOutput(Output):
self._is_open = False
return
# Convert GPU array to numpy if needed
frame = ensure_numpy(frame)
# Resize if needed
if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
import cv2
frame = cv2.resize(frame, self.size)
# Ensure correct format
if frame.dtype != np.uint8:
frame = np.clip(frame, 0, 255).astype(np.uint8)
if not frame.flags['C_CONTIGUOUS']:
frame = np.ascontiguousarray(frame)
try:
self._process.stdin.write(frame.tobytes())
except BrokenPipeError:
@@ -335,6 +437,9 @@ class PipeOutput(Output):
self._is_open = False
return
# Convert GPU array to numpy if needed
frame = ensure_numpy(frame)
# Resize if needed
if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
import cv2
@@ -371,3 +476,424 @@ class PipeOutput(Output):
if self._process and self._process.poll() is not None:
self._is_open = False
return self._is_open
class HLSOutput(Output):
    """
    Write frames as HLS stream (m3u8 playlist + .ts segments).
    This enables true live streaming where the browser can poll
    for new segments as they become available.
    Automatically uses NVENC hardware encoding when available.
    """

    def __init__(
        self,
        output_dir: str,
        size: Tuple[int, int],
        fps: float = 30,
        segment_duration: float = 4.0,  # 4s segments for stability
        codec: str = "auto",  # "auto", "h264_nvenc", "libx264"
        crf: int = 23,
        preset: str = "fast",  # Better quality than ultrafast
        audio_source: str = None,
    ):
        """Start an ffmpeg process that consumes raw RGB24 frames on stdin
        and writes stream.m3u8 plus segment_*.ts files into *output_dir*.

        Args:
            output_dir: Directory for playlist and segments (created if missing).
            size: (width, height) of the output video.
            fps: Output frame rate.
            segment_duration: Target segment length in seconds; keyframes
                are forced at segment boundaries for clean cuts.
            codec: "auto" probes for NVENC, falling back to libx264.
            crf: Quality target (CRF for libx264, CQ for NVENC).
            preset: libx264-style preset name (mapped for NVENC).
            audio_source: Optional audio input path/URL to mux in.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.size = size
        self.fps = fps
        self.segment_duration = segment_duration
        self._is_open = True
        # Auto-detect NVENC
        if codec == "auto":
            codec = "h264_nvenc" if check_nvenc_available() else "libx264"
        self.codec = codec
        # HLS playlist path
        self.playlist_path = self.output_dir / "stream.m3u8"
        # Build ffmpeg command for HLS output: raw RGB frames arrive on stdin.
        cmd = [
            "ffmpeg", "-y",
            "-f", "rawvideo",
            "-vcodec", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{size[0]}x{size[1]}",
            "-r", str(fps),
            "-i", "-",
        ]
        # Add audio input if provided
        if audio_source:
            cmd.extend(["-i", str(audio_source)])
            cmd.extend(["-map", "0:v", "-map", "1:a"])
        # Keyframe interval - must be exactly segment_duration for clean cuts
        gop_size = int(fps * segment_duration)
        # Get encoder-specific params
        cmd.extend(get_encoder_params(codec, preset, crf))
        cmd.extend([
            "-pix_fmt", "yuv420p",
            # Force keyframes at exact intervals for clean segment boundaries
            "-g", str(gop_size),
            "-keyint_min", str(gop_size),
            "-sc_threshold", "0",  # Disable scene change detection
            "-force_key_frames", f"expr:gte(t,n_forced*{segment_duration})",
            # Reduce buffering for faster segment availability
            "-flush_packets", "1",
        ])
        # Add audio codec if we have audio
        if audio_source:
            cmd.extend(["-c:a", "aac", "-b:a", "128k"])
        # HLS specific options for smooth live streaming
        cmd.extend([
            "-f", "hls",
            "-hls_time", str(segment_duration),
            "-hls_list_size", "0",  # Keep all segments in playlist
            "-hls_flags", "independent_segments+append_list+split_by_time",
            "-hls_segment_type", "mpegts",
            "-hls_segment_filename", str(self.output_dir / "segment_%05d.ts"),
            str(self.playlist_path),
        ])
        import sys
        print(f"HLSOutput cmd: {' '.join(cmd)}", file=sys.stderr)
        self._process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stderr=None,  # Show errors for debugging
        )
        # Track segments for status reporting
        self.segments_written = 0
        self._last_segment_check = 0

    def write(self, frame: np.ndarray, t: float):
        """Write frame to HLS stream.

        Becomes a silent no-op (and marks the output closed) once the
        ffmpeg process has exited or its stdin pipe has broken.
        """
        if not self._is_open or self._process.poll() is not None:
            self._is_open = False
            return
        # Convert GPU array to numpy if needed
        frame = ensure_numpy(frame)
        # Resize if needed
        if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
            import cv2
            frame = cv2.resize(frame, self.size)
        # Ensure correct format (uint8, contiguous) for the raw pipe.
        if frame.dtype != np.uint8:
            frame = np.clip(frame, 0, 255).astype(np.uint8)
        if not frame.flags['C_CONTIGUOUS']:
            frame = np.ascontiguousarray(frame)
        try:
            self._process.stdin.write(frame.tobytes())
        except BrokenPipeError:
            self._is_open = False
        # Periodically count segments (once per second of stream time,
        # for status reporting only).
        if t - self._last_segment_check > 1.0:
            self._last_segment_check = t
            self.segments_written = len(list(self.output_dir.glob("segment_*.ts")))

    def close(self):
        """Close the HLS stream: flush ffmpeg, count segments, end playlist."""
        if self._process:
            self._process.stdin.close()
            self._process.wait()
        self._is_open = False
        # Final segment count
        self.segments_written = len(list(self.output_dir.glob("segment_*.ts")))
        # Mark playlist as ended (VOD mode)
        # NOTE(review): ffmpeg's hls muxer usually appends #EXT-X-ENDLIST
        # itself on clean shutdown — confirm this does not duplicate the tag.
        if self.playlist_path.exists():
            with open(self.playlist_path, "a") as f:
                f.write("#EXT-X-ENDLIST\n")

    @property
    def is_open(self) -> bool:
        # Open only while our flag is set AND ffmpeg is still running.
        return self._is_open and self._process.poll() is None
class IPFSHLSOutput(Output):
    """
    Write frames as HLS stream with segments uploaded to IPFS.
    Each segment is uploaded to IPFS as it's created, enabling distributed
    streaming where clients can fetch segments from any IPFS gateway.
    The m3u8 playlist is continuously updated with IPFS URLs and can be
    fetched via get_playlist() or the playlist_cid property.
    """

    def __init__(
        self,
        output_dir: str,
        size: Tuple[int, int],
        fps: float = 30,
        segment_duration: float = 4.0,
        codec: str = "auto",
        crf: int = 23,
        preset: str = "fast",
        audio_source: str = None,
        ipfs_gateway: str = "https://ipfs.io/ipfs",
    ):
        """Start ffmpeg producing local HLS output and prepare IPFS upload state.

        Args:
            output_dir: Local directory for the intermediate playlist/segments.
            size: (width, height) of the output video.
            fps: Output frame rate.
            segment_duration: Target segment length in seconds.
            codec: "auto" probes for NVENC, falling back to libx264.
            crf: Quality target (CRF for libx264, CQ for NVENC).
            preset: libx264-style preset name (mapped for NVENC).
            audio_source: Optional audio input path/URL to mux in.
            ipfs_gateway: Base gateway URL used when writing segment URLs
                into the generated playlists (trailing slash stripped).
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.size = size
        self.fps = fps
        self.segment_duration = segment_duration
        self.ipfs_gateway = ipfs_gateway.rstrip("/")
        self._is_open = True
        # Auto-detect NVENC
        if codec == "auto":
            codec = "h264_nvenc" if check_nvenc_available() else "libx264"
        self.codec = codec
        # Track segment CIDs
        self.segment_cids: dict = {}  # segment_number -> cid
        # Index of the last whole segment interval for which we scanned
        # the output dir; -1 means "never scanned".
        self._last_segment_checked = -1
        self._playlist_cid: Optional[str] = None
        # Import IPFS client (deferred so this module loads without it
        # unless IPFS output is actually used).
        from ipfs_client import add_file, add_bytes
        self._ipfs_add_file = add_file
        self._ipfs_add_bytes = add_bytes
        # Local HLS paths
        self.local_playlist_path = self.output_dir / "stream.m3u8"
        # Build ffmpeg command for HLS output: raw RGB frames on stdin.
        cmd = [
            "ffmpeg", "-y",
            "-f", "rawvideo",
            "-vcodec", "rawvideo",
            "-pix_fmt", "rgb24",
            "-s", f"{size[0]}x{size[1]}",
            "-r", str(fps),
            "-i", "-",
        ]
        # Add audio input if provided
        if audio_source:
            cmd.extend(["-i", str(audio_source)])
            cmd.extend(["-map", "0:v", "-map", "1:a"])
        # Keyframe interval forced to one segment for clean boundaries.
        gop_size = int(fps * segment_duration)
        # Get encoder-specific params
        cmd.extend(get_encoder_params(codec, preset, crf))
        cmd.extend([
            "-pix_fmt", "yuv420p",
            "-g", str(gop_size),
            "-keyint_min", str(gop_size),
            "-sc_threshold", "0",
            "-force_key_frames", f"expr:gte(t,n_forced*{segment_duration})",
            "-flush_packets", "1",
        ])
        # Add audio codec if we have audio
        if audio_source:
            cmd.extend(["-c:a", "aac", "-b:a", "128k"])
        # HLS options
        cmd.extend([
            "-f", "hls",
            "-hls_time", str(segment_duration),
            "-hls_list_size", "0",
            "-hls_flags", "independent_segments+append_list+split_by_time",
            "-hls_segment_type", "mpegts",
            "-hls_segment_filename", str(self.output_dir / "segment_%05d.ts"),
            str(self.local_playlist_path),
        ])
        import sys
        print(f"IPFSHLSOutput: starting ffmpeg", file=sys.stderr)
        self._process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stderr=None,
        )

    def _upload_new_segments(self):
        """Check for new segments and upload them to IPFS.

        Skips segments already uploaded and segments that still appear to
        be mid-write (size 0, or size changed across a 0.1s re-stat).
        """
        import sys
        # Find all segments
        segments = sorted(self.output_dir.glob("segment_*.ts"))
        for seg_path in segments:
            # Extract segment number from filename
            seg_name = seg_path.stem  # segment_00000
            seg_num = int(seg_name.split("_")[1])
            # Skip if already uploaded
            if seg_num in self.segment_cids:
                continue
            # Skip if segment is still being written (check if file size is stable)
            try:
                size1 = seg_path.stat().st_size
                if size1 == 0:
                    continue  # Empty file, still being created
                import time
                time.sleep(0.1)
                size2 = seg_path.stat().st_size
                if size1 != size2:
                    continue  # File still being written
            except FileNotFoundError:
                continue
            # Upload to IPFS
            cid = self._ipfs_add_file(seg_path, pin=True)
            if cid:
                self.segment_cids[seg_num] = cid
                print(f"IPFS: segment_{seg_num:05d}.ts -> {cid}", file=sys.stderr)
                # Update playlist after each segment upload
                self._update_ipfs_playlist()

    def _update_ipfs_playlist(self):
        """Generate and upload an IPFS-aware live m3u8 playlist.

        No-op until at least one segment has been uploaded. On success
        updates ``self._playlist_cid``.
        """
        if not self.segment_cids:
            return
        import sys
        # Build m3u8 content with IPFS URLs
        lines = [
            "#EXTM3U",
            "#EXT-X-VERSION:3",
            f"#EXT-X-TARGETDURATION:{int(self.segment_duration) + 1}",
            "#EXT-X-MEDIA-SEQUENCE:0",
        ]
        # Add segments in order
        for seg_num in sorted(self.segment_cids.keys()):
            cid = self.segment_cids[seg_num]
            lines.append(f"#EXTINF:{self.segment_duration:.3f},")
            lines.append(f"{self.ipfs_gateway}/{cid}")
        playlist_content = "\n".join(lines) + "\n"
        # Upload playlist to IPFS
        cid = self._ipfs_add_bytes(playlist_content.encode("utf-8"), pin=True)
        if cid:
            self._playlist_cid = cid
            print(f"IPFS: playlist updated -> {cid} ({len(self.segment_cids)} segments)", file=sys.stderr)

    def write(self, frame: np.ndarray, t: float):
        """Write frame to HLS stream and upload completed segments to IPFS."""
        if not self._is_open or self._process.poll() is not None:
            self._is_open = False
            return
        # Convert GPU array to numpy if needed
        frame = ensure_numpy(frame)
        # Resize if needed
        if frame.shape[1] != self.size[0] or frame.shape[0] != self.size[1]:
            import cv2
            frame = cv2.resize(frame, self.size)
        # Ensure correct format (uint8, contiguous) for the raw pipe.
        if frame.dtype != np.uint8:
            frame = np.clip(frame, 0, 255).astype(np.uint8)
        if not frame.flags['C_CONTIGUOUS']:
            frame = np.ascontiguousarray(frame)
        try:
            self._process.stdin.write(frame.tobytes())
        except BrokenPipeError:
            self._is_open = False
            return
        # Check for new segments once per segment interval of stream time.
        current_segment = int(t / self.segment_duration)
        if current_segment > self._last_segment_checked:
            self._last_segment_checked = current_segment
            self._upload_new_segments()

    def close(self):
        """Close the HLS stream and finalize IPFS uploads.

        Flushes ffmpeg, uploads any remaining segments, then publishes a
        final VOD playlist (with #EXT-X-ENDLIST) to IPFS.
        """
        import sys
        if self._process:
            self._process.stdin.close()
            self._process.wait()
        self._is_open = False
        # Upload any remaining segments
        self._upload_new_segments()
        # Generate final playlist with #EXT-X-ENDLIST
        if self.segment_cids:
            lines = [
                "#EXTM3U",
                "#EXT-X-VERSION:3",
                f"#EXT-X-TARGETDURATION:{int(self.segment_duration) + 1}",
                "#EXT-X-MEDIA-SEQUENCE:0",
                "#EXT-X-PLAYLIST-TYPE:VOD",
            ]
            for seg_num in sorted(self.segment_cids.keys()):
                cid = self.segment_cids[seg_num]
                lines.append(f"#EXTINF:{self.segment_duration:.3f},")
                lines.append(f"{self.ipfs_gateway}/{cid}")
            lines.append("#EXT-X-ENDLIST")
            playlist_content = "\n".join(lines) + "\n"
            cid = self._ipfs_add_bytes(playlist_content.encode("utf-8"), pin=True)
            if cid:
                self._playlist_cid = cid
                print(f"IPFS: final playlist -> {cid} ({len(self.segment_cids)} segments)", file=sys.stderr)

    @property
    def playlist_cid(self) -> Optional[str]:
        """Get the current playlist CID (None until a playlist is uploaded)."""
        return self._playlist_cid

    @property
    def playlist_url(self) -> Optional[str]:
        """Get the full IPFS gateway URL for the playlist."""
        if self._playlist_cid:
            return f"{self.ipfs_gateway}/{self._playlist_cid}"
        return None

    def get_playlist(self) -> str:
        """Get the current m3u8 playlist content with IPFS URLs.

        While the stream is open the playlist is live (no ENDLIST); once
        closed it includes #EXT-X-ENDLIST.
        """
        if not self.segment_cids:
            return "#EXTM3U\n"
        lines = [
            "#EXTM3U",
            "#EXT-X-VERSION:3",
            f"#EXT-X-TARGETDURATION:{int(self.segment_duration) + 1}",
            "#EXT-X-MEDIA-SEQUENCE:0",
        ]
        for seg_num in sorted(self.segment_cids.keys()):
            cid = self.segment_cids[seg_num]
            lines.append(f"#EXTINF:{self.segment_duration:.3f},")
            lines.append(f"{self.ipfs_gateway}/{cid}")
        if not self._is_open:
            lines.append("#EXT-X-ENDLIST")
        return "\n".join(lines) + "\n"

    @property
    def is_open(self) -> bool:
        # Open only while our flag is set AND ffmpeg is still running.
        return self._is_open and self._process.poll() is None

View File

@@ -159,36 +159,51 @@ class StreamInterpreter:
return config
def _load_primitives(self, lib_name: str):
"""Load primitives from a Python library file."""
"""Load primitives from a Python library file.
Prefers GPU-accelerated versions (*_gpu.py) when available.
"""
import importlib.util
lib_paths = [
self.primitive_lib_dir / f"{lib_name}.py",
self.sexp_dir / "primitive_libs" / f"{lib_name}.py",
self.sexp_dir.parent / "sexp_effects" / "primitive_libs" / f"{lib_name}.py",
]
# Try GPU version first, then fall back to CPU version
lib_names_to_try = [f"{lib_name}_gpu", lib_name]
lib_path = None
actual_lib_name = lib_name
for try_lib in lib_names_to_try:
lib_paths = [
self.primitive_lib_dir / f"{try_lib}.py",
self.sexp_dir / "primitive_libs" / f"{try_lib}.py",
self.sexp_dir.parent / "sexp_effects" / "primitive_libs" / f"{try_lib}.py",
]
for p in lib_paths:
if p.exists():
lib_path = p
actual_lib_name = try_lib
break
if lib_path:
break
if not lib_path:
print(f"Warning: primitive library '{lib_name}' not found", file=sys.stderr)
return
spec = importlib.util.spec_from_file_location(lib_name, lib_path)
spec = importlib.util.spec_from_file_location(actual_lib_name, lib_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
# Check if this is a GPU-accelerated module
is_gpu = actual_lib_name.endswith('_gpu')
gpu_tag = " [GPU]" if is_gpu else ""
count = 0
for name in dir(module):
if name.startswith('prim_'):
func = getattr(module, name)
prim_name = name[5:]
dash_name = prim_name.replace('_', '-')
# Register ONLY with namespace (geometry:ripple-displace)
# Register with original lib_name namespace (geometry:rotate, not geometry_gpu:rotate)
# Don't overwrite if already registered (allows pre-registration of overrides)
key = f"{lib_name}:{dash_name}"
if key not in self.primitives:
@@ -199,7 +214,7 @@ class StreamInterpreter:
prims = getattr(module, 'PRIMITIVES')
if isinstance(prims, dict):
for name, func in prims.items():
# Register ONLY with namespace
# Register with original lib_name namespace
# Don't overwrite if already registered
dash_name = name.replace('_', '-')
key = f"{lib_name}:{dash_name}"
@@ -207,7 +222,7 @@ class StreamInterpreter:
self.primitives[key] = func
count += 1
print(f"Loaded primitives: {lib_name} ({count} functions)", file=sys.stderr)
print(f"Loaded primitives: {lib_name} ({count} functions){gpu_tag}", file=sys.stderr)
def _load_effect(self, effect_path: Path):
"""Load and register an effect from a .sexp file."""
@@ -807,8 +822,11 @@ class StreamInterpreter:
self._record_error(f"Primitive {op} error: {e}")
raise RuntimeError(f"Primitive {op} failed: {e}")
# Unknown - return as-is
return expr
# Unknown function call - raise meaningful error
raise RuntimeError(f"Unknown function or primitive: '{op}'. "
f"Available primitives: {sorted(list(self.primitives.keys())[:10])}... "
f"Available effects: {sorted(list(self.effects.keys())[:10])}... "
f"Available macros: {sorted(list(self.macros.keys())[:10])}...")
def _step_scans(self, ctx: Context, env: dict):
"""Step scans based on trigger evaluation."""
@@ -833,9 +851,9 @@ class StreamInterpreter:
"""Run the streaming pipeline."""
# Import output classes - handle both package and direct execution
try:
from .output import PipeOutput, DisplayOutput, FileOutput
from .output import PipeOutput, DisplayOutput, FileOutput, HLSOutput, IPFSHLSOutput
except ImportError:
from output import PipeOutput, DisplayOutput, FileOutput
from output import PipeOutput, DisplayOutput, FileOutput, HLSOutput, IPFSHLSOutput
self._init()
@@ -871,6 +889,16 @@ class StreamInterpreter:
out = PipeOutput(size=(w, h), fps=fps, audio_source=audio)
elif output == "preview":
out = DisplayOutput(size=(w, h), fps=fps, audio_source=audio)
elif output.endswith("/hls"):
# HLS output - output is a directory path ending in /hls
hls_dir = output[:-4] # Remove /hls suffix
out = HLSOutput(hls_dir, size=(w, h), fps=fps, audio_source=audio)
elif output.endswith("/ipfs-hls"):
# IPFS HLS output - segments uploaded to IPFS as they're created
hls_dir = output[:-9] # Remove /ipfs-hls suffix
import os
ipfs_gateway = os.environ.get("IPFS_GATEWAY_URL", "https://ipfs.io/ipfs")
out = IPFSHLSOutput(hls_dir, size=(w, h), fps=fps, audio_source=audio, ipfs_gateway=ipfs_gateway)
else:
out = FileOutput(output, size=(w, h), fps=fps, audio_source=audio)
@@ -916,6 +944,8 @@ class StreamInterpreter:
finally:
out.close()
# Store output for access to properties like playlist_cid
self.output = out
print("\nDone", file=sys.stderr)

View File

@@ -69,6 +69,16 @@ def upload_to_ipfs(self, local_cid: str, actor_id: str) -> Optional[str]:
database.update_cache_item_ipfs_cid(local_cid, ipfs_cid)
)
# Update friendly_names table to use IPFS CID instead of local hash
# This ensures assets can be fetched by remote workers via IPFS
try:
loop.run_until_complete(
database.update_friendly_name_cid(actor_id, local_cid, ipfs_cid)
)
logger.info(f"Friendly name updated: {local_cid[:16]}... -> {ipfs_cid[:16]}...")
except Exception as e:
logger.warning(f"Failed to update friendly name CID: {e}")
# Create index from IPFS CID to local cache
cache_mgr._set_content_index(ipfs_cid, local_cid)

View File

@@ -83,6 +83,28 @@ def resolve_asset(ref: str, actor_id: Optional[str] = None) -> Optional[Path]:
print(f"RESOLVE_ASSET: SUCCESS - resolved to {path}", file=sys.stderr)
logger.info(f"Resolved '{ref}' via friendly name to {path}")
return path
# File not in local cache - try fetching from IPFS
# The CID from friendly_names is an IPFS CID
print(f"RESOLVE_ASSET: file not local, trying IPFS fetch for {cid}", file=sys.stderr)
import ipfs_client
content = ipfs_client.get_bytes(cid, use_gateway_fallback=True)
if content:
# Save to local cache
import tempfile
from pathlib import Path
with tempfile.NamedTemporaryFile(delete=False, suffix='.sexp') as tmp:
tmp.write(content)
tmp_path = Path(tmp.name)
# Store in cache
cached_file, _ = cache_mgr.put(tmp_path, node_type="effect", skip_ipfs=True)
# Index by IPFS CID for future lookups
cache_mgr._set_content_index(cid, cached_file.cid)
print(f"RESOLVE_ASSET: fetched from IPFS and cached at {cached_file.path}", file=sys.stderr)
logger.info(f"Fetched '{ref}' from IPFS and cached at {cached_file.path}")
return cached_file.path
else:
print(f"RESOLVE_ASSET: IPFS fetch failed for {cid}", file=sys.stderr)
except Exception as e:
print(f"RESOLVE_ASSET: ERROR - {e}", file=sys.stderr)
logger.warning(f"Failed to resolve friendly name '{ref}': {e}")
@@ -260,7 +282,8 @@ def run_stream(
cache_dir = Path(os.environ.get("CACHE_DIR", "/data/cache"))
stream_dir = cache_dir / "streaming" / run_id
stream_dir.mkdir(parents=True, exist_ok=True)
output_path = stream_dir / "output.mp4" # Always mp4 for streaming
# Use IPFS HLS output for distributed streaming - segments uploaded to IPFS
output_path = str(stream_dir) + "/ipfs-hls" # /ipfs-hls suffix triggers IPFS HLS mode
# Create symlinks to effect directories so relative paths work
(work_dir / "sexp_effects").symlink_to(sexp_effects_dir)
@@ -320,15 +343,50 @@ def run_stream(
self.update_state(state='CACHING', meta={'progress': 90})
# Validate output file (must be > 1KB to have actual frames)
if output_path.exists() and output_path.stat().st_size < 1024:
raise RuntimeError(f"Output file is too small ({output_path.stat().st_size} bytes) - rendering likely failed")
# Get IPFS playlist CID if available (from IPFSHLSOutput)
ipfs_playlist_cid = None
ipfs_playlist_url = None
segment_cids = {}
if hasattr(interp, 'output') and hasattr(interp.output, 'playlist_cid'):
ipfs_playlist_cid = interp.output.playlist_cid
ipfs_playlist_url = interp.output.playlist_url
segment_cids = getattr(interp.output, 'segment_cids', {})
logger.info(f"IPFS HLS: playlist={ipfs_playlist_cid}, segments={len(segment_cids)}")
# HLS output creates stream.m3u8 and segment_*.ts files in stream_dir
hls_playlist = stream_dir / "stream.m3u8"
# Validate HLS output (must have playlist and at least one segment)
if not hls_playlist.exists():
raise RuntimeError("HLS playlist not created - rendering likely failed")
segments = list(stream_dir.glob("segment_*.ts"))
if not segments:
raise RuntimeError("No HLS segments created - rendering likely failed")
logger.info(f"HLS rendering complete: {len(segments)} segments created, IPFS playlist: {ipfs_playlist_cid}")
# Mux HLS segments into a single MP4 for persistent cache storage
final_mp4 = stream_dir / "output.mp4"
import subprocess
mux_cmd = [
"ffmpeg", "-y",
"-i", str(hls_playlist),
"-c", "copy", # Just copy streams, no re-encoding
str(final_mp4)
]
logger.info(f"Muxing HLS to MP4: {' '.join(mux_cmd)}")
result = subprocess.run(mux_cmd, capture_output=True, text=True)
if result.returncode != 0:
logger.warning(f"HLS mux failed: {result.stderr}")
# Fall back to using the first segment for caching
final_mp4 = segments[0]
# Store output in cache
if output_path.exists():
if final_mp4.exists():
cache_mgr = get_cache_manager()
cached_file, ipfs_cid = cache_mgr.put(
source_path=output_path,
source_path=final_mp4,
node_type="STREAM_OUTPUT",
node_id=f"stream_{task_id}",
)
@@ -365,6 +423,15 @@ def run_stream(
ipfs_cid=ipfs_cid,
actor_id=actor_id,
))
# Register output as video type so frontend displays it correctly
_resolve_loop.run_until_complete(database.add_item_type(
cid=cached_file.cid,
actor_id=actor_id,
item_type="video",
path=str(cached_file.path),
description=f"Stream output from run {run_id}",
))
logger.info(f"Registered output {cached_file.cid} as video type")
# Update pending run status
_resolve_loop.run_until_complete(database.update_pending_run_status(
run_id=run_id,
@@ -381,6 +448,10 @@ def run_stream(
"output_cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
"output_path": str(cached_file.path),
# IPFS HLS streaming info
"ipfs_playlist_cid": ipfs_playlist_cid,
"ipfs_playlist_url": ipfs_playlist_url,
"ipfs_segment_count": len(segment_cids),
}
else:
# Update pending run status to failed - reuse module loop