Add CI/CD workflow
Some checks failed
Build and Deploy / build-and-deploy (push) Failing after 1m2s
GPU Worker CI/CD / test (push) Failing after 1m4s
GPU Worker CI/CD / deploy (push) Has been skipped

This commit is contained in:
giles
2026-02-06 10:44:13 +00:00
parent 7411aa74c4
commit 48018d09b7
11 changed files with 934 additions and 20 deletions

91
.gitea/workflows/ci.yml Normal file
View File

@@ -0,0 +1,91 @@
name: Build and Deploy

on:
  push:
    branches: [main]

env:
  REGISTRY: registry.rose-ash.com:5000
  IMAGE_CPU: celery-l1-server
  IMAGE_GPU: celery-l1-gpu-server

jobs:
  build-and-deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Install the deploy key and trust the target host's key so the
      # following ssh/rsync steps are non-interactive.
      - name: Set up SSH
        env:
          SSH_KEY: ${{ secrets.DEPLOY_SSH_KEY }}
          DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }}
        run: |
          mkdir -p ~/.ssh
          echo "$SSH_KEY" > ~/.ssh/id_rsa
          chmod 600 ~/.ssh/id_rsa
          ssh-keyscan -H "$DEPLOY_HOST" >> ~/.ssh/known_hosts 2>/dev/null || true

      - name: Sync code to server
        env:
          DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }}
        run: |
          rsync -avz --delete \
            --exclude '.git' \
            --exclude '__pycache__' \
            --exclude '*.pyc' \
            --exclude '.pytest_cache' \
            --exclude 'venv' \
            ./ "root@$DEPLOY_HOST:/root/art-dag/celery/"

      # Images are built on the deploy host itself (it has the code synced
      # above and direct access to the private registry).
      - name: Build and push CPU image
        env:
          DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }}
        run: |
          ssh "root@$DEPLOY_HOST" "
            cd /root/art-dag/celery
            docker build --build-arg CACHEBUST=\$(date +%s) -t ${{ env.REGISTRY }}/${{ env.IMAGE_CPU }}:latest -t ${{ env.REGISTRY }}/${{ env.IMAGE_CPU }}:${{ github.sha }} .
            docker push ${{ env.REGISTRY }}/${{ env.IMAGE_CPU }}:latest
            docker push ${{ env.REGISTRY }}/${{ env.IMAGE_CPU }}:${{ github.sha }}
          "

      - name: Build and push GPU image
        env:
          DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }}
        run: |
          ssh "root@$DEPLOY_HOST" "
            cd /root/art-dag/celery
            docker build --build-arg CACHEBUST=\$(date +%s) -f Dockerfile.gpu -t ${{ env.REGISTRY }}/${{ env.IMAGE_GPU }}:latest -t ${{ env.REGISTRY }}/${{ env.IMAGE_GPU }}:${{ github.sha }} .
            docker push ${{ env.REGISTRY }}/${{ env.IMAGE_GPU }}:latest
            docker push ${{ env.REGISTRY }}/${{ env.IMAGE_GPU }}:${{ github.sha }}
          "

      - name: Deploy stack
        env:
          DEPLOY_HOST: ${{ secrets.DEPLOY_HOST }}
        run: |
          ssh "root@$DEPLOY_HOST" "
            cd /root/art-dag/celery
            docker stack deploy -c docker-compose.yml celery
            echo 'Waiting for services to update...'
            sleep 10
            docker stack services celery
          "

      - name: Deploy GPU worker
        # FIX: gate on the secrets context, not env. A step's own `env`
        # mapping is not reliably available inside that same step's `if`
        # expression (notably under Gitea act_runner), so the previous
        # `env.GPU_HOST != ''` condition could evaluate wrong and skip/fail
        # the step even when GPU_HOST was configured.
        if: ${{ secrets.GPU_HOST != '' }}
        env:
          GPU_HOST: ${{ secrets.GPU_HOST }}
          SSH_KEY: ${{ secrets.GPU_SSH_KEY }}
        run: |
          # Set up GPU SSH if different host; make the step self-contained
          # in case the earlier SSH setup step is ever reordered/removed.
          mkdir -p ~/.ssh
          if [ -n "$SSH_KEY" ]; then
            echo "$SSH_KEY" > ~/.ssh/gpu_key
            chmod 600 ~/.ssh/gpu_key
            ssh-keyscan -H "${GPU_HOST#*@}" >> ~/.ssh/known_hosts 2>/dev/null || true
            ssh -i ~/.ssh/gpu_key "$GPU_HOST" "
              docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_GPU }}:latest
              docker stack deploy -c /root/art-dag/celery/docker-compose.yml celery || \
                docker service update --image ${{ env.REGISTRY }}/${{ env.IMAGE_GPU }}:latest celery_l1-gpu-worker
            "
          else
            echo 'GPU_SSH_KEY not set; skipping GPU worker deploy'
          fi

View File

@@ -1071,6 +1071,159 @@ async def purge_failed_runs(
return {"purged": len(deleted), "run_ids": deleted}
@router.post("/{run_id}/pause")
async def pause_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Pause a running render. Waits for current segment to complete.

    The render checkpoints at the next segment boundary and stops; it can
    later be continued via the resume endpoint.
    """
    import database
    from celery_app import app as celery_app

    await database.init_db()
    pending = await database.get_pending_run(run_id)
    if pending is None:
        raise HTTPException(404, "Run not found")
    if pending['status'] != 'running':
        raise HTTPException(
            400,
            f"Can only pause running renders (current status: {pending['status']})",
        )

    # Soft-terminate the Celery task: SIGTERM lets the worker finish the
    # current segment and persist its checkpoint before exiting.
    task_id = pending.get('celery_task_id')
    if task_id:
        celery_app.control.revoke(task_id, terminate=True, signal='SIGTERM')
        logger.info(f"Sent SIGTERM to task {task_id} for run {run_id}")

    # Mark the run paused so the UI offers resume/restart actions.
    await database.update_pending_run_status(run_id, 'paused')
    return {
        "run_id": run_id,
        "status": "paused",
        "checkpoint_frame": pending.get('checkpoint_frame'),
    }
@router.post("/{run_id}/resume")
async def resume_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Resume a paused or failed run from its last checkpoint.

    The render will continue from the checkpoint frame.
    """
    import database
    from tasks.streaming import run_stream

    await database.init_db()
    pending = await database.get_pending_run(run_id)
    if pending is None:
        raise HTTPException(404, "Run not found")

    # Guard clauses: only failed/paused runs with a usable checkpoint qualify.
    status = pending['status']
    if status not in ('failed', 'paused'):
        raise HTTPException(400, f"Can only resume failed/paused runs (current status: {status})")
    if not pending.get('checkpoint_frame'):
        raise HTTPException(400, "No checkpoint available - use restart instead")
    if not pending.get('resumable', True):
        raise HTTPException(400, "Run checkpoint is corrupted - use restart instead")

    # Re-submit the streaming task on the GPU queue with resume enabled.
    task = run_stream.apply_async(
        kwargs={
            'run_id': run_id,
            'recipe_sexp': pending.get('dag_json', ''),  # Recipe is stored in dag_json
            'output_name': pending.get('output_name', 'output.mp4'),
            'actor_id': pending.get('actor_id'),
            'resume': True,
        },
        queue='gpu',
    )

    await database.update_pending_run_status(run_id, 'running')
    # create_pending_run is not called on resume, so record the new Celery
    # task id directly on the existing row.
    async with database.pool.acquire() as conn:
        await conn.execute(
            "UPDATE pending_runs SET celery_task_id = $2, updated_at = NOW() WHERE run_id = $1",
            run_id, task.id
        )

    logger.info(f"Resumed run {run_id} from frame {pending.get('checkpoint_frame')} with task {task.id}")
    return {
        "run_id": run_id,
        "status": "running",
        "celery_task_id": task.id,
        "resumed_from_frame": pending.get('checkpoint_frame'),
    }
@router.post("/{run_id}/restart")
async def restart_run(
    run_id: str,
    request: Request,
    ctx: UserContext = Depends(require_auth),
):
    """Restart a failed/paused run from the beginning (discard checkpoint).

    All progress will be lost. Use resume instead to continue from checkpoint.
    """
    import database
    from tasks.streaming import run_stream

    await database.init_db()
    pending = await database.get_pending_run(run_id)
    if pending is None:
        raise HTTPException(404, "Run not found")
    status = pending['status']
    if status not in ('failed', 'paused'):
        raise HTTPException(400, f"Can only restart failed/paused runs (current status: {status})")

    # Drop any saved checkpoint so the render starts again from frame zero.
    await database.clear_run_checkpoint(run_id)

    # Fresh task submission on the GPU queue, explicitly without resume.
    task = run_stream.apply_async(
        kwargs={
            'run_id': run_id,
            'recipe_sexp': pending.get('dag_json', ''),  # Recipe is stored in dag_json
            'output_name': pending.get('output_name', 'output.mp4'),
            'actor_id': pending.get('actor_id'),
            'resume': False,
        },
        queue='gpu',
    )

    await database.update_pending_run_status(run_id, 'running')
    # Record the new Celery task id (create_pending_run is not called here).
    async with database.pool.acquire() as conn:
        await conn.execute(
            "UPDATE pending_runs SET celery_task_id = $2, updated_at = NOW() WHERE run_id = $1",
            run_id, task.id
        )

    logger.info(f"Restarted run {run_id} from beginning with task {task.id}")
    return {
        "run_id": run_id,
        "status": "running",
        "celery_task_id": task.id,
    }
@router.get("/{run_id}/stream")
async def stream_run_output(
    run_id: str,

View File

@@ -150,6 +150,7 @@ class RunService:
"inputs": self._ensure_inputs_list(cached.get("inputs")), "inputs": self._ensure_inputs_list(cached.get("inputs")),
"output_cid": output_cid, "output_cid": output_cid,
"ipfs_cid": cached.get("ipfs_cid"), "ipfs_cid": cached.get("ipfs_cid"),
"ipfs_playlist_cid": cached.get("ipfs_playlist_cid") or (pending.get("ipfs_playlist_cid") if pending else None),
"provenance_cid": cached.get("provenance_cid"), "provenance_cid": cached.get("provenance_cid"),
"plan_cid": cached.get("plan_cid"), "plan_cid": cached.get("plan_cid"),
"actor_id": cached.get("actor_id"), "actor_id": cached.get("actor_id"),
@@ -174,6 +175,7 @@ class RunService:
status_map = { status_map = {
"pending": "pending", "pending": "pending",
"started": "running", "started": "running",
"rendering": "running", # Custom status from streaming task
"success": "completed", "success": "completed",
"failure": "failed", "failure": "failed",
"retry": "running", "retry": "running",
@@ -192,6 +194,14 @@ class RunService:
"created_at": pending.get("created_at"), "created_at": pending.get("created_at"),
"error": pending.get("error"), "error": pending.get("error"),
"recipe_sexp": pending.get("dag_json"), # Recipe content for streaming runs "recipe_sexp": pending.get("dag_json"), # Recipe content for streaming runs
# Checkpoint fields for resumable renders
"checkpoint_frame": pending.get("checkpoint_frame"),
"checkpoint_t": pending.get("checkpoint_t"),
"total_frames": pending.get("total_frames"),
"resumable": pending.get("resumable", True),
# IPFS streaming info
"ipfs_playlist_cid": pending.get("ipfs_playlist_cid"),
"quality_playlists": pending.get("quality_playlists"),
} }
# If task completed, get result # If task completed, get result
@@ -227,6 +237,14 @@ class RunService:
"created_at": pending.get("created_at"), "created_at": pending.get("created_at"),
"error": pending.get("error"), "error": pending.get("error"),
"recipe_sexp": pending.get("dag_json"), # Recipe content for streaming runs "recipe_sexp": pending.get("dag_json"), # Recipe content for streaming runs
# Checkpoint fields for resumable renders
"checkpoint_frame": pending.get("checkpoint_frame"),
"checkpoint_t": pending.get("checkpoint_t"),
"total_frames": pending.get("total_frames"),
"resumable": pending.get("resumable", True),
# IPFS streaming info
"ipfs_playlist_cid": pending.get("ipfs_playlist_cid"),
"quality_playlists": pending.get("quality_playlists"),
} }
# Fallback: Check Redis for backwards compatibility # Fallback: Check Redis for backwards compatibility
@@ -272,6 +290,7 @@ class RunService:
status_map = { status_map = {
"pending": "pending", "pending": "pending",
"started": "running", "started": "running",
"rendering": "running", # Custom status from streaming task
"success": "completed", "success": "completed",
"failure": "failed", "failure": "failed",
"retry": "running", "retry": "running",

View File

@@ -11,7 +11,7 @@
{% endblock %} {% endblock %}
{% block content %} {% block content %}
{% set status_colors = {'completed': 'green', 'running': 'blue', 'pending': 'yellow', 'failed': 'red'} %} {% set status_colors = {'completed': 'green', 'running': 'blue', 'pending': 'yellow', 'failed': 'red', 'paused': 'yellow'} %}
{% set color = status_colors.get(run.status, 'gray') %} {% set color = status_colors.get(run.status, 'gray') %}
<div class="max-w-6xl mx-auto"> <div class="max-w-6xl mx-auto">
@@ -28,7 +28,42 @@
{% if run.error %} {% if run.error %}
<span class="text-red-400 text-sm ml-2">{{ run.error }}</span> <span class="text-red-400 text-sm ml-2">{{ run.error }}</span>
{% endif %} {% endif %}
{% if run.checkpoint_frame %}
<span class="text-gray-400 text-sm ml-2">
Checkpoint: {{ run.checkpoint_frame }}{% if run.total_frames %} / {{ run.total_frames }}{% endif %} frames
</span>
{% endif %}
<div class="flex-grow"></div> <div class="flex-grow"></div>
<!-- Pause button for running renders -->
{% if run.status == 'running' %}
<button hx-post="/runs/{{ run.run_id }}/pause"
hx-target="#action-result"
hx-swap="innerHTML"
class="bg-yellow-600 hover:bg-yellow-700 px-3 py-1 rounded text-sm font-medium">
Pause
</button>
{% endif %}
<!-- Resume/Restart buttons for failed/paused renders -->
{% if run.status in ['failed', 'paused'] %}
{% if run.checkpoint_frame %}
<button hx-post="/runs/{{ run.run_id }}/resume"
hx-target="#action-result"
hx-swap="innerHTML"
class="bg-green-600 hover:bg-green-700 px-3 py-1 rounded text-sm font-medium">
Resume{% if run.total_frames %} ({{ ((run.checkpoint_frame / run.total_frames) * 100)|round|int }}%){% endif %}
</button>
{% endif %}
<button hx-post="/runs/{{ run.run_id }}/restart"
hx-target="#action-result"
hx-swap="innerHTML"
hx-confirm="Discard progress and start over?"
class="bg-yellow-600 hover:bg-yellow-700 px-3 py-1 rounded text-sm font-medium">
Restart
</button>
{% endif %}
{% if run.recipe %} {% if run.recipe %}
<button hx-post="/runs/rerun/{{ run.recipe }}" <button hx-post="/runs/rerun/{{ run.recipe }}"
hx-target="#action-result" hx-target="#action-result"
@@ -88,14 +123,20 @@
</div> </div>
</div> </div>
<!-- Unified HLS Player (shown during rendering OR for completed HLS streams) --> <!-- Unified HLS Player (shown during rendering, for paused/failed runs with checkpoint, OR for completed HLS streams) -->
{% if run.status == 'rendering' or run.ipfs_playlist_cid %} {% if run.status == 'rendering' or run.ipfs_playlist_cid or (run.status in ['paused', 'failed'] and run.checkpoint_frame) %}
<div id="hls-player-container" class="mb-6 bg-gray-800 rounded-lg p-4"> <div id="hls-player-container" class="mb-6 bg-gray-800 rounded-lg p-4">
<div class="flex items-center justify-between mb-4"> <div class="flex items-center justify-between mb-4">
<h3 class="text-lg font-semibold flex items-center"> <h3 class="text-lg font-semibold flex items-center">
{% if run.status == 'rendering' %} {% if run.status == 'rendering' %}
<span id="live-indicator" class="w-3 h-3 bg-red-500 rounded-full mr-2 animate-pulse"></span> <span id="live-indicator" class="w-3 h-3 bg-red-500 rounded-full mr-2 animate-pulse"></span>
<span id="player-title">Live Preview</span> <span id="player-title">Live Preview</span>
{% elif run.status == 'paused' %}
<span id="live-indicator" class="w-3 h-3 bg-yellow-500 rounded-full mr-2"></span>
<span id="player-title">Partial Output (Paused)</span>
{% elif run.status == 'failed' and run.checkpoint_frame %}
<span id="live-indicator" class="w-3 h-3 bg-red-500 rounded-full mr-2"></span>
<span id="player-title">Partial Output (Failed)</span>
{% else %} {% else %}
<span id="live-indicator" class="w-3 h-3 bg-green-500 rounded-full mr-2 hidden"></span> <span id="live-indicator" class="w-3 h-3 bg-green-500 rounded-full mr-2 hidden"></span>
<span id="player-title">Video</span> <span id="player-title">Video</span>
@@ -144,12 +185,15 @@
const baseUrl = '/runs/{{ run.run_id }}/playlist.m3u8'; const baseUrl = '/runs/{{ run.run_id }}/playlist.m3u8';
const isRendering = {{ 'true' if run.status == 'rendering' else 'false' }}; const isRendering = {{ 'true' if run.status == 'rendering' else 'false' }};
const isPausedOrFailed = {{ 'true' if run.status in ['paused', 'failed'] else 'false' }};
let hls = null; let hls = null;
let retryCount = 0; let retryCount = 0;
const maxRetries = 120; const maxRetries = 120;
let segmentsLoaded = 0; let segmentsLoaded = 0;
let currentMode = isRendering ? 'live' : 'replay'; // Default based on status // Start in replay mode for paused/failed (shows partial output from start)
// Start in live mode for rendering (follows the render progress)
let currentMode = isRendering ? 'live' : 'replay';
function getHlsUrl() { function getHlsUrl() {
return baseUrl + '?_t=' + Date.now(); return baseUrl + '?_t=' + Date.now();

View File

@@ -111,6 +111,27 @@ BEGIN
WHERE table_name = 'pending_runs' AND column_name = 'quality_playlists') THEN WHERE table_name = 'pending_runs' AND column_name = 'quality_playlists') THEN
ALTER TABLE pending_runs ADD COLUMN quality_playlists JSONB; ALTER TABLE pending_runs ADD COLUMN quality_playlists JSONB;
END IF; END IF;
-- Checkpoint columns for resumable renders
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'checkpoint_frame') THEN
ALTER TABLE pending_runs ADD COLUMN checkpoint_frame INTEGER;
END IF;
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'checkpoint_t') THEN
ALTER TABLE pending_runs ADD COLUMN checkpoint_t FLOAT;
END IF;
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'checkpoint_scans') THEN
ALTER TABLE pending_runs ADD COLUMN checkpoint_scans JSONB;
END IF;
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'total_frames') THEN
ALTER TABLE pending_runs ADD COLUMN total_frames INTEGER;
END IF;
IF NOT EXISTS (SELECT 1 FROM information_schema.columns
WHERE table_name = 'pending_runs' AND column_name = 'resumable') THEN
ALTER TABLE pending_runs ADD COLUMN resumable BOOLEAN DEFAULT TRUE;
END IF;
END $$; END $$;
CREATE INDEX IF NOT EXISTS idx_pending_runs_status ON pending_runs(status); CREATE INDEX IF NOT EXISTS idx_pending_runs_status ON pending_runs(status);
@@ -1530,7 +1551,9 @@ async def get_pending_run(run_id: str) -> Optional[dict]:
async with pool.acquire() as conn: async with pool.acquire() as conn:
row = await conn.fetchrow( row = await conn.fetchrow(
""" """
SELECT run_id, celery_task_id, status, recipe, inputs, dag_json, plan_cid, output_name, actor_id, error, ipfs_playlist_cid, quality_playlists, created_at, updated_at SELECT run_id, celery_task_id, status, recipe, inputs, dag_json, plan_cid, output_name, actor_id, error,
ipfs_playlist_cid, quality_playlists, checkpoint_frame, checkpoint_t, checkpoint_scans,
total_frames, resumable, created_at, updated_at
FROM pending_runs WHERE run_id = $1 FROM pending_runs WHERE run_id = $1
""", """,
run_id run_id
@@ -1544,6 +1567,10 @@ async def get_pending_run(run_id: str) -> Optional[dict]:
quality_playlists = row.get("quality_playlists") quality_playlists = row.get("quality_playlists")
if isinstance(quality_playlists, str): if isinstance(quality_playlists, str):
quality_playlists = _json.loads(quality_playlists) quality_playlists = _json.loads(quality_playlists)
# Parse checkpoint_scans if it's a string
checkpoint_scans = row.get("checkpoint_scans")
if isinstance(checkpoint_scans, str):
checkpoint_scans = _json.loads(checkpoint_scans)
return { return {
"run_id": row["run_id"], "run_id": row["run_id"],
"celery_task_id": row["celery_task_id"], "celery_task_id": row["celery_task_id"],
@@ -1557,6 +1584,11 @@ async def get_pending_run(run_id: str) -> Optional[dict]:
"error": row["error"], "error": row["error"],
"ipfs_playlist_cid": row["ipfs_playlist_cid"], "ipfs_playlist_cid": row["ipfs_playlist_cid"],
"quality_playlists": quality_playlists, "quality_playlists": quality_playlists,
"checkpoint_frame": row.get("checkpoint_frame"),
"checkpoint_t": row.get("checkpoint_t"),
"checkpoint_scans": checkpoint_scans,
"total_frames": row.get("total_frames"),
"resumable": row.get("resumable", True),
"created_at": row["created_at"].isoformat() if row["created_at"] else None, "created_at": row["created_at"].isoformat() if row["created_at"] else None,
"updated_at": row["updated_at"].isoformat() if row["updated_at"] else None, "updated_at": row["updated_at"].isoformat() if row["updated_at"] else None,
} }
@@ -1666,6 +1698,109 @@ async def update_pending_run_playlist(run_id: str, ipfs_playlist_cid: str, quali
return "UPDATE 1" in result return "UPDATE 1" in result
async def update_pending_run_checkpoint(
    run_id: str,
    checkpoint_frame: int,
    checkpoint_t: float,
    checkpoint_scans: Optional[dict] = None,
    total_frames: Optional[int] = None,
) -> bool:
    """Update checkpoint state for a streaming run.

    Called at segment boundaries to enable resume after failures.

    Args:
        run_id: The run ID
        checkpoint_frame: Last completed frame at segment boundary
        checkpoint_t: Time value for checkpoint frame
        checkpoint_scans: Accumulated scan state {scan_name: state_dict}
        total_frames: Total expected frames (for progress %)

    Returns:
        True if a row was updated, False if the run_id did not match.
    """
    if pool is None:
        raise RuntimeError("Database pool not initialized - call init_db() first")

    # JSONB column takes serialized text; NULL when there is no scan state.
    scans_json = _json.dumps(checkpoint_scans) if checkpoint_scans else None
    query = """
        UPDATE pending_runs SET
            checkpoint_frame = $2,
            checkpoint_t = $3,
            checkpoint_scans = $4,
            total_frames = COALESCE($5, total_frames),
            updated_at = NOW()
        WHERE run_id = $1
    """
    async with pool.acquire() as conn:
        status_tag = await conn.execute(
            query, run_id, checkpoint_frame, checkpoint_t, scans_json, total_frames
        )
    # asyncpg returns a command tag like "UPDATE 1" on success.
    return "UPDATE 1" in status_tag
async def get_run_checkpoint(run_id: str) -> Optional[dict]:
    """Get checkpoint data for resuming a run.

    Returns:
        Dict with checkpoint_frame, checkpoint_t, checkpoint_scans,
        quality_playlists, etc., or None if no checkpoint exists.
    """
    if pool is None:
        raise RuntimeError("Database pool not initialized - call init_db() first")

    async with pool.acquire() as conn:
        row = await conn.fetchrow(
            """
            SELECT checkpoint_frame, checkpoint_t, checkpoint_scans, total_frames,
                   quality_playlists, ipfs_playlist_cid, resumable
            FROM pending_runs WHERE run_id = $1
            """,
            run_id
        )

    # No row, or a row without a checkpoint, both mean "nothing to resume".
    if row is None or row.get("checkpoint_frame") is None:
        return None

    # JSONB fields may come back as text depending on codec configuration.
    def _parse(value):
        return _json.loads(value) if isinstance(value, str) else value

    return {
        "frame_num": row["checkpoint_frame"],
        "t": row["checkpoint_t"],
        "scans": _parse(row.get("checkpoint_scans")) or {},
        "total_frames": row.get("total_frames"),
        "quality_playlists": _parse(row.get("quality_playlists")),
        "ipfs_playlist_cid": row.get("ipfs_playlist_cid"),
        "resumable": row.get("resumable", True),
    }
async def clear_run_checkpoint(run_id: str) -> bool:
    """Clear checkpoint data for a run (used on restart).

    Args:
        run_id: The run ID

    Returns:
        True if a row was updated, False otherwise.
    """
    if pool is None:
        raise RuntimeError("Database pool not initialized - call init_db() first")

    # Null out every checkpoint/playlist column so a restarted render begins
    # from a clean slate.
    async with pool.acquire() as conn:
        status_tag = await conn.execute(
            """
            UPDATE pending_runs SET
                checkpoint_frame = NULL,
                checkpoint_t = NULL,
                checkpoint_scans = NULL,
                quality_playlists = NULL,
                ipfs_playlist_cid = NULL,
                updated_at = NOW()
            WHERE run_id = $1
            """,
            run_id,
        )
    return "UPDATE 1" in status_tag
async def complete_pending_run(run_id: str) -> bool: async def complete_pending_run(run_id: str) -> bool:
"""Remove a pending run after it completes (moves to run_cache).""" """Remove a pending run after it completes (moves to run_cache)."""
async with pool.acquire() as conn: async with pool.acquire() as conn:

View File

@@ -7,13 +7,13 @@ echo "=== Pulling latest code ==="
git pull git pull
echo "=== Building Docker image ===" echo "=== Building Docker image ==="
docker build --build-arg CACHEBUST=$(date +%s) -t git.rose-ash.com/art-dag/l1-server:latest . docker build --build-arg CACHEBUST=$(date +%s) -t registry.rose-ash.com:5000/celery-l1-server:latest .
echo "=== Pushing to registry ==="
docker push registry.rose-ash.com:5000/celery-l1-server:latest
echo "=== Redeploying celery stack ===" echo "=== Redeploying celery stack ==="
docker stack deploy -c docker-compose.yml celery docker stack deploy -c docker-compose.yml celery
echo "=== Restarting proxy nginx ==="
docker service update --force proxy_nginx
echo "=== Done ===" echo "=== Done ==="
docker stack services celery docker stack services celery

249
diagnose_gpu.py Executable file
View File

@@ -0,0 +1,249 @@
#!/usr/bin/env python3
"""
GPU Rendering Diagnostic Script
Checks for common issues that cause GPU rendering slowdowns in art-dag.
Run this script to identify potential performance bottlenecks.
"""
import sys
import subprocess
import os
def print_section(title):
    """Print a section banner: blank line, rule, indented title, rule."""
    rule = '=' * 60
    print(f"\n{rule}\n {title}\n{rule}")

def check_pass(msg):
    """Report a passing check."""
    print(f" [PASS] {msg}")

def check_fail(msg):
    """Report a failing check."""
    print(f" [FAIL] {msg}")

def check_warn(msg):
    """Report a non-fatal warning."""
    print(f" [WARN] {msg}")

def check_info(msg):
    """Report an informational note."""
    print(f" [INFO] {msg}")
# ============================================================
# 1. Check GPU Availability
# ============================================================
print_section("1. GPU AVAILABILITY")

# Check nvidia-smi
try:
    result = subprocess.run(["nvidia-smi", "--query-gpu=name,memory.total,memory.free,utilization.gpu",
                             "--format=csv,noheader"], capture_output=True, text=True, timeout=5)
    if result.returncode == 0:
        # One CSV line per GPU.
        for line in result.stdout.strip().split('\n'):
            check_pass(f"GPU found: {line}")
    else:
        check_fail("nvidia-smi failed - no GPU detected")
except FileNotFoundError:
    check_fail("nvidia-smi not found - NVIDIA drivers not installed")
except Exception as e:
    check_fail(f"nvidia-smi error: {e}")

# ============================================================
# 2. Check CuPy
# ============================================================
print_section("2. CUPY (GPU ARRAY LIBRARY)")
try:
    import cupy as cp
    check_pass(f"CuPy available, version {cp.__version__}")
    # Test basic GPU operation
    try:
        a = cp.zeros((100, 100), dtype=cp.uint8)
        cp.cuda.Stream.null.synchronize()
        check_pass("CuPy GPU operations working")
        # Check memory
        mempool = cp.get_default_memory_pool()
        check_info(f"GPU memory pool: {mempool.used_bytes() / 1024**2:.1f} MB used, "
                   f"{mempool.total_bytes() / 1024**2:.1f} MB total")
    except Exception as e:
        check_fail(f"CuPy GPU test failed: {e}")
except ImportError:
    check_fail("CuPy not installed - GPU rendering disabled")

# ============================================================
# 3. Check PyNvVideoCodec (GPU Encoding)
# ============================================================
print_section("3. PYNVVIDEOCODEC (GPU ENCODING)")
try:
    import PyNvVideoCodec as nvc
    check_pass("PyNvVideoCodec available - zero-copy GPU encoding enabled")
except ImportError:
    check_warn("PyNvVideoCodec not available - using FFmpeg NVENC (slower)")

# ============================================================
# 4. Check Decord GPU (Hardware Decode)
# ============================================================
print_section("4. DECORD GPU (HARDWARE DECODE)")
try:
    import decord
    from decord import gpu
    # Constructing the context fails when NVDEC support is missing.
    ctx = gpu(0)
    check_pass(f"Decord GPU (NVDEC) available - hardware video decode enabled")
except ImportError:
    check_warn("Decord not installed - using FFmpeg decode")
except Exception as e:
    check_warn(f"Decord GPU not available ({e}) - using FFmpeg decode")

# ============================================================
# 5. Check DLPack Support
# ============================================================
print_section("5. DLPACK (ZERO-COPY TRANSFER)")
try:
    import decord
    from decord import VideoReader, gpu
    import cupy as cp
    # Need a test video file
    test_video = None
    for path in ["/data/cache", "/tmp"]:
        if os.path.exists(path):
            for f in os.listdir(path):
                if f.endswith(('.mp4', '.webm', '.mkv')):
                    test_video = os.path.join(path, f)
                    break
        if test_video:
            break
    if test_video:
        try:
            # Decode one frame on the GPU and hand it to CuPy via DLPack.
            vr = VideoReader(test_video, ctx=gpu(0))
            frame = vr[0]
            dlpack = frame.to_dlpack()
            gpu_frame = cp.from_dlpack(dlpack)
            check_pass(f"DLPack zero-copy working (tested with {os.path.basename(test_video)})")
        except Exception as e:
            check_fail(f"DLPack FAILED: {e}")
            check_info("This means every frame does GPU->CPU->GPU copy (SLOW)")
    else:
        check_warn("No test video found - cannot verify DLPack")
except ImportError:
    check_warn("Cannot test DLPack - decord or cupy not available")

# ============================================================
# 6. Check Fast CUDA Kernels
# ============================================================
print_section("6. FAST CUDA KERNELS (JIT COMPILED)")
try:
    # Project modules live under the deployed celery tree.
    sys.path.insert(0, '/root/art-dag/celery')
    from streaming.jit_compiler import (
        fast_rotate, fast_zoom, fast_blend, fast_hue_shift,
        fast_invert, fast_ripple, get_fast_ops
    )
    check_pass("Fast CUDA kernels loaded successfully")
    # Test one kernel
    try:
        import cupy as cp
        test_img = cp.zeros((720, 1280, 3), dtype=cp.uint8)
        result = fast_rotate(test_img, 45.0)
        cp.cuda.Stream.null.synchronize()
        check_pass("Fast rotate kernel working")
    except Exception as e:
        check_fail(f"Fast kernel execution failed: {e}")
except ImportError as e:
    check_warn(f"Fast CUDA kernels not available: {e}")
    check_info("Fallback to slower CuPy operations")

# ============================================================
# 7. Check Fused Pipeline Compiler
# ============================================================
print_section("7. FUSED PIPELINE COMPILER")
try:
    sys.path.insert(0, '/root/art-dag/celery')
    from streaming.sexp_to_cuda import compile_frame_pipeline, compile_autonomous_pipeline
    check_pass("Fused CUDA pipeline compiler available")
except ImportError as e:
    check_warn(f"Fused pipeline compiler not available: {e}")
    check_info("Using per-operation fallback (slower for multi-effect pipelines)")

# ============================================================
# 8. Check FFmpeg NVENC
# ============================================================
print_section("8. FFMPEG NVENC (HARDWARE ENCODE)")
try:
    result = subprocess.run(["ffmpeg", "-encoders"], capture_output=True, text=True, timeout=5)
    if "h264_nvenc" in result.stdout:
        check_pass("FFmpeg h264_nvenc encoder available")
    else:
        check_warn("FFmpeg h264_nvenc not available - using libx264 (CPU)")
    if "hevc_nvenc" in result.stdout:
        check_pass("FFmpeg hevc_nvenc encoder available")
except Exception as e:
    check_fail(f"FFmpeg check failed: {e}")

# ============================================================
# 9. Check FFmpeg NVDEC
# ============================================================
print_section("9. FFMPEG NVDEC (HARDWARE DECODE)")
try:
    result = subprocess.run(["ffmpeg", "-hwaccels"], capture_output=True, text=True, timeout=5)
    if "cuda" in result.stdout:
        check_pass("FFmpeg CUDA hwaccel available")
    else:
        check_warn("FFmpeg CUDA hwaccel not available - using CPU decode")
except Exception as e:
    check_fail(f"FFmpeg hwaccel check failed: {e}")

# ============================================================
# 10. Check Pipeline Cache Status
# ============================================================
print_section("10. PIPELINE CACHE STATUS")
try:
    sys.path.insert(0, '/root/art-dag/celery')
    # Peeks at module-private caches; only meaningful inside a worker
    # process that has already rendered something.
    from sexp_effects.primitive_libs.streaming_gpu import (
        _FUSED_PIPELINE_CACHE, _AUTONOMOUS_PIPELINE_CACHE
    )
    fused_count = len(_FUSED_PIPELINE_CACHE)
    auto_count = len(_AUTONOMOUS_PIPELINE_CACHE)
    if fused_count > 0 or auto_count > 0:
        check_info(f"Fused pipeline cache: {fused_count} entries")
        check_info(f"Autonomous pipeline cache: {auto_count} entries")
        if fused_count > 100 or auto_count > 100:
            check_warn("Large pipeline cache - may cause memory pressure")
    else:
        check_info("Pipeline caches empty (no rendering done yet)")
except Exception as e:
    check_info(f"Could not check pipeline cache: {e}")

# ============================================================
# Summary
# ============================================================
print_section("SUMMARY")
print("""
Optimal GPU rendering requires:
1. [CRITICAL] CuPy with working GPU operations
2. [CRITICAL] DLPack zero-copy transfer (decord -> CuPy)
3. [HIGH] Fast CUDA kernels from jit_compiler
4. [MEDIUM] Fused pipeline compiler for multi-effect recipes
5. [MEDIUM] PyNvVideoCodec for zero-copy encoding
6. [LOW] FFmpeg NVENC/NVDEC as fallback
If DLPack is failing, check:
- decord version (needs 0.6.0+ with DLPack support)
- CuPy version compatibility
- CUDA toolkit version match
If fast kernels are not loading:
- Check if streaming/jit_compiler.py exists
- Verify CUDA compiler (nvcc) is available
""")

View File

@@ -64,7 +64,7 @@ services:
- node.labels.gpu != true - node.labels.gpu != true
l1-server: l1-server:
image: git.rose-ash.com/art-dag/l1-server:latest image: registry.rose-ash.com:5000/celery-l1-server:latest
env_file: env_file:
- .env - .env
environment: environment:
@@ -100,7 +100,7 @@ services:
- node.labels.gpu != true - node.labels.gpu != true
l1-worker: l1-worker:
image: git.rose-ash.com/art-dag/l1-server:latest image: registry.rose-ash.com:5000/celery-l1-server:latest
command: sh -c "find /app -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null; celery -A celery_app worker --loglevel=info -E" command: sh -c "find /app -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null; celery -A celery_app worker --loglevel=info -E"
environment: environment:
- REDIS_URL=redis://redis:6379/5 - REDIS_URL=redis://redis:6379/5
@@ -149,10 +149,10 @@ services:
- node.labels.gpu != true - node.labels.gpu != true
# GPU worker for streaming/rendering tasks # GPU worker for streaming/rendering tasks
# Build: docker build -f Dockerfile.gpu -t git.rose-ash.com/art-dag/l1-gpu-server:latest . # Build: docker build -f Dockerfile.gpu -t registry.rose-ash.com:5000/celery-l1-gpu-server:latest .
# Requires: docker node update --label-add gpu=true <gpu-node-name> # Requires: docker node update --label-add gpu=true <gpu-node-name>
l1-gpu-worker: l1-gpu-worker:
image: git.rose-ash.com/art-dag/l1-gpu-server:latest image: registry.rose-ash.com:5000/celery-l1-gpu-server:latest
# For local dev, uncomment to build from Dockerfile.gpu: # For local dev, uncomment to build from Dockerfile.gpu:
# build: # build:
# context: . # context: .

View File

@@ -59,7 +59,21 @@ class MultiResolutionHLSOutput:
ipfs_gateway: str = "https://ipfs.io/ipfs", ipfs_gateway: str = "https://ipfs.io/ipfs",
on_playlist_update: callable = None, on_playlist_update: callable = None,
audio_source: str = None, audio_source: str = None,
resume_from: Optional[Dict] = None,
): ):
"""Initialize multi-resolution HLS output.
Args:
output_dir: Directory for HLS output files
source_size: (width, height) of source frames
fps: Frames per second
segment_duration: Duration of each HLS segment in seconds
ipfs_gateway: IPFS gateway URL for playlist URLs
on_playlist_update: Callback when playlists are updated
audio_source: Optional audio file to mux with video
resume_from: Optional dict to resume from checkpoint with keys:
- segment_cids: Dict of quality -> {seg_num: cid}
"""
self.output_dir = Path(output_dir) self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True) self.output_dir.mkdir(parents=True, exist_ok=True)
self.source_width, self.source_height = source_size self.source_width, self.source_height = source_size
@@ -75,6 +89,13 @@ class MultiResolutionHLSOutput:
self.qualities: Dict[str, QualityLevel] = {} self.qualities: Dict[str, QualityLevel] = {}
self._setup_quality_levels() self._setup_quality_levels()
# Restore segment CIDs if resuming (don't re-upload existing segments)
if resume_from and resume_from.get('segment_cids'):
for name, cids in resume_from['segment_cids'].items():
if name in self.qualities:
self.qualities[name].segment_cids = dict(cids)
print(f"[MultiResHLS] Restored {len(cids)} segment CIDs for {name}", file=sys.stderr)
# IPFS client # IPFS client
from ipfs_client import add_file, add_bytes from ipfs_client import add_file, add_bytes
self._ipfs_add_file = add_file self._ipfs_add_file = add_file

View File

@@ -128,6 +128,14 @@ class StreamInterpreter:
# Signature: on_progress(percent: float, frame_num: int, total_frames: int) # Signature: on_progress(percent: float, frame_num: int, total_frames: int)
self.on_progress: callable = None self.on_progress: callable = None
# Callback for checkpoint saves (called at segment boundaries for resumability)
# Signature: on_checkpoint(checkpoint: dict)
# checkpoint contains: frame_num, t, scans
self.on_checkpoint: callable = None
# Frames per segment for checkpoint timing (default 4 seconds at 30fps = 120 frames)
self._frames_per_segment: int = 120
def _resolve_name(self, name: str) -> Optional[Path]: def _resolve_name(self, name: str) -> Optional[Path]:
"""Resolve a friendly name to a file path using the naming service.""" """Resolve a friendly name to a file path using the naming service."""
try: try:
@@ -989,8 +997,40 @@ class StreamInterpreter:
else: else:
scan['state'] = {'acc': new_state} scan['state'] = {'acc': new_state}
def run(self, duration: float = None, output: str = "pipe"): def _restore_checkpoint(self, checkpoint: dict):
"""Run the streaming pipeline.""" """Restore scan states from a checkpoint.
Called when resuming a render from a previous checkpoint.
Args:
checkpoint: Dict with 'scans' key containing {scan_name: state_dict}
"""
scans_state = checkpoint.get('scans', {})
for name, state in scans_state.items():
if name in self.scans:
self.scans[name]['state'] = dict(state)
print(f"Restored scan '{name}' state from checkpoint", file=sys.stderr)
def _get_checkpoint_state(self) -> dict:
"""Get current scan states for checkpointing.
Returns:
Dict mapping scan names to their current state dicts
"""
return {name: dict(scan['state']) for name, scan in self.scans.items()}
def run(self, duration: float = None, output: str = "pipe", resume_from: dict = None):
"""Run the streaming pipeline.
Args:
duration: Duration in seconds (auto-detected from audio if None)
output: Output mode ("pipe", "preview", path/hls, path/ipfs-hls, or file path)
resume_from: Checkpoint dict to resume from, with keys:
- frame_num: Last completed frame
- t: Time value for checkpoint frame
- scans: Dict of scan states to restore
- segment_cids: Dict of quality -> {seg_num: cid} for output resume
"""
# Import output classes - handle both package and direct execution # Import output classes - handle both package and direct execution
try: try:
from .output import PipeOutput, DisplayOutput, FileOutput, HLSOutput, IPFSHLSOutput from .output import PipeOutput, DisplayOutput, FileOutput, HLSOutput, IPFSHLSOutput
@@ -1010,6 +1050,11 @@ class StreamInterpreter:
self._init() self._init()
# Restore checkpoint state if resuming
if resume_from:
self._restore_checkpoint(resume_from)
print(f"Resuming from frame {resume_from.get('frame_num', 0)}", file=sys.stderr)
if not self.frame_pipeline: if not self.frame_pipeline:
print("Error: no (frame ...) pipeline defined", file=sys.stderr) print("Error: no (frame ...) pipeline defined", file=sys.stderr)
return return
@@ -1061,6 +1106,12 @@ class StreamInterpreter:
hls_dir = output[:-9] # Remove /ipfs-hls suffix hls_dir = output[:-9] # Remove /ipfs-hls suffix
import os import os
ipfs_gateway = os.environ.get("IPFS_GATEWAY_URL", "https://ipfs.io/ipfs") ipfs_gateway = os.environ.get("IPFS_GATEWAY_URL", "https://ipfs.io/ipfs")
# Build resume state for output if resuming
output_resume = None
if resume_from and resume_from.get('segment_cids'):
output_resume = {'segment_cids': resume_from['segment_cids']}
# Use multi-resolution output (renders original + 720p + 360p) # Use multi-resolution output (renders original + 720p + 360p)
if MultiResolutionHLSOutput is not None: if MultiResolutionHLSOutput is not None:
print(f"[StreamInterpreter] Using multi-resolution HLS output ({w}x{h} + 720p + 360p)", file=sys.stderr) print(f"[StreamInterpreter] Using multi-resolution HLS output ({w}x{h} + 720p + 360p)", file=sys.stderr)
@@ -1071,6 +1122,7 @@ class StreamInterpreter:
ipfs_gateway=ipfs_gateway, ipfs_gateway=ipfs_gateway,
on_playlist_update=self.on_playlist_update, on_playlist_update=self.on_playlist_update,
audio_source=audio, audio_source=audio,
resume_from=output_resume,
) )
# Fallback to GPU single-resolution if multi-res not available # Fallback to GPU single-resolution if multi-res not available
elif GPUHLSOutput is not None and check_gpu_encode_available(): elif GPUHLSOutput is not None and check_gpu_encode_available():
@@ -1083,6 +1135,16 @@ class StreamInterpreter:
else: else:
out = FileOutput(output, size=(w, h), fps=fps, audio_source=audio) out = FileOutput(output, size=(w, h), fps=fps, audio_source=audio)
# Calculate frames per segment based on fps and segment duration (4 seconds default)
segment_duration = 4.0
self._frames_per_segment = int(fps * segment_duration)
# Determine start frame (resume from checkpoint + 1, or 0)
start_frame = 0
if resume_from and resume_from.get('frame_num') is not None:
start_frame = resume_from['frame_num'] + 1
print(f"Starting from frame {start_frame} (checkpoint was at {resume_from['frame_num']})", file=sys.stderr)
try: try:
frame_times = [] frame_times = []
profile_interval = 30 # Profile every N frames profile_interval = 30 # Profile every N frames
@@ -1090,7 +1152,7 @@ class StreamInterpreter:
eval_times = [] eval_times = []
write_times = [] write_times = []
for frame_num in range(n_frames): for frame_num in range(start_frame, n_frames):
if not out.is_open: if not out.is_open:
break break
@@ -1127,6 +1189,19 @@ class StreamInterpreter:
frame_elapsed = time.time() - frame_start frame_elapsed = time.time() - frame_start
frame_times.append(frame_elapsed) frame_times.append(frame_elapsed)
# Checkpoint at segment boundaries (every _frames_per_segment frames)
if frame_num > 0 and frame_num % self._frames_per_segment == 0:
if self.on_checkpoint:
try:
checkpoint = {
'frame_num': frame_num,
't': ctx.t,
'scans': self._get_checkpoint_state(),
}
self.on_checkpoint(checkpoint)
except Exception as e:
print(f"Warning: checkpoint callback failed: {e}", file=sys.stderr)
# Progress with timing and profile breakdown # Progress with timing and profile breakdown
if frame_num % profile_interval == 0 and frame_num > 0: if frame_num % profile_interval == 0 and frame_num > 0:
pct = 100 * frame_num / n_frames pct = 100 * frame_num / n_frames

View File

@@ -3,11 +3,13 @@ Streaming video rendering task.
Executes S-expression recipes for frame-by-frame video processing. Executes S-expression recipes for frame-by-frame video processing.
Supports CID and friendly name references for assets. Supports CID and friendly name references for assets.
Supports pause/resume/restart for long renders.
""" """
import hashlib import hashlib
import logging import logging
import os import os
import signal
import sys import sys
import tempfile import tempfile
from pathlib import Path from pathlib import Path
@@ -23,6 +25,11 @@ from cache_manager import get_cache_manager
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class PauseRequested(Exception):
"""Raised when user requests pause via SIGTERM."""
pass
# Debug: verify module is being loaded # Debug: verify module is being loaded
print(f"DEBUG MODULE LOAD: tasks/streaming.py loaded at {__file__}", file=sys.stderr) print(f"DEBUG MODULE LOAD: tasks/streaming.py loaded at {__file__}", file=sys.stderr)
@@ -246,6 +253,7 @@ def run_stream(
actor_id: Optional[str] = None, actor_id: Optional[str] = None,
sources_sexp: Optional[str] = None, sources_sexp: Optional[str] = None,
audio_sexp: Optional[str] = None, audio_sexp: Optional[str] = None,
resume: bool = False,
) -> dict: ) -> dict:
""" """
Execute a streaming S-expression recipe. Execute a streaming S-expression recipe.
@@ -259,13 +267,25 @@ def run_stream(
actor_id: User ID for friendly name resolution actor_id: User ID for friendly name resolution
sources_sexp: Optional sources config S-expression sources_sexp: Optional sources config S-expression
audio_sexp: Optional audio config S-expression audio_sexp: Optional audio config S-expression
resume: If True, load checkpoint and resume from where we left off
Returns: Returns:
Dict with output_cid, output_path, and status Dict with output_cid, output_path, and status
""" """
global _resolve_loop, _db_initialized global _resolve_loop, _db_initialized
task_id = self.request.id task_id = self.request.id
logger.info(f"Starting stream task {task_id} for run {run_id}") logger.info(f"Starting stream task {task_id} for run {run_id} (resume={resume})")
# Handle graceful pause (SIGTERM from Celery revoke)
pause_requested = False
original_sigterm = signal.getsignal(signal.SIGTERM)
def handle_sigterm(signum, frame):
nonlocal pause_requested
pause_requested = True
logger.info(f"Pause requested for run {run_id} (SIGTERM received)")
signal.signal(signal.SIGTERM, handle_sigterm)
self.update_state(state='INITIALIZING', meta={'progress': 0}) self.update_state(state='INITIALIZING', meta={'progress': 0})
@@ -311,6 +331,28 @@ def run_stream(
# Import the streaming interpreter # Import the streaming interpreter
from streaming.stream_sexp_generic import StreamInterpreter from streaming.stream_sexp_generic import StreamInterpreter
# Load checkpoint if resuming
checkpoint = None
if resume:
import asyncio
import database
try:
if _resolve_loop is None or _resolve_loop.is_closed():
_resolve_loop = asyncio.new_event_loop()
asyncio.set_event_loop(_resolve_loop)
_db_initialized = False
if not _db_initialized:
_resolve_loop.run_until_complete(database.init_db())
_db_initialized = True
checkpoint = _resolve_loop.run_until_complete(database.get_run_checkpoint(run_id))
if checkpoint:
logger.info(f"Loaded checkpoint for run {run_id}: frame {checkpoint.get('frame_num')}")
else:
logger.warning(f"No checkpoint found for run {run_id}, starting from beginning")
except Exception as e:
logger.error(f"Failed to load checkpoint: {e}")
checkpoint = None
# Create interpreter (pass actor_id for friendly name resolution) # Create interpreter (pass actor_id for friendly name resolution)
interp = StreamInterpreter(str(recipe_path), actor_id=actor_id) interp = StreamInterpreter(str(recipe_path), actor_id=actor_id)
@@ -361,6 +403,7 @@ def run_stream(
# Set up progress callback to update Celery task state # Set up progress callback to update Celery task state
def on_progress(pct, frame_num, total_frames): def on_progress(pct, frame_num, total_frames):
nonlocal pause_requested
# Scale progress: 5% (start) to 85% (before caching) # Scale progress: 5% (start) to 85% (before caching)
scaled_progress = 5 + (pct * 0.8) # 5% to 85% scaled_progress = 5 + (pct * 0.8) # 5% to 85%
self.update_state(state='RENDERING', meta={ self.update_state(state='RENDERING', meta={
@@ -372,9 +415,94 @@ def run_stream(
interp.on_progress = on_progress interp.on_progress = on_progress
# Set up checkpoint callback to save state at segment boundaries
def on_checkpoint(ckpt):
"""Save checkpoint state to database."""
nonlocal pause_requested
global _resolve_loop, _db_initialized
import asyncio
import database
try:
if _resolve_loop is None or _resolve_loop.is_closed():
_resolve_loop = asyncio.new_event_loop()
asyncio.set_event_loop(_resolve_loop)
_db_initialized = False
if not _db_initialized:
_resolve_loop.run_until_complete(database.init_db())
_db_initialized = True
# Get total frames from interpreter config
total_frames = None
if hasattr(interp, 'output') and hasattr(interp.output, '_frame_count'):
# Estimate total frames based on duration
fps_val = interp.config.get('fps', 30)
for name, val in interp.globals.items():
if hasattr(val, 'duration'):
total_frames = int(val.duration * fps_val)
break
_resolve_loop.run_until_complete(database.update_pending_run_checkpoint(
run_id=run_id,
checkpoint_frame=ckpt['frame_num'],
checkpoint_t=ckpt['t'],
checkpoint_scans=ckpt.get('scans'),
total_frames=total_frames,
))
logger.info(f"Saved checkpoint for run {run_id}: frame {ckpt['frame_num']}")
# Check if pause was requested after checkpoint
if pause_requested:
logger.info(f"Pause requested after checkpoint, raising PauseRequested")
raise PauseRequested("Render paused by user")
except PauseRequested:
raise # Re-raise to stop the render
except Exception as e:
logger.error(f"Failed to save checkpoint: {e}")
interp.on_checkpoint = on_checkpoint
# Build resume state for the interpreter (includes segment CIDs for output)
resume_from = None
if checkpoint:
resume_from = {
'frame_num': checkpoint.get('frame_num'),
't': checkpoint.get('t'),
'scans': checkpoint.get('scans', {}),
}
# Add segment CIDs if available (from quality_playlists in checkpoint)
# Note: We need to extract segment_cids from the output's state, which isn't
# directly stored. For now, the output will re-check existing segments on disk.
# Run rendering to file # Run rendering to file
logger.info(f"Rendering to {output_path}") logger.info(f"Rendering to {output_path}" + (f" (resuming from frame {resume_from['frame_num']})" if resume_from else ""))
interp.run(duration=duration, output=str(output_path)) render_paused = False
try:
interp.run(duration=duration, output=str(output_path), resume_from=resume_from)
except PauseRequested:
# Graceful pause - checkpoint already saved
render_paused = True
logger.info(f"Render paused for run {run_id}")
# Restore original signal handler
signal.signal(signal.SIGTERM, original_sigterm)
if render_paused:
import asyncio
import database
try:
if _resolve_loop is None or _resolve_loop.is_closed():
_resolve_loop = asyncio.new_event_loop()
asyncio.set_event_loop(_resolve_loop)
_db_initialized = False
if not _db_initialized:
_resolve_loop.run_until_complete(database.init_db())
_db_initialized = True
_resolve_loop.run_until_complete(database.update_pending_run_status(run_id, 'paused'))
except Exception as e:
logger.error(f"Failed to update status to paused: {e}")
return {"status": "paused", "run_id": run_id, "task_id": task_id}
# Check for interpreter errors # Check for interpreter errors
if interp.errors: if interp.errors:
@@ -395,7 +523,6 @@ def run_stream(
# Update pending run with playlist CID for live HLS redirect # Update pending run with playlist CID for live HLS redirect
if ipfs_playlist_cid: if ipfs_playlist_cid:
global _resolve_loop, _db_initialized
import asyncio import asyncio
import database import database
try: try: