Files
rose-ash/core/artdag/sexp/planner.py
2026-02-24 23:09:39 +00:00

2188 lines
78 KiB
Python

"""
Execution plan generation from S-expression recipes.
The planner:
1. Takes a compiled recipe + input content hashes
2. Runs analyzers to get concrete data (beat times, etc.)
3. Expands dynamic nodes (SLICE_ON) into primitive operations
4. Resolves all registry references to content hashes
5. Generates an execution plan with pre-computed cache IDs
Plans are S-expressions with all references resolved to hashes,
ready for distribution to Celery workers.
"""
import hashlib
import importlib.util
import json
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Callable
from .parser import Symbol, Keyword, Binding, serialize
from .compiler import CompiledRecipe
# Node types that can be collapsed into a single FFmpeg filter chain
COLLAPSIBLE_TYPES = {"EFFECT", "SEGMENT"}
# Node types that are boundaries (sources, merges, or special processing)
BOUNDARY_TYPES = {"SOURCE", "SEQUENCE", "MUX", "ANALYZE", "SCAN", "LIST"}
# Node types that need expansion during planning
EXPANDABLE_TYPES = {"SLICE_ON", "CONSTRUCT"}
def _load_module(module_path: Path, module_name: str = "module"):
"""Load a Python module from file path."""
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def _run_analyzer(
    analyzer_path: Path,
    input_path: Path,
    params: Dict[str, Any],
) -> Dict[str, Any]:
    """Load the analyzer module at *analyzer_path* and invoke its analyze().

    The module is expected to expose an ``analyze(input_path, params)``
    function; its return value is passed through unchanged.
    """
    module = _load_module(analyzer_path, "analyzer")
    return module.analyze(input_path, params)
def _pre_execute_segment(
    node: Dict,
    input_path: Path,
    work_dir: Path,
) -> Path:
    """
    Pre-execute a SEGMENT node during planning by shelling out to ffmpeg.

    This is needed when an ANALYZE node depends on a SEGMENT output, so the
    cut must exist before the plan is finalized.

    Args:
        node: SEGMENT node dict; reads ``config.start``/``duration``/``end``
            and uses the first 16 chars of ``node['id']`` in the output name.
        input_path: Source media file to cut.
        work_dir: Directory where the segment file is written.

    Returns:
        Path to the segmented file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits non-zero.
    """
    import subprocess
    config = node.get("config", {})
    start = config.get("start", 0)
    duration = config.get("duration")
    end = config.get("end")
    # Detect if input is audio-only so we choose a matching container/codec.
    suffix = input_path.suffix.lower()
    is_audio = suffix in ('.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a')
    # m4a container for the aac stream; mp4 when there is video.
    output_ext = ".m4a" if is_audio else ".mp4"
    output_path = work_dir / f"segment_{node['id'][:16]}{output_ext}"
    cmd = ["ffmpeg", "-y", "-i", str(input_path)]
    if start:
        cmd.extend(["-ss", str(start)])
    if duration:
        cmd.extend(["-t", str(duration)])
    elif end:
        # Only an end time given: convert it to a duration relative to start.
        cmd.extend(["-t", str(end - start)])
    if is_audio:
        cmd.extend(["-c:a", "aac", str(output_path)])
    else:
        cmd.extend(["-c:v", "libx264", "-preset", "fast", "-crf", "18",
                    "-c:a", "aac", str(output_path)])
    subprocess.run(cmd, check=True, capture_output=True)
    return output_path
def _serialize_for_hash(obj) -> str:
    """Render a value as a canonical S-expression string for hashing.

    The output is deterministic: dict keys are sorted, strings are escaped,
    and the project types (Symbol/Keyword/Lambda/Binding) each get a fixed
    textual form. Unknown types fall back to str().
    """
    from .parser import Lambda
    if obj is None:
        return "nil"
    if isinstance(obj, bool):
        # Must precede the int check: bool is a subclass of int.
        return "true" if obj else "false"
    if isinstance(obj, (int, float)):
        return str(obj)
    if isinstance(obj, str):
        return '"{}"'.format(obj.replace('\\', '\\\\').replace('"', '\\"'))
    if isinstance(obj, Symbol):
        return obj.name
    if isinstance(obj, Keyword):
        return ":" + obj.name
    if isinstance(obj, Lambda):
        return "(fn [{}] {})".format(
            " ".join(obj.params), _serialize_for_hash(obj.body))
    if isinstance(obj, Binding):
        # analysis_ref may be a raw string, a node ID, or a dict —
        # strings get quoted directly, everything else recurses.
        ref = obj.analysis_ref
        ref_str = f'"{ref}"' if isinstance(ref, str) else _serialize_for_hash(ref)
        return f"(bind {ref_str} :range [{obj.range_min} {obj.range_max}])"
    if isinstance(obj, dict):
        pairs = (f":{key} {_serialize_for_hash(val)}"
                 for key, val in sorted(obj.items()))
        return "{" + " ".join(pairs) + "}"
    if isinstance(obj, list):
        return "(" + " ".join(map(_serialize_for_hash, obj)) + ")"
    return str(obj)
def _stable_hash(data: Any, cluster_key: Optional[str] = None) -> str:
    """Create a stable SHA3-256 hash of *data* via S-expression serialization.

    Args:
        data: Any value understood by ``_serialize_for_hash``.
        cluster_key: Optional namespace key; when provided it is folded into
            the hashed payload so identical data in different clusters
            produces different digests. (Annotation fixed: the default is
            None, so the parameter is Optional[str], not str.)

    Returns:
        Hex digest string (64 characters).
    """
    if cluster_key:
        data = {"_cluster_key": cluster_key, "_data": data}
    sexp_str = _serialize_for_hash(data)
    return hashlib.sha3_256(sexp_str.encode()).hexdigest()
@dataclass
class PlanStep:
    """A single step in the execution plan."""
    step_id: str
    node_type: str
    config: Dict[str, Any]
    inputs: List[str]  # step_ids this step consumes
    cache_id: str
    level: int = 0
    stage: Optional[str] = None  # Owning stage, if any

    def to_sexp(self) -> List:
        """Render this step as an S-expression list.

        Layout: (step <id> :cache-id <cid> [:level n] [:stage s]
                 (<node-type> :key value ... [:inputs [...]]))
        """
        result = [Symbol("step"), self.step_id,
                  Keyword("cache-id"), self.cache_id]
        # level/stage are omitted when they hold their defaults.
        if self.level > 0:
            result += [Keyword("level"), self.level]
        if self.stage:
            result += [Keyword("stage"), self.stage]
        # Node expression: lower-cased type followed by config keywords.
        body = [Symbol(self.node_type.lower())]
        for key, value in self.config.items():
            if isinstance(value, Binding):
                # Bindings are emitted in their (bind ref :range [...]) form.
                value = [Symbol("bind"), value.analysis_ref,
                         Keyword("range"), [value.range_min, value.range_max]]
            body += [Keyword(key), value]
        if self.inputs:
            body += [Keyword("inputs"), self.inputs]
        result.append(body)
        return result
@dataclass
class StagePlan:
    """A named stage in the execution plan.

    Stages group steps that can be scheduled together; `requires` and
    `level` drive topological ordering and parallel execution of stages.
    """
    stage_name: str
    steps: List[PlanStep]
    requires: List[str]  # Names of stages that must complete before this one
    output_bindings: Dict[str, str]  # binding_name -> cache_id of output
    level: int = 0  # Stage level for parallel execution
@dataclass
class ExecutionPlanSexp:
    """Execution plan represented as an S-expression.

    Holds the fully resolved plan: steps with pre-computed cache IDs,
    resolved parameters and input hashes, analysis data, stage structure,
    and the effects registry the executor needs.
    """
    plan_id: str
    steps: List[PlanStep]
    output_step_id: str
    source_hash: str = ""  # CID of recipe source
    params: Dict[str, Any] = field(default_factory=dict)  # Resolved parameter values
    params_hash: str = ""  # Hash of params for quick comparison
    inputs: Dict[str, str] = field(default_factory=dict)  # name -> hash
    analysis: Dict[str, Dict] = field(default_factory=dict)  # name -> {times, values}
    metadata: Dict[str, Any] = field(default_factory=dict)
    stage_plans: List[StagePlan] = field(default_factory=list)  # Stage-level plans
    stage_order: List[str] = field(default_factory=list)  # Topologically sorted stage names
    stage_levels: Dict[str, int] = field(default_factory=dict)  # stage_name -> level
    effects_registry: Dict[str, Dict] = field(default_factory=dict)  # effect_name -> {path, cid, ...}
    minimal_primitives: bool = False  # If True, interpreter uses only core primitives

    def to_sexp(self) -> List:
        """Convert the entire plan to an S-expression list.

        Section order is fixed: id/source-cid, params, inputs, analysis,
        stages, effects-registry, minimal-primitives flag, steps, output.
        Empty sections are omitted entirely.
        """
        sexp = [Symbol("plan")]
        # Metadata - purely content-addressed
        sexp.extend([Keyword("id"), self.plan_id])
        sexp.extend([Keyword("source-cid"), self.source_hash])  # CID of recipe source
        # Parameters
        if self.params:
            sexp.extend([Keyword("params-hash"), self.params_hash])
            params_sexp = [Symbol("params")]
            for name, value in self.params.items():
                params_sexp.append([Symbol(name), value])
            sexp.append(params_sexp)
        # Input bindings
        if self.inputs:
            inputs_sexp = [Symbol("inputs")]
            for name, hash_val in self.inputs.items():
                inputs_sexp.append([Symbol(name), hash_val])
            sexp.append(inputs_sexp)
        # Analysis data (for effect parameter bindings)
        if self.analysis:
            analysis_sexp = [Symbol("analysis")]
            for name, data in self.analysis.items():
                track_sexp = [Symbol(name)]
                # Tracks carrying a _cache_id are referenced out-of-band;
                # otherwise inline whichever of times/values is present.
                if isinstance(data, dict) and "_cache_id" in data:
                    track_sexp.extend([Keyword("cache-id"), data["_cache_id"]])
                else:
                    if "times" in data:
                        track_sexp.extend([Keyword("times"), data["times"]])
                    if "values" in data:
                        track_sexp.extend([Keyword("values"), data["values"]])
                analysis_sexp.append(track_sexp)
            sexp.append(analysis_sexp)
        # Stage information
        if self.stage_plans:
            stages_sexp = [Symbol("stages")]
            for stage_plan in self.stage_plans:
                stage_sexp = [
                    Keyword("name"), stage_plan.stage_name,
                    Keyword("level"), stage_plan.level,
                ]
                if stage_plan.requires:
                    stage_sexp.extend([Keyword("requires"), stage_plan.requires])
                if stage_plan.output_bindings:
                    outputs_sexp = []
                    for name, cache_id in stage_plan.output_bindings.items():
                        outputs_sexp.append([Symbol(name), Keyword("cache-id"), cache_id])
                    stage_sexp.extend([Keyword("outputs"), outputs_sexp])
                stages_sexp.append(stage_sexp)
            sexp.append(stages_sexp)
        # Effects registry - for loading explicitly declared effects
        if self.effects_registry:
            registry_sexp = [Symbol("effects-registry")]
            for name, info in self.effects_registry.items():
                effect_sexp = [Symbol(name)]
                if info.get("path"):
                    effect_sexp.extend([Keyword("path"), info["path"]])
                if info.get("cid"):
                    effect_sexp.extend([Keyword("cid"), info["cid"]])
                registry_sexp.append(effect_sexp)
            sexp.append(registry_sexp)
        # Minimal primitives flag
        if self.minimal_primitives:
            sexp.extend([Keyword("minimal-primitives"), True])
        # Steps
        for step in self.steps:
            sexp.append(step.to_sexp())
        # Output reference
        sexp.extend([Keyword("output"), self.output_step_id])
        return sexp

    def to_string(self, pretty: bool = True) -> str:
        """Serialize the plan to an S-expression string via parser.serialize."""
        return serialize(self.to_sexp(), pretty=pretty)
def _expand_list_inputs(nodes: List[Dict]) -> List[Dict]:
"""
Expand LIST node inputs in SEQUENCE nodes.
When a SEQUENCE has a LIST as input, replace it with all the LIST's inputs.
LIST nodes that are referenced by non-SEQUENCE nodes (e.g., EFFECT chains)
are promoted to SEQUENCE nodes so they produce a concatenated output.
Unreferenced LIST nodes are removed.
"""
nodes_by_id = {n["id"]: n for n in nodes}
list_nodes = {n["id"]: n for n in nodes if n["type"] == "LIST"}
if not list_nodes:
return nodes
# Determine which LIST nodes are referenced by SEQUENCE vs other node types
list_consumed_by_seq = set()
list_referenced_by_other = set()
for node in nodes:
if node["type"] == "LIST":
continue
for inp in node.get("inputs", []):
if inp in list_nodes:
if node["type"] == "SEQUENCE":
list_consumed_by_seq.add(inp)
else:
list_referenced_by_other.add(inp)
result = []
for node in nodes:
if node["type"] == "LIST":
if node["id"] in list_referenced_by_other:
# Promote to SEQUENCE — non-SEQUENCE nodes reference this LIST
result.append({
"id": node["id"],
"type": "SEQUENCE",
"config": node.get("config", {}),
"inputs": node.get("inputs", []),
})
# Otherwise skip (consumed by SEQUENCE expansion or unreferenced)
continue
if node["type"] == "SEQUENCE":
# Expand any LIST inputs
new_inputs = []
for inp in node.get("inputs", []):
if inp in list_nodes:
# Replace LIST with its contents
new_inputs.extend(list_nodes[inp].get("inputs", []))
else:
new_inputs.append(inp)
# Create updated node
result.append({
**node,
"inputs": new_inputs,
})
else:
result.append(node)
return result
def _collapse_effect_chains(nodes: List[Dict], registry: Dict = None) -> List[Dict]:
    """
    Collapse sequential effect chains into single COMPOUND nodes.

    A chain is a sequence of single-input collapsible nodes where:
    - Each node has exactly one input
    - No node in the chain is referenced by multiple other nodes
    - The chain ends at a boundary or multi-ref node
    - No node in the chain is marked as temporal

    Effects can declare :temporal true to prevent collapsing (e.g., reverse).

    Args:
        nodes: Flat node list (dicts with id/type/config/inputs).
        registry: Recipe registry; its "effects" table supplies per-effect
            metadata (temporal flag) and is embedded in COMPOUND configs.

    Returns:
        A new node list with chains collapsed into COMPOUND nodes.
    """
    if not nodes:
        return nodes
    registry = registry or {}
    nodes_by_id = {n["id"]: n for n in nodes}
    # Build reference counts: how many nodes reference each node as input
    ref_count = {n["id"]: 0 for n in nodes}
    for node in nodes:
        for inp in node.get("inputs", []):
            if inp in ref_count:
                ref_count[inp] += 1
    # Track which nodes are consumed by chains
    consumed = set()
    # NOTE(review): compound_nodes is never read — results accumulate in
    # result_nodes below. Kept as-is; candidate for removal.
    compound_nodes = []

    def is_temporal(node: Dict) -> bool:
        """Check if a node is temporal (needs complete input)."""
        config = node.get("config", {})
        # Check node-level temporal flag
        if config.get("temporal"):
            return True
        # Check effect registry for temporal flag
        if node["type"] == "EFFECT":
            effect_name = config.get("effect")
            if effect_name:
                effect_meta = registry.get("effects", {}).get(effect_name, {})
                if effect_meta.get("temporal"):
                    return True
        return False

    def is_collapsible(node_id: str) -> bool:
        """Check if a node can be part of a chain."""
        if node_id in consumed:
            return False
        node = nodes_by_id.get(node_id)
        if not node:
            return False
        if node["type"] not in COLLAPSIBLE_TYPES:
            return False
        # Temporal effects can't be collapsed
        if is_temporal(node):
            return False
        # Effects CAN be collapsed if they have an FFmpeg mapping.
        # Only fall back to Python interpreter if no mapping exists.
        config = node.get("config", {})
        if node["type"] == "EFFECT":
            effect_name = config.get("effect")
            # Import here to avoid circular imports
            from .ffmpeg_compiler import FFmpegCompiler
            compiler = FFmpegCompiler()
            if compiler.get_mapping(effect_name):
                return True  # Has FFmpeg mapping, can collapse
            elif config.get("effect_path"):
                return False  # No FFmpeg mapping, has Python path, can't collapse
        return True

    def is_chain_boundary(node_id: str) -> bool:
        """Check if a node is a chain boundary (can't be collapsed into)."""
        node = nodes_by_id.get(node_id)
        if not node:
            return True  # Unknown node is a boundary
        # Boundary if: it's a boundary type, or referenced by multiple nodes
        return node["type"] in BOUNDARY_TYPES or ref_count.get(node_id, 0) > 1

    def collect_chain(start_id: str) -> List[str]:
        """Collect a chain of collapsible nodes starting from start_id.

        Walks upstream (toward inputs); the returned list is in
        [end, ..., start] order, i.e. reverse execution order.
        """
        chain = [start_id]
        current = start_id
        while True:
            node = nodes_by_id[current]
            inputs = node.get("inputs", [])
            # Must have exactly one input
            if len(inputs) != 1:
                break
            next_id = inputs[0]
            # Stop if next is a boundary or already consumed
            if is_chain_boundary(next_id) or not is_collapsible(next_id):
                break
            # Stop if next is referenced by others besides current
            if ref_count.get(next_id, 0) > 1:
                break
            chain.append(next_id)
            current = next_id
        return chain

    # Process nodes in reverse order (from outputs toward inputs).
    # This ensures we find complete chains starting from their end.
    # First, topologically sort to get dependency order.
    sorted_ids = []
    visited = set()

    def topo_visit(node_id: str):
        # Depth-first post-order: a node's inputs land before the node itself.
        # NOTE(review): ids missing from nodes_by_id are still appended here,
        # which would KeyError in the loop below — presumably inputs always
        # reference known nodes at this point; verify against callers.
        if node_id in visited:
            return
        visited.add(node_id)
        node = nodes_by_id.get(node_id)
        if node:
            for inp in node.get("inputs", []):
                topo_visit(inp)
        sorted_ids.append(node_id)

    for node in nodes:
        topo_visit(node["id"])
    # Process in reverse topological order (outputs first)
    result_nodes = []
    for node_id in reversed(sorted_ids):
        node = nodes_by_id[node_id]
        if node_id in consumed:
            continue
        if not is_collapsible(node_id):
            # Keep boundary nodes as-is
            result_nodes.append(node)
            continue
        # Check if this node is the start of a chain (output end).
        # A node is a chain start if it's collapsible and either:
        # - Referenced by a boundary node
        # - Referenced by multiple nodes
        # - Is the output node
        # For now, collect chain going backwards from this node
        chain = collect_chain(node_id)
        if len(chain) == 1:
            # Single node, no collapse needed
            result_nodes.append(node)
            continue
        # Collapse the chain into a COMPOUND node.
        # Chain is [end, ..., start] order (backwards from output).
        # The compound node:
        # - Has the same ID as the chain end (for reference stability)
        # - Takes input from what the chain start originally took
        # - Has a filter_chain config with all the nodes in order
        chain_start = chain[-1]  # First to execute
        chain_end = chain[0]  # Last to execute
        start_node = nodes_by_id[chain_start]
        end_node = nodes_by_id[chain_end]  # NOTE(review): unused local
        # Build filter chain config (in execution order: start to end)
        filter_chain = []
        for chain_node_id in reversed(chain):
            chain_node = nodes_by_id[chain_node_id]
            filter_chain.append({
                "type": chain_node["type"],
                "config": chain_node.get("config", {}),
            })
        compound_node = {
            "id": chain_end,  # Keep the end ID for reference stability
            "type": "COMPOUND",
            "config": {
                "filter_chain": filter_chain,
                # Include effects registry so executor can load only declared effects
                "effects_registry": registry.get("effects", {}),
            },
            "inputs": start_node.get("inputs", []),
            "name": f"compound_{len(filter_chain)}_effects",
        }
        result_nodes.append(compound_node)
        # Mark all chain nodes as consumed
        for chain_node_id in chain:
            consumed.add(chain_node_id)
    return result_nodes
def _expand_slice_on(
    node: Dict,
    analysis_data: Dict[str, Any],
    registry: Dict,
    sources: Dict[str, str] = None,
    cluster_key: str = None,
    encoding: Dict = None,
    named_analysis: Dict = None,
) -> List[Dict]:
    """
    Expand a SLICE_ON node into primitive SEGMENT + EFFECT + SEQUENCE nodes.

    Supports two modes:
    1. Legacy: :effect and :pattern parameters
    2. Lambda: :fn parameter with reducer function

    Lambda syntax:
        (slice-on analysis
          :times times
          :init 0
          :fn (lambda [acc i start end]
            {:source video
             :effects (if (odd? i) [invert] [])
             :acc (inc acc)}))

    When all beats produce composition-mode results (layers + compositor)
    with the same layer structure, consecutive beats are automatically merged
    into fewer compositions with time-varying parameter bindings. This can
    reduce thousands of nodes to a handful.

    Args:
        node: The SLICE_ON node to expand
        analysis_data: Analysis results containing times array
        registry: Recipe registry with effect definitions
        sources: Map of source names to node IDs
        cluster_key: Optional cluster key for hashing (unused here)
        encoding: Encoding settings dict; "fps" (default 30) drives the
            frame-alignment of segment boundaries
        named_analysis: Mutable dict to inject synthetic analysis tracks into

    Returns:
        List of expanded nodes (segments, effects, and a trailing LIST node
        reusing the original node ID)

    Raises:
        ValueError: If no times are found at the configured path, the
            reducer returns a non-dict, or legacy mode has no video input.
    """
    from .evaluator import evaluate, EvalError
    from .parser import Lambda, Symbol
    config = node.get("config", {})
    node_inputs = node.get("inputs", [])
    sources = sources or {}
    # Extract times: walk a dotted path (default "times") into analysis_data.
    times_path = config.get("times_path", "times")
    times = analysis_data
    for key in times_path.split("."):
        times = times[key]
    if not times:
        raise ValueError(f"No times found at path '{times_path}' in analysis")
    # Default video input (first input after analysis)
    default_video = node_inputs[0] if node_inputs else None
    expanded_nodes = []
    sequence_inputs = []
    base_id = node["id"][:8]
    # Check for lambda-based reducer
    reducer_fn = config.get("fn")
    if isinstance(reducer_fn, Lambda):
        # Lambda mode - evaluate function for each slice
        acc = config.get("init", 0)
        # Each slice spans (previous time, current time), starting at 0.
        slice_times = list(zip([0] + times[:-1], times))
        # Frame-accurate timing calculation.
        # Align ALL times to frame boundaries to prevent accumulating drift.
        fps = (encoding or {}).get("fps", 30)
        frame_duration = 1.0 / fps
        # Get total duration from analysis data (beats analyzer includes this).
        # Falls back to config target_duration for backwards compatibility.
        total_duration = analysis_data.get("duration") or config.get("target_duration")
        # Pre-compute frame-aligned cumulative times
        cumulative_frames = [0]  # Start at frame 0
        for t in times:
            # Round to nearest frame boundary
            frames = round(t * fps)
            cumulative_frames.append(frames)
        # If total duration known, ensure last segment extends to it exactly
        if total_duration is not None:
            target_frames = round(total_duration * fps)
            if target_frames > cumulative_frames[-1]:
                cumulative_frames[-1] = target_frames
        # Pre-compute frame-aligned start times and durations for each slice
        frame_aligned_starts = []
        frame_aligned_durations = []
        for i in range(len(cumulative_frames) - 1):
            start_frames = cumulative_frames[i]
            end_frames = cumulative_frames[i + 1]
            frame_aligned_starts.append(start_frames * frame_duration)
            frame_aligned_durations.append((end_frames - start_frames) * frame_duration)
        # Phase 1: Evaluate all lambdas upfront
        videos = config.get("videos", [])
        all_results = []
        all_timings = []  # (seg_start, seg_duration) per valid beat
        original_indices = []  # original beat index for each result
        for i, (start, end) in enumerate(slice_times):
            # Skip degenerate (zero/negative length) slices.
            if start >= end:
                continue
            # Build environment with sources, effects, and builtins
            env = dict(sources)
            # Add effect names so they can be referenced as symbols
            for effect_name in registry.get("effects", {}):
                env[effect_name] = effect_name
            # Make :videos list available to lambda
            if videos:
                env["videos"] = videos
            env["acc"] = acc
            env["i"] = i
            env["start"] = start
            env["end"] = end
            # Evaluate the reducer
            result = evaluate([reducer_fn, Symbol("acc"), Symbol("i"),
                               Symbol("start"), Symbol("end")], env)
            if not isinstance(result, dict):
                raise ValueError(f"Reducer must return a dict, got {type(result)}")
            # Extract accumulator (threaded into the next iteration)
            acc = result.get("acc", acc)
            # Segment timing: use frame-aligned values to prevent drift.
            # Lambda can override with explicit start/duration/end.
            if result.get("start") is not None or result.get("duration") is not None or result.get("end") is not None:
                # Explicit timing from lambda - use as-is
                seg_start = result.get("start", start)
                seg_duration = result.get("duration")
                if seg_duration is None:
                    if result.get("end") is not None:
                        seg_duration = result["end"] - seg_start
                    else:
                        seg_duration = end - start
            else:
                # Default: use frame-aligned start and duration to prevent accumulated drift
                seg_start = frame_aligned_starts[i] if i < len(frame_aligned_starts) else start
                seg_duration = frame_aligned_durations[i] if i < len(frame_aligned_durations) else (end - start)
            all_results.append(result)
            all_timings.append((seg_start, seg_duration))
            original_indices.append(i)
        # Phase 2: Merge or expand
        all_composition = (
            len(all_results) > 1
            and all("layers" in r for r in all_results)
            and named_analysis is not None
        )
        if all_composition:
            # All beats are composition mode — try to merge consecutive
            # beats with the same layer structure
            _merge_composition_beats(
                all_results, all_timings, base_id, videos, registry,
                expanded_nodes, sequence_inputs, named_analysis,
            )
        else:
            # Fallback: expand each beat individually
            for idx, result in enumerate(all_results):
                orig_i = original_indices[idx]
                seg_start, seg_duration = all_timings[idx]
                if "layers" in result:
                    # COMPOSITION MODE — multi-source with per-layer effects + compositor
                    _expand_composition_beat(
                        result, orig_i, base_id, videos, registry,
                        seg_start, seg_duration, expanded_nodes, sequence_inputs,
                    )
                else:
                    # SINGLE-SOURCE MODE (existing behavior)
                    source_name = result.get("source")
                    effects = result.get("effects", [])
                    # Resolve source to node ID: by name, by raw node ID,
                    # or fall back to the default video input.
                    if isinstance(source_name, Symbol):
                        source_name = source_name.name
                    valid_node_ids = set(sources.values())
                    if source_name in sources:
                        video_input = sources[source_name]
                    elif source_name in valid_node_ids:
                        video_input = source_name
                    else:
                        video_input = default_video
                    # Create SEGMENT node
                    segment_id = f"{base_id}_seg_{orig_i:04d}"
                    segment_node = {
                        "id": segment_id,
                        "type": "SEGMENT",
                        "config": {
                            "start": seg_start,
                            "duration": seg_duration,
                        },
                        "inputs": [video_input],
                    }
                    expanded_nodes.append(segment_node)
                    # Apply effects chain
                    current_input = segment_id
                    for j, effect in enumerate(effects):
                        effect_name, effect_params = _parse_effect_spec(effect)
                        if not effect_name:
                            continue
                        effect_id = f"{base_id}_fx_{orig_i:04d}_{j}"
                        effect_entry = registry.get("effects", {}).get(effect_name, {})
                        effect_config = {
                            "effect": effect_name,
                            "effect_path": effect_entry.get("path"),
                        }
                        effect_config.update(effect_params)
                        effect_node = {
                            "id": effect_id,
                            "type": "EFFECT",
                            "config": effect_config,
                            "inputs": [current_input],
                        }
                        expanded_nodes.append(effect_node)
                        current_input = effect_id
                    sequence_inputs.append(current_input)
    else:
        # Legacy mode - :effect and :pattern
        effect_name = config.get("effect")
        effect_path = config.get("effect_path")
        pattern = config.get("pattern", "all")
        video_input = default_video
        if not video_input:
            raise ValueError("SLICE_ON requires video input")
        slice_times = list(zip([0] + times[:-1], times))
        for i, (start, end) in enumerate(slice_times):
            if start >= end:
                continue
            # Determine if effect should be applied on this beat index
            apply_effect = False
            if effect_name:
                if pattern == "all":
                    apply_effect = True
                elif pattern == "odd":
                    apply_effect = (i % 2 == 1)
                elif pattern == "even":
                    apply_effect = (i % 2 == 0)
                elif pattern == "alternate":
                    apply_effect = (i % 2 == 1)
            # Create SEGMENT node
            segment_id = f"{base_id}_seg_{i:04d}"
            segment_node = {
                "id": segment_id,
                "type": "SEGMENT",
                "config": {
                    "start": start,
                    "duration": end - start,
                },
                "inputs": [video_input],
            }
            expanded_nodes.append(segment_node)
            if apply_effect:
                effect_id = f"{base_id}_fx_{i:04d}"
                effect_config = {"effect": effect_name}
                if effect_path:
                    effect_config["effect_path"] = effect_path
                effect_node = {
                    "id": effect_id,
                    "type": "EFFECT",
                    "config": effect_config,
                    "inputs": [segment_id],
                }
                expanded_nodes.append(effect_node)
                sequence_inputs.append(effect_id)
            else:
                sequence_inputs.append(segment_id)
    # Create LIST node to hold all slices (user must explicitly sequence them)
    list_node = {
        "id": node["id"],  # Keep original ID for reference stability
        "type": "LIST",
        "config": {},
        "inputs": sequence_inputs,
    }
    expanded_nodes.append(list_node)
    return expanded_nodes
def _parse_effect_spec(effect):
    """Normalize an effect spec into a (name, params) pair.

    Accepts a Symbol, a plain string, or a dict with an "effect" key plus
    parameter entries; anything else yields (None, {}).
    """
    from .parser import Symbol
    name = None
    params = {}
    if isinstance(effect, Symbol):
        name = effect.name
    elif isinstance(effect, str):
        name = effect
    elif isinstance(effect, dict):
        name = effect.get("effect")
        if isinstance(name, Symbol):
            name = name.name
        # Everything except the effect name itself is a parameter.
        params = {key: val for key, val in effect.items() if key != "effect"}
    return name, params
def _expand_composition_beat(result, beat_idx, base_id, videos, registry,
seg_start, seg_duration, expanded_nodes, sequence_inputs):
"""
Expand a composition-mode beat into per-layer SEGMENT + EFFECT nodes
and a single composition EFFECT node.
Args:
result: Lambda result dict with 'layers' and optional 'compose'
beat_idx: Beat index for ID generation
base_id: Base ID prefix
videos: List of video node IDs from :videos config
registry: Recipe registry with effect definitions
seg_start: Segment start time
seg_duration: Segment duration
expanded_nodes: List to append generated nodes to
sequence_inputs: List to append final composition node ID to
"""
layers = result["layers"]
compose_spec = result.get("compose", {})
layer_outputs = []
for layer_idx, layer in enumerate(layers):
# Resolve video: integer index into videos list, or node ID string
video_ref = layer.get("video")
if isinstance(video_ref, (int, float)):
video_input = videos[int(video_ref)]
else:
video_input = str(video_ref)
# SEGMENT for this layer
segment_id = f"{base_id}_seg_{beat_idx:04d}_L{layer_idx}"
expanded_nodes.append({
"id": segment_id,
"type": "SEGMENT",
"config": {"start": seg_start, "duration": seg_duration},
"inputs": [video_input],
})
# Per-layer EFFECT chain
current = segment_id
for fx_idx, effect in enumerate(layer.get("effects", [])):
effect_name, effect_params = _parse_effect_spec(effect)
if not effect_name:
continue
effect_id = f"{base_id}_fx_{beat_idx:04d}_L{layer_idx}_{fx_idx}"
effect_entry = registry.get("effects", {}).get(effect_name, {})
config = {
"effect": effect_name,
"effect_path": effect_entry.get("path"),
}
config.update(effect_params)
expanded_nodes.append({
"id": effect_id,
"type": "EFFECT",
"config": config,
"inputs": [current],
})
current = effect_id
layer_outputs.append(current)
# Composition EFFECT node
compose_name = compose_spec.get("effect", "blend_multi")
compose_id = f"{base_id}_comp_{beat_idx:04d}"
compose_entry = registry.get("effects", {}).get(compose_name, {})
compose_config = {
"effect": compose_name,
"effect_path": compose_entry.get("path"),
"multi_input": True,
}
for k, v in compose_spec.items():
if k != "effect":
compose_config[k] = v
expanded_nodes.append({
"id": compose_id,
"type": "EFFECT",
"config": compose_config,
"inputs": layer_outputs,
})
sequence_inputs.append(compose_id)
def _fingerprint_composition(result):
"""Create a hashable fingerprint of a composition beat's layer structure.
Beats with the same fingerprint have the same video refs, effect names,
and compositor type — only parameter values differ. Such beats can be
merged into a single composition with time-varying bindings.
"""
layers = result.get("layers", [])
compose = result.get("compose", {})
layer_fps = []
for layer in layers:
video_ref = layer.get("video")
effect_names = tuple(
_parse_effect_spec(e)[0] for e in layer.get("effects", [])
)
layer_fps.append((video_ref, effect_names))
compose_name = compose.get("effect", "blend_multi")
# Include static compose params (excluding list-valued params like weights)
static_compose = tuple(sorted(
(k, v) for k, v in compose.items()
if k not in ("effect", "weights") and isinstance(v, (str, int, float, bool))
))
return (len(layers), tuple(layer_fps), compose_name, static_compose)
def _merge_composition_beats(
    all_results, all_timings, base_id, videos, registry,
    expanded_nodes, sequence_inputs, named_analysis,
):
    """Collapse runs of structurally identical consecutive composition beats.

    Beats are fingerprinted; consecutive beats sharing a fingerprint form a
    run. Runs of two or more beats are merged into a single composition with
    synthetic time-varying analysis tracks, while singleton runs fall back
    to ordinary per-beat expansion.
    """
    import sys
    fingerprints = [_fingerprint_composition(r) for r in all_results]
    # Partition indices into (start, end_exclusive) runs of equal fingerprints.
    groups = []
    run_start = 0
    for idx in range(1, len(fingerprints)):
        if fingerprints[idx] != fingerprints[run_start]:
            groups.append((run_start, idx))
            run_start = idx
    groups.append((run_start, len(fingerprints)))
    print(f" Composition merging: {len(all_results)} beats -> {len(groups)} groups", file=sys.stderr)
    for group_idx, (lo, hi) in enumerate(groups):
        if hi - lo == 1:
            # Singleton run — expand this one beat normally.
            seg_start, seg_duration = all_timings[lo]
            _expand_composition_beat(
                all_results[lo], lo, base_id, videos, registry,
                seg_start, seg_duration, expanded_nodes, sequence_inputs,
            )
        else:
            # Merge the whole run into one composition with time-varying bindings.
            _merge_composition_group(
                all_results, all_timings,
                list(range(lo, hi)),
                base_id, group_idx, videos, registry,
                expanded_nodes, sequence_inputs, named_analysis,
            )
def _merge_composition_group(
    all_results, all_timings, group_indices,
    base_id, group_idx, videos, registry,
    expanded_nodes, sequence_inputs, named_analysis,
):
    """Merge a group of same-structure composition beats into one composition.

    Creates:
    - One SEGMENT per layer (spanning full group duration)
    - One EFFECT per layer with time-varying params via synthetic analysis tracks
    - One compositor EFFECT with time-varying weights via synthetic tracks

    Args:
        all_results: Per-beat composition descriptors ({"layers": [...], "compose": {...}}).
        all_timings: Per-beat (start, duration) pairs, parallel to all_results.
        group_indices: Indices of the consecutive beats merged by this call.
        base_id: Prefix used when generating node IDs.
        group_idx: Ordinal of this group within the merge pass (used in node IDs).
        videos: Video input node IDs, indexed by integer layer video references.
        registry: Recipe registry, used to resolve effect paths.
        expanded_nodes: Output list of plan nodes; appended in place.
        sequence_inputs: Output list; the compositor node ID is appended.
        named_analysis: Output dict; synthetic analysis tracks are added here.
    """
    import sys
    # All beats in the group share one fingerprint, so the first beat
    # defines the layer structure and compositor spec for the whole run.
    first = all_results[group_indices[0]]
    layers = first["layers"]
    compose_spec = first.get("compose", {})
    num_layers = len(layers)
    # Group timing: span from the first beat's start to the last beat's end.
    first_start = all_timings[group_indices[0]][0]
    last_start, last_dur = all_timings[group_indices[-1]]
    group_duration = (last_start + last_dur) - first_start
    # Beat start times for synthetic tracks (absolute times)
    beat_times = [float(all_timings[i][0]) for i in group_indices]
    print(f" Group {group_idx}: {len(group_indices)} beats, "
          f"{first_start:.1f}s -> {first_start + group_duration:.1f}s "
          f"({num_layers} layers)", file=sys.stderr)
    # --- Per-layer segments and effects ---
    layer_outputs = []
    for layer_idx in range(num_layers):
        layer = layers[layer_idx]
        # Resolve video input: numeric refs index into `videos`; anything
        # else is treated as a node ID / name directly.
        video_ref = layer.get("video")
        if isinstance(video_ref, (int, float)):
            video_input = videos[int(video_ref)]
        else:
            video_input = str(video_ref)
        # SEGMENT for this layer (full group duration)
        segment_id = f"{base_id}_seg_G{group_idx:03d}_L{layer_idx}"
        expanded_nodes.append({
            "id": segment_id,
            "type": "SEGMENT",
            "config": {"start": first_start, "duration": group_duration},
            "inputs": [video_input],
        })
        # Per-layer EFFECT chain
        current = segment_id
        effects = layer.get("effects", [])
        for fx_idx, effect in enumerate(effects):
            effect_name, first_params = _parse_effect_spec(effect)
            if not effect_name:
                continue
            effect_id = f"{base_id}_fx_G{group_idx:03d}_L{layer_idx}_{fx_idx}"
            effect_entry = registry.get("effects", {}).get(effect_name, {})
            fx_config = {
                "effect": effect_name,
                "effect_path": effect_entry.get("path"),
            }
            # For each param, check if it varies across beats
            for param_name, first_val in first_params.items():
                values = []
                for bi in group_indices:
                    beat_layer = all_results[bi]["layers"][layer_idx]
                    beat_effects = beat_layer.get("effects", [])
                    if fx_idx < len(beat_effects):
                        _, beat_params = _parse_effect_spec(beat_effects[fx_idx])
                        values.append(float(beat_params.get(param_name, first_val)))
                    else:
                        # Beat is missing this effect; use the first beat's
                        # value so the track stays fully populated.
                        values.append(float(first_val))
                # Check if all values are identical
                if all(v == values[0] for v in values):
                    fx_config[param_name] = values[0]
                else:
                    # Create synthetic analysis track
                    # Prefix with 'syn_' to ensure valid S-expression symbol
                    # (base_id may start with digits, which the parser splits)
                    track_name = f"syn_{base_id}_L{layer_idx}_fx{fx_idx}_{param_name}"
                    named_analysis[track_name] = {
                        "times": beat_times,
                        "values": values,
                    }
                    fx_config[param_name] = {
                        "_binding": True,
                        "source": track_name,
                        "feature": "values",
                        "range": [0.0, 1.0],  # pass-through
                    }
            expanded_nodes.append({
                "id": effect_id,
                "type": "EFFECT",
                "config": fx_config,
                "inputs": [current],
            })
            current = effect_id
        layer_outputs.append(current)
    # --- Compositor ---
    compose_name = compose_spec.get("effect", "blend_multi")
    compose_id = f"{base_id}_comp_G{group_idx:03d}"
    compose_entry = registry.get("effects", {}).get(compose_name, {})
    compose_config = {
        "effect": compose_name,
        "effect_path": compose_entry.get("path"),
        "multi_input": True,
    }
    for k, v in compose_spec.items():
        if k == "effect":
            continue
        if isinstance(v, list):
            # List param (e.g., weights) — check each element
            merged_list = []
            for elem_idx in range(len(v)):
                elem_values = []
                for bi in group_indices:
                    beat_compose = all_results[bi].get("compose", {})
                    beat_v = beat_compose.get(k, v)
                    if isinstance(beat_v, list) and elem_idx < len(beat_v):
                        elem_values.append(float(beat_v[elem_idx]))
                    else:
                        # Fall back to the first beat's element value.
                        elem_values.append(float(v[elem_idx]))
                if all(ev == elem_values[0] for ev in elem_values):
                    merged_list.append(elem_values[0])
                else:
                    track_name = f"syn_{base_id}_comp_{k}_{elem_idx}"
                    named_analysis[track_name] = {
                        "times": beat_times,
                        "values": elem_values,
                    }
                    merged_list.append({
                        "_binding": True,
                        "source": track_name,
                        "feature": "values",
                        "range": [0.0, 1.0],
                    })
            compose_config[k] = merged_list
        elif isinstance(v, (int, float)):
            # Scalar param — check if it varies
            values = []
            for bi in group_indices:
                beat_compose = all_results[bi].get("compose", {})
                values.append(float(beat_compose.get(k, v)))
            if all(val == values[0] for val in values):
                compose_config[k] = values[0]
            else:
                track_name = f"syn_{base_id}_comp_{k}"
                named_analysis[track_name] = {
                    "times": beat_times,
                    "values": values,
                }
                compose_config[k] = {
                    "_binding": True,
                    "source": track_name,
                    "feature": "values",
                    "range": [0.0, 1.0],
                }
        else:
            # String or other — keep as-is
            compose_config[k] = v
    expanded_nodes.append({
        "id": compose_id,
        "type": "EFFECT",
        "config": compose_config,
        "inputs": layer_outputs,
    })
    sequence_inputs.append(compose_id)
def _parse_construct_params(params_list: list) -> tuple:
"""
Parse :params block in a construct definition.
Syntax:
(
(param_name :type string :default "value" :desc "description")
)
Returns:
(param_names, param_defaults) where param_names is a list of strings
and param_defaults is a dict of param_name -> default_value
"""
param_names = []
param_defaults = {}
for param_def in params_list:
if not isinstance(param_def, list) or len(param_def) < 1:
continue
# First element is the parameter name
first = param_def[0]
if isinstance(first, Symbol):
param_name = first.name
elif isinstance(first, str):
param_name = first
else:
continue
param_names.append(param_name)
# Parse keyword arguments
default = None
i = 1
while i < len(param_def):
item = param_def[i]
if isinstance(item, Keyword):
if i + 1 >= len(param_def):
break
kw_value = param_def[i + 1]
if item.name == "default":
default = kw_value
# We could also parse :type, :range, :choices, :desc here
i += 2
else:
i += 1
param_defaults[param_name] = default
return param_names, param_defaults
def _expand_construct(
    node: Dict,
    registry: Dict,
    sources: Dict[str, str],
    analysis_data: Dict[str, Dict],
    recipe_dir: Path,
    cluster_key: str = None,
    encoding: Dict = None,
) -> List[Dict]:
    """
    Expand a user-defined CONSTRUCT node.

    Loads the construct definition from .sexp file, evaluates it with
    the provided arguments, and converts the result into segment nodes.

    Args:
        node: The CONSTRUCT node to expand
        registry: Recipe registry
        sources: Map of source names to node IDs
        analysis_data: Analysis results (analysis_id -> {times, values})
        recipe_dir: Recipe directory for resolving paths
        cluster_key: Optional cluster key for hashing (accepted but unused here)
        encoding: Encoding config (accepted but unused here)

    Returns:
        List of expanded nodes (segments, effects, list). The trailing LIST
        node reuses this CONSTRUCT node's ID so downstream references hold.

    Raises:
        ValueError: On missing construct file, missing define-construct,
            legacy parameter syntax, unknown keyword parameters, undefined
            symbols, or a body that does not evaluate to a list.
    """
    from .parser import parse_all, Symbol
    from .evaluator import evaluate
    config = node.get("config", {})
    construct_name = config.get("construct_name")
    construct_path = config.get("construct_path")
    args = config.get("args", [])
    # Load construct definition
    full_path = recipe_dir / construct_path
    if not full_path.exists():
        raise ValueError(f"Construct file not found: {full_path}")
    print(f" Loading construct: {construct_name} from {construct_path}", file=sys.stderr)
    construct_text = full_path.read_text()
    construct_sexp = parse_all(construct_text)
    # Parse define-construct: (define-construct name "desc" (params...) body)
    if not isinstance(construct_sexp, list):
        construct_sexp = [construct_sexp]
    # Process imports (effect, construct declarations) in the construct file
    # These extend the registry for this construct's scope
    local_registry = dict(registry)  # Copy parent registry
    construct_def = None
    for expr in construct_sexp:
        if isinstance(expr, list) and expr and isinstance(expr[0], Symbol):
            form_name = expr[0].name
            if form_name == "effect":
                # (effect name :path "...")
                effect_name = expr[1].name if isinstance(expr[1], Symbol) else expr[1]
                # Parse kwargs
                i = 2
                kwargs = {}
                while i < len(expr):
                    if isinstance(expr[i], Keyword):
                        kwargs[expr[i].name] = expr[i + 1] if i + 1 < len(expr) else None
                        i += 2
                    else:
                        i += 1
                local_registry.setdefault("effects", {})[effect_name] = {
                    "path": kwargs.get("path"),
                    "cid": kwargs.get("cid"),
                }
                print(f" Construct imports effect: {effect_name}", file=sys.stderr)
            elif form_name == "define-construct":
                construct_def = expr
    if not construct_def:
        raise ValueError(f"No define-construct found in {construct_path}")
    # Use local_registry instead of registry from here
    registry = local_registry
    # Parse define-construct - requires :params syntax:
    # (define-construct name
    #   :params (
    #     (param1 :type string :default "value" :desc "description")
    #   )
    #   body)
    #
    # Legacy syntax (define-construct name "desc" (param1 param2) body) is not supported.
    def_name = construct_def[1].name if isinstance(construct_def[1], Symbol) else construct_def[1]
    params = []  # List of param names
    param_defaults = {}  # param_name -> default value
    body = None
    # NOTE(review): found_params is set below but never read afterwards.
    found_params = False
    idx = 2
    while idx < len(construct_def):
        item = construct_def[idx]
        if isinstance(item, Keyword) and item.name == "params":
            # :params syntax
            if idx + 1 >= len(construct_def):
                raise ValueError(f"Construct '{def_name}': Missing params list after :params keyword")
            params_list = construct_def[idx + 1]
            params, param_defaults = _parse_construct_params(params_list)
            found_params = True
            idx += 2
        elif isinstance(item, Keyword):
            # Skip other keywords (like :desc)
            idx += 2
        elif isinstance(item, str):
            # Skip description strings (but warn about legacy format)
            print(f" Warning: Description strings in define-construct are deprecated", file=sys.stderr)
            idx += 1
        elif body is None:
            # First non-keyword, non-string item is the body
            if isinstance(item, list) and item:
                first_elem = item[0]
                # Check for legacy params syntax and reject it
                if isinstance(first_elem, Symbol) and first_elem.name not in ("let", "let*", "if", "when", "do", "begin", "->", "map", "filter", "fn", "reduce", "nth"):
                    # Could be legacy params if all items are just symbols
                    if all(isinstance(p, Symbol) for p in item):
                        raise ValueError(
                            f"Construct '{def_name}': Legacy parameter syntax (param1 param2) is not supported. "
                            f"Use :params block instead."
                        )
            body = item
            idx += 1
        else:
            idx += 1
    if body is None:
        raise ValueError(f"No body found in define-construct {def_name}")
    # Build environment with sources and analysis data
    env = dict(sources)
    # Add bindings from compiler (video-a, video-b, etc.)
    if "bindings" in config:
        env.update(config["bindings"])
    # Add effect names so they can be referenced as symbols
    for effect_name in registry.get("effects", {}):
        env[effect_name] = effect_name
    # Map analysis node IDs to their data with :times and :values
    for analysis_id, data in analysis_data.items():
        # Find the name this analysis was bound to
        for name, node_id in sources.items():
            if node_id == analysis_id or name.endswith("-data"):
                env[name] = data
        env[analysis_id] = data
    # Apply param defaults first (for :params syntax)
    for param_name, default_value in param_defaults.items():
        if default_value is not None:
            env[param_name] = default_value
    # Bind positional args to params (overrides defaults)
    param_names = [p.name if isinstance(p, Symbol) else p for p in params]
    for i, param in enumerate(param_names):
        if i < len(args):
            arg = args[i]
            # Resolve node IDs to their data if it's analysis
            if isinstance(arg, str) and arg in analysis_data:
                env[param] = analysis_data[arg]
            else:
                env[param] = arg
    # Helper to resolve node IDs to analysis data recursively
    def resolve_value(val):
        """Resolve node IDs and symbols in a value, including inside dicts/lists."""
        if isinstance(val, str) and val in analysis_data:
            return analysis_data[val]
        elif isinstance(val, str) and val in env:
            return env[val]
        elif isinstance(val, Symbol):
            if val.name in env:
                return env[val.name]
            return val
        elif isinstance(val, dict):
            return {k: resolve_value(v) for k, v in val.items()}
        elif isinstance(val, list):
            return [resolve_value(v) for v in val]
        return val
    # Validate and bind keyword arguments from the config (excluding internal keys)
    # These may be S-expressions that need evaluation (e.g., lambdas)
    # or Symbols that need resolution from bindings
    internal_keys = {"construct_name", "construct_path", "args", "bindings"}
    known_params = set(param_names) | set(param_defaults.keys())
    for key, value in config.items():
        if key not in internal_keys:
            # Convert key to valid identifier (replace - with _) for checking
            param_key = key.replace("-", "_")
            if param_key not in known_params:
                raise ValueError(
                    f"Construct '{def_name}': Unknown parameter '{key}'. "
                    f"Valid parameters are: {', '.join(sorted(known_params)) if known_params else '(none)'}"
                )
            # Evaluate if it's an expression (list starting with Symbol)
            if isinstance(value, list) and value and isinstance(value[0], Symbol):
                env[param_key] = evaluate(value, env)
            elif isinstance(value, Symbol):
                # Resolve Symbol from env/bindings, then resolve any node IDs in the value
                if value.name in env:
                    env[param_key] = resolve_value(env[value.name])
                else:
                    raise ValueError(f"Undefined symbol in construct arg: {value.name}")
            else:
                # Resolve node IDs inside dicts/lists
                env[param_key] = resolve_value(value)
    # Evaluate construct body
    print(f" Evaluating construct with params: {param_names}", file=sys.stderr)
    segments = evaluate(body, env)
    if not isinstance(segments, list):
        raise ValueError(f"Construct must return a list of segments, got {type(segments)}")
    print(f" Construct produced {len(segments)} segments", file=sys.stderr)
    # Convert segment descriptors to plan nodes
    expanded_nodes = []
    sequence_inputs = []
    base_id = node["id"][:8]
    for i, seg in enumerate(segments):
        if not isinstance(seg, dict):
            continue
        source_ref = seg.get("source")
        start = seg.get("start", 0)
        print(f" DEBUG segment {i}: source={str(source_ref)[:20]}... start={start}", file=sys.stderr)
        end = seg.get("end")
        duration = seg.get("duration") or (end - start if end else 1.0)
        effects = seg.get("effects", [])
        # Resolve source reference to node ID
        source_id = sources.get(source_ref, source_ref) if isinstance(source_ref, str) else source_ref
        # Create segment node
        segment_id = f"{base_id}_seg_{i:04d}"
        segment_node = {
            "id": segment_id,
            "type": "SEGMENT",
            "config": {
                "start": start,
                "duration": duration,
            },
            "inputs": [source_id] if source_id else [],
        }
        expanded_nodes.append(segment_node)
        # Add effects if specified
        if effects:
            prev_id = segment_id
            for j, eff in enumerate(effects):
                effect_name = eff.get("effect") if isinstance(eff, dict) else eff
                effect_id = f"{base_id}_fx_{i:04d}_{j:02d}"
                # Look up effect_path from registry (prevents collapsing Python effects)
                effect_entry = registry.get("effects", {}).get(effect_name, {})
                effect_config = {
                    "effect": effect_name,
                    **{k: v for k, v in (eff.items() if isinstance(eff, dict) else []) if k != "effect"},
                }
                if effect_entry.get("path"):
                    effect_config["effect_path"] = effect_entry["path"]
                effect_node = {
                    "id": effect_id,
                    "type": "EFFECT",
                    "config": effect_config,
                    "inputs": [prev_id],
                }
                expanded_nodes.append(effect_node)
                prev_id = effect_id
            sequence_inputs.append(prev_id)
        else:
            sequence_inputs.append(segment_id)
    # Create LIST node
    list_node = {
        "id": node["id"],
        "type": "LIST",
        "config": {},
        "inputs": sequence_inputs,
    }
    expanded_nodes.append(list_node)
    return expanded_nodes
def _expand_nodes(
    nodes: List[Dict],
    registry: Dict,
    recipe_dir: Path,
    source_paths: Dict[str, Path],
    work_dir: Path = None,
    cluster_key: str = None,
    on_analysis: Callable[[str, Dict], None] = None,
    encoding: Dict = None,
    pre_analysis: Dict[str, Dict] = None,
) -> tuple:
    """
    Expand dynamic nodes (SLICE_ON) by running analyzers.

    Processes nodes in dependency order:
    1. SOURCE nodes: resolve file paths
    2. SEGMENT nodes: pre-execute if needed for analysis
    3. ANALYZE nodes: run analyzers (or use pre_analysis), store results
    4. SLICE_ON nodes: expand using analysis results

    Args:
        nodes: List of compiled nodes
        registry: Recipe registry
        recipe_dir: Directory for resolving relative paths
        source_paths: Resolved source paths (id -> path); mutated in place
        work_dir: Working directory for temporary files (created if None)
        cluster_key: Optional cluster key
        on_analysis: Callback when analysis completes (node_id, results)
        encoding: Encoding config, forwarded to expansion helpers
        pre_analysis: Pre-computed analysis data (name -> results)

    Returns:
        Tuple of (expanded_nodes, named_analysis) where:
        - expanded_nodes: List with SLICE_ON replaced by primitives
        - named_analysis: Dict of analyzer_name -> {times, values}
    """
    import tempfile
    nodes_by_id = {n["id"]: n for n in nodes}
    sorted_ids = _topological_sort(nodes)
    # Create work directory if needed
    if work_dir is None:
        work_dir = Path(tempfile.mkdtemp(prefix="artdag_plan_"))
    # Track outputs and analysis results
    outputs = {}  # node_id -> output path or analysis data
    analysis_results = {}  # node_id -> analysis dict
    named_analysis = {}  # analyzer_name -> analysis dict (for effect bindings)
    pre_executed = set()  # nodes pre-executed during planning
    expanded = []
    expanded_ids = set()
    for node_id in sorted_ids:
        node = nodes_by_id[node_id]
        node_type = node["type"]
        if node_type == "SOURCE":
            # Resolve source path
            config = node.get("config", {})
            if "path" in config:
                path = recipe_dir / config["path"]
                outputs[node_id] = path.resolve()
                source_paths[node_id] = outputs[node_id]
            expanded.append(node)
            expanded_ids.add(node_id)
        elif node_type == "SEGMENT":
            # Check if this segment's input is resolved
            inputs = node.get("inputs", [])
            if inputs and inputs[0] in outputs:
                input_path = outputs[inputs[0]]
                if isinstance(input_path, Path):
                    # Skip pre-execution if config contains unresolved bindings
                    seg_config = node.get("config", {})
                    has_binding = any(
                        isinstance(v, Binding) or (isinstance(v, dict) and v.get("_binding"))
                        for v in [seg_config.get("start"), seg_config.get("duration"), seg_config.get("end")]
                        if v is not None
                    )
                    if not has_binding:
                        # Pre-execute segment to get output path
                        # This is needed if ANALYZE depends on this segment
                        import sys
                        print(f" Pre-executing segment: {node_id[:16]}...", file=sys.stderr)
                        output_path = _pre_execute_segment(node, input_path, work_dir)
                        outputs[node_id] = output_path
                        pre_executed.add(node_id)
            # Segment stays in the plan regardless of pre-execution.
            expanded.append(node)
            expanded_ids.add(node_id)
        elif node_type == "ANALYZE":
            # Get or run analysis
            config = node.get("config", {})
            analysis_name = node.get("name") or config.get("analyzer")
            # Check for pre-computed analysis first
            if pre_analysis and analysis_name and analysis_name in pre_analysis:
                import sys
                print(f" Using pre-computed analysis: {analysis_name}", file=sys.stderr)
                results = pre_analysis[analysis_name]
            else:
                # Run analyzer to get concrete data
                analyzer_path = config.get("analyzer_path")
                node_inputs = node.get("inputs", [])
                if not node_inputs:
                    raise ValueError(f"ANALYZE node {node_id} has no inputs")
                # Get input path - could be SOURCE or pre-executed SEGMENT
                input_id = node_inputs[0]
                input_path = outputs.get(input_id)
                if input_path is None:
                    raise ValueError(
                        f"ANALYZE input {input_id} not resolved. "
                        "Check that input SOURCE or SEGMENT exists."
                    )
                if not isinstance(input_path, Path):
                    raise ValueError(
                        f"ANALYZE input {input_id} is not a file path: {type(input_path)}"
                    )
                if analyzer_path:
                    full_path = recipe_dir / analyzer_path
                    # Forward all config keys except bookkeeping ones as params.
                    params = {k: v for k, v in config.items()
                              if k not in ("analyzer", "analyzer_path", "cid")}
                    import sys
                    print(f" Running analyzer: {config.get('analyzer', 'unknown')}", file=sys.stderr)
                    results = _run_analyzer(full_path, input_path, params)
                else:
                    raise ValueError(f"ANALYZE node {node_id} missing analyzer_path")
            analysis_results[node_id] = results
            outputs[node_id] = results
            # Store by name for effect binding resolution
            if analysis_name:
                named_analysis[analysis_name] = results
            if on_analysis:
                on_analysis(node_id, results)
            # Keep ANALYZE node in plan (it produces a JSON artifact)
            expanded.append(node)
            expanded_ids.add(node_id)
        elif node_type == "SLICE_ON":
            # Expand into primitives using analysis results
            inputs = node.get("inputs", [])
            config = node.get("config", {})
            # Lambda mode can have just 1 input (analysis), legacy needs 2 (video + analysis)
            has_lambda = "fn" in config
            if has_lambda:
                if len(inputs) < 1:
                    raise ValueError(f"SLICE_ON {node_id} requires analysis input")
                analysis_id = inputs[0]  # First input is analysis
            else:
                if len(inputs) < 2:
                    raise ValueError(f"SLICE_ON {node_id} requires video and analysis inputs")
                analysis_id = inputs[1]
            if analysis_id not in analysis_results:
                raise ValueError(
                    f"SLICE_ON {node_id} analysis input {analysis_id} not found"
                )
            # Build sources map: name -> node_id
            # This lets the lambda reference videos by name
            sources = {}
            for n in nodes:
                if n.get("name"):
                    sources[n["name"]] = n["id"]
            analysis_data = analysis_results[analysis_id]
            slice_nodes = _expand_slice_on(node, analysis_data, registry, sources, cluster_key, encoding, named_analysis)
            for sn in slice_nodes:
                if sn["id"] not in expanded_ids:
                    expanded.append(sn)
                    expanded_ids.add(sn["id"])
        elif node_type == "CONSTRUCT":
            # Expand user-defined construct
            config = node.get("config", {})
            construct_name = config.get("construct_name")
            construct_path = config.get("construct_path")
            if not construct_path:
                raise ValueError(f"CONSTRUCT {node_id} missing path")
            # Build sources map
            sources = {}
            for n in nodes:
                if n.get("name"):
                    sources[n["name"]] = n["id"]
            # Get analysis data if referenced
            inputs = node.get("inputs", [])
            analysis_data = {}
            for inp in inputs:
                if inp in analysis_results:
                    analysis_data[inp] = analysis_results[inp]
            construct_nodes = _expand_construct(
                node, registry, sources, analysis_data, recipe_dir, cluster_key, encoding
            )
            for cn in construct_nodes:
                if cn["id"] not in expanded_ids:
                    expanded.append(cn)
                    expanded_ids.add(cn["id"])
        else:
            # Keep other nodes as-is
            expanded.append(node)
            expanded_ids.add(node_id)
    return expanded, named_analysis
def create_plan(
    recipe: CompiledRecipe,
    inputs: Dict[str, str] = None,
    recipe_dir: Path = None,
    cluster_key: str = None,
    on_analysis: Callable[[str, Dict], None] = None,
    pre_analysis: Dict[str, Dict] = None,
) -> ExecutionPlanSexp:
    """
    Create an execution plan from a compiled recipe.

    Args:
        recipe: Compiled S-expression recipe
        inputs: Mapping of input names to content hashes
        recipe_dir: Directory for resolving relative paths (required for analyzers)
        cluster_key: Optional cluster key for cache isolation
        on_analysis: Callback when analysis completes (node_id, results)
        pre_analysis: Pre-computed analysis data (name -> results), skips running analyzers

    Returns:
        ExecutionPlanSexp with all cache IDs computed

    Raises:
        ValueError: If the recipe contains expandable nodes (SLICE_ON,
            CONSTRUCT) but recipe_dir is None.

    Example:
        >>> recipe = compile_string('(recipe "test" (-> (source cat) (effect identity)))')
        >>> plan = create_plan(recipe, inputs={}, recipe_dir=Path("."))
        >>> print(plan.to_string())
    """
    inputs = inputs or {}
    # Compute source hash as CID (SHA256 of raw bytes) - this IS the content address
    source_hash = hashlib.sha256(recipe.source_text.encode('utf-8')).hexdigest() if recipe.source_text else ""
    # Compute params hash (use JSON + SHA256 for consistency with cache.py)
    if recipe.resolved_params:
        import json
        params_str = json.dumps(recipe.resolved_params, sort_keys=True, default=str)
        params_hash = hashlib.sha256(params_str.encode()).hexdigest()
    else:
        params_hash = ""
    # Check if recipe has expandable nodes (SLICE_ON, etc.)
    has_expandable = any(n["type"] in EXPANDABLE_TYPES for n in recipe.nodes)
    named_analysis = {}
    if has_expandable:
        if recipe_dir is None:
            raise ValueError("recipe_dir required for recipes with SLICE_ON nodes")
        # Expand dynamic nodes (runs analyzers, expands SLICE_ON)
        source_paths = {}
        expanded_nodes, named_analysis = _expand_nodes(
            recipe.nodes,
            recipe.registry,
            recipe_dir,
            source_paths,
            cluster_key=cluster_key,
            on_analysis=on_analysis,
            encoding=recipe.encoding,
            pre_analysis=pre_analysis,
        )
        # Expand LIST inputs in SEQUENCE nodes
        expanded_nodes = _expand_list_inputs(expanded_nodes)
        # Collapse effect chains after expansion
        collapsed_nodes = _collapse_effect_chains(expanded_nodes, recipe.registry)
    else:
        # No expansion needed
        collapsed_nodes = _collapse_effect_chains(recipe.nodes, recipe.registry)
    # Build node lookup from collapsed nodes
    nodes_by_id = {node["id"]: node for node in collapsed_nodes}
    # Topological sort
    sorted_ids = _topological_sort(collapsed_nodes)
    # Create steps with resolved hashes
    steps = []
    cache_ids = {}  # step_id -> cache_id
    for node_id in sorted_ids:
        node = nodes_by_id[node_id]
        # Dependency order guarantees every input's cache_id exists already.
        step = _create_step(
            node,
            recipe.registry,
            inputs,
            cache_ids,
            cluster_key,
        )
        steps.append(step)
        cache_ids[node_id] = step.cache_id
    # Compute levels
    _compute_levels(steps, nodes_by_id)
    # Handle stage-aware planning if recipe has stages
    stage_plans = []
    stage_order = []
    stage_levels = {}
    if recipe.stages:
        # Build mapping from node_id to stage
        node_to_stage = {}
        for stage in recipe.stages:
            for node_id in stage.node_ids:
                node_to_stage[node_id] = stage.name
        # Compute stage levels (for parallel execution)
        stage_levels = _compute_stage_levels(recipe.stages)
        # Tag each step with stage info
        for step in steps:
            if step.step_id in node_to_stage:
                step.stage = node_to_stage[step.step_id]
        # Build stage plans
        for stage_name in recipe.stage_order:
            stage = next(s for s in recipe.stages if s.name == stage_name)
            stage_steps = [s for s in steps if s.stage == stage_name]
            # Build output bindings with cache IDs
            output_cache_ids = {}
            for out_name, node_id in stage.output_bindings.items():
                if node_id in cache_ids:
                    output_cache_ids[out_name] = cache_ids[node_id]
            stage_plans.append(StagePlan(
                stage_name=stage_name,
                steps=stage_steps,
                requires=stage.requires,
                output_bindings=output_cache_ids,
                level=stage_levels.get(stage_name, 0),
            ))
        stage_order = recipe.stage_order
    # Compute plan ID from source CID + steps
    plan_content = {
        "source_cid": source_hash,
        "steps": [{"id": s.step_id, "cache_id": s.cache_id} for s in steps],
        "inputs": inputs,
    }
    plan_id = _stable_hash(plan_content, cluster_key)
    return ExecutionPlanSexp(
        plan_id=plan_id,
        source_hash=source_hash,
        params=recipe.resolved_params,
        params_hash=params_hash,
        steps=steps,
        output_step_id=recipe.output_node_id,
        inputs=inputs,
        analysis=named_analysis,
        stage_plans=stage_plans,
        stage_order=stage_order,
        stage_levels=stage_levels,
        effects_registry=recipe.registry.get("effects", {}),
        minimal_primitives=recipe.minimal_primitives,
    )
def _topological_sort(nodes: List[Dict]) -> List[str]:
"""Sort nodes in dependency order."""
nodes_by_id = {n["id"]: n for n in nodes}
visited = set()
order = []
def visit(node_id: str):
if node_id in visited:
return
visited.add(node_id)
node = nodes_by_id.get(node_id)
if node:
for input_id in node.get("inputs", []):
visit(input_id)
order.append(node_id)
for node in nodes:
visit(node["id"])
return order
def _create_step(
    node: Dict,
    registry: Dict,
    inputs: Dict[str, str],
    cache_ids: Dict[str, str],
    cluster_key: str = None,
) -> PlanStep:
    """Build a PlanStep for *node* with a pre-computed cache ID.

    The cache ID covers the node type, the resolved config, and the
    cache IDs of every dependency — both direct graph inputs and any
    ``analysis_refs`` (implicit binding dependencies that affect the
    computed result).
    """
    step_type = node["type"]
    graph_inputs = node.get("inputs", [])
    # Resolve registry references (effects, assets, variable inputs) to hashes.
    resolved = _resolve_config(dict(node.get("config", {})), registry, inputs)
    # Gather dependency cache IDs: direct inputs plus analysis refs.
    dep_hashes = [cache_ids[i] for i in graph_inputs if i in cache_ids]
    dep_hashes += [cache_ids[r] for r in resolved.get("analysis_refs", []) if r in cache_ids]
    cache_id = _stable_hash(
        {
            "node_type": step_type,
            "config": resolved,
            "inputs": sorted(dep_hashes),
        },
        cluster_key,
    )
    return PlanStep(
        step_id=node["id"],
        node_type=step_type,
        config=resolved,
        inputs=graph_inputs,
        cache_id=cache_id,
    )
def _resolve_config(
config: Dict,
registry: Dict,
inputs: Dict[str, str],
) -> Dict:
"""Resolve registry references in config to content hashes."""
resolved = {}
for key, value in config.items():
if key == "filter_chain" and isinstance(value, list):
# Resolve each filter in the chain (for COMPOUND nodes)
resolved_chain = []
for filter_item in value:
filter_config = filter_item.get("config", {})
resolved_filter_config = _resolve_config(filter_config, registry, inputs)
resolved_chain.append({
"type": filter_item["type"],
"config": resolved_filter_config,
})
resolved["filter_chain"] = resolved_chain
elif key == "asset" and isinstance(value, str):
# Resolve asset reference - use CID from registry
if value in registry.get("assets", {}):
resolved["cid"] = registry["assets"][value]["cid"]
else:
resolved["asset"] = value # Keep as-is if not in registry
elif key == "effect" and isinstance(value, str):
# Resolve effect reference - keep name AND add CID/path
resolved["effect"] = value
if value in registry.get("effects", {}):
effect_entry = registry["effects"][value]
if effect_entry.get("cid"):
resolved["cid"] = effect_entry["cid"]
if effect_entry.get("path"):
resolved["effect_path"] = effect_entry["path"]
elif key == "input" and value is True:
# Variable input - resolve from inputs dict
input_name = config.get("name", "input")
if input_name in inputs:
resolved["hash"] = inputs[input_name]
else:
resolved["input"] = True
resolved["name"] = input_name
elif key == "path":
# Local file path - keep as-is for local execution
resolved["path"] = value
else:
resolved[key] = value
return resolved
def _compute_levels(steps: List[PlanStep], nodes_by_id: Dict) -> None:
    """Assign a dependency level to every step (mutates ``step.level``).

    A step's level is one more than the maximum level among its
    dependencies, counting both graph inputs (data dependencies) and
    ``analysis_refs`` (binding dependencies); steps with no dependencies
    or with no matching node sit at level 0.
    """
    memo: Dict[str, int] = {}

    def level_of(sid: str) -> int:
        if sid in memo:
            return memo[sid]
        entry = nodes_by_id.get(sid)
        if not entry:
            memo[sid] = 0
            return 0
        deps = list(entry.get("inputs", []))
        deps.extend(entry.get("config", {}).get("analysis_refs", []))
        memo[sid] = 1 + max(map(level_of, deps)) if deps else 0
        return memo[sid]

    for step in steps:
        step.level = level_of(step.step_id)
def _compute_stage_levels(stages: List) -> Dict[str, int]:
"""
Compute stage levels for parallel execution.
Stages at the same level have no dependencies between them
and can run in parallel.
"""
from .compiler import CompiledStage
levels = {}
def compute_level(stage_name: str) -> int:
if stage_name in levels:
return levels[stage_name]
stage = next((s for s in stages if s.name == stage_name), None)
if not stage or not stage.requires:
levels[stage_name] = 0
return 0
max_req = max(compute_level(req) for req in stage.requires)
levels[stage_name] = max_req + 1
return levels[stage_name]
for stage in stages:
compute_level(stage.name)
return levels
def step_to_task_sexp(step: PlanStep) -> List:
    """
    Convert a step to a minimal S-expression for a Celery task.

    The resulting list is what gets shipped to a worker; the worker
    hashes it to verify the step's cache_id.
    """
    parts = [Symbol(step.node_type.lower())]
    # Resolved config becomes alternating keyword/value entries.
    for key, value in step.config.items():
        parts.append(Keyword(key))
        parts.append(value)
    # Inputs are appended last, only when present.
    if step.inputs:
        parts.append(Keyword("inputs"))
        parts.append(step.inputs)
    return parts
def task_cache_id(task_sexp: List, cluster_key: str = None) -> str:
    """
    Compute the cache ID of a task S-expression.

    The task is serialized to its canonical textual form and hashed, so a
    worker can independently verify it is executing the right task.
    """
    return _stable_hash({"sexp": serialize(task_sexp)}, cluster_key)