""" Execution plan generation from S-expression recipes. The planner: 1. Takes a compiled recipe + input content hashes 2. Runs analyzers to get concrete data (beat times, etc.) 3. Expands dynamic nodes (SLICE_ON) into primitive operations 4. Resolves all registry references to content hashes 5. Generates an execution plan with pre-computed cache IDs Plans are S-expressions with all references resolved to hashes, ready for distribution to Celery workers. """ import hashlib import importlib.util import json import sys from dataclasses import dataclass, field from pathlib import Path from typing import Any, Dict, List, Optional, Callable from .parser import Symbol, Keyword, Binding, serialize from .compiler import CompiledRecipe # Node types that can be collapsed into a single FFmpeg filter chain COLLAPSIBLE_TYPES = {"EFFECT", "SEGMENT"} # Node types that are boundaries (sources, merges, or special processing) BOUNDARY_TYPES = {"SOURCE", "SEQUENCE", "MUX", "ANALYZE", "SCAN", "LIST"} # Node types that need expansion during planning EXPANDABLE_TYPES = {"SLICE_ON", "CONSTRUCT"} def _load_module(module_path: Path, module_name: str = "module"): """Load a Python module from file path.""" spec = importlib.util.spec_from_file_location(module_name, module_path) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module) return module def _run_analyzer( analyzer_path: Path, input_path: Path, params: Dict[str, Any], ) -> Dict[str, Any]: """Run an analyzer module and return results.""" analyzer = _load_module(analyzer_path, "analyzer") return analyzer.analyze(input_path, params) def _pre_execute_segment( node: Dict, input_path: Path, work_dir: Path, ) -> Path: """ Pre-execute a SEGMENT node during planning. This is needed when ANALYZE depends on a SEGMENT output. Returns path to the segmented file. 
""" import subprocess import tempfile config = node.get("config", {}) start = config.get("start", 0) duration = config.get("duration") end = config.get("end") # Detect if input is audio-only suffix = input_path.suffix.lower() is_audio = suffix in ('.mp3', '.wav', '.flac', '.aac', '.ogg', '.m4a') if is_audio: output_ext = ".m4a" # Use m4a for aac codec else: output_ext = ".mp4" output_path = work_dir / f"segment_{node['id'][:16]}{output_ext}" cmd = ["ffmpeg", "-y", "-i", str(input_path)] if start: cmd.extend(["-ss", str(start)]) if duration: cmd.extend(["-t", str(duration)]) elif end: cmd.extend(["-t", str(end - start)]) if is_audio: cmd.extend(["-c:a", "aac", str(output_path)]) else: cmd.extend(["-c:v", "libx264", "-preset", "fast", "-crf", "18", "-c:a", "aac", str(output_path)]) subprocess.run(cmd, check=True, capture_output=True) return output_path def _serialize_for_hash(obj) -> str: """Serialize any value to canonical S-expression string for hashing.""" from .parser import Lambda if obj is None: return "nil" if isinstance(obj, bool): return "true" if obj else "false" if isinstance(obj, (int, float)): return str(obj) if isinstance(obj, str): escaped = obj.replace('\\', '\\\\').replace('"', '\\"') return f'"{escaped}"' if isinstance(obj, Symbol): return obj.name if isinstance(obj, Keyword): return f":{obj.name}" if isinstance(obj, Lambda): params = " ".join(obj.params) body = _serialize_for_hash(obj.body) return f"(fn [{params}] {body})" if isinstance(obj, Binding): # analysis_ref can be a string, node ID, or dict - serialize it properly if isinstance(obj.analysis_ref, str): ref_str = f'"{obj.analysis_ref}"' else: ref_str = _serialize_for_hash(obj.analysis_ref) return f"(bind {ref_str} :range [{obj.range_min} {obj.range_max}])" if isinstance(obj, dict): items = [] for k, v in sorted(obj.items()): items.append(f":{k} {_serialize_for_hash(v)}") return "{" + " ".join(items) + "}" if isinstance(obj, list): items = [_serialize_for_hash(x) for x in obj] return "(" + " 
".join(items) + ")" return str(obj) def _stable_hash(data: Any, cluster_key: str = None) -> str: """Create stable SHA3-256 hash from data using S-expression serialization.""" if cluster_key: data = {"_cluster_key": cluster_key, "_data": data} sexp_str = _serialize_for_hash(data) return hashlib.sha3_256(sexp_str.encode()).hexdigest() @dataclass class PlanStep: """A step in the execution plan.""" step_id: str node_type: str config: Dict[str, Any] inputs: List[str] # List of input step_ids cache_id: str level: int = 0 stage: Optional[str] = None # Stage this step belongs to def to_sexp(self) -> List: """Convert to S-expression.""" sexp = [Symbol("step"), self.step_id] # Add cache-id sexp.extend([Keyword("cache-id"), self.cache_id]) # Add level if > 0 if self.level > 0: sexp.extend([Keyword("level"), self.level]) # Add stage info if present if self.stage: sexp.extend([Keyword("stage"), self.stage]) # Add the node expression node_sexp = [Symbol(self.node_type.lower())] # Add config as keywords for key, value in self.config.items(): # Convert Binding to sexp form if isinstance(value, Binding): value = [Symbol("bind"), value.analysis_ref, Keyword("range"), [value.range_min, value.range_max]] node_sexp.extend([Keyword(key), value]) # Add inputs if any if self.inputs: node_sexp.extend([Keyword("inputs"), self.inputs]) sexp.append(node_sexp) return sexp @dataclass class StagePlan: """A stage in the execution plan.""" stage_name: str steps: List[PlanStep] requires: List[str] # Names of required stages output_bindings: Dict[str, str] # binding_name -> cache_id of output level: int = 0 # Stage level for parallel execution @dataclass class ExecutionPlanSexp: """Execution plan as S-expression.""" plan_id: str steps: List[PlanStep] output_step_id: str source_hash: str = "" # CID of recipe source params: Dict[str, Any] = field(default_factory=dict) # Resolved parameter values params_hash: str = "" # Hash of params for quick comparison inputs: Dict[str, str] = 
field(default_factory=dict) # name -> hash analysis: Dict[str, Dict] = field(default_factory=dict) # name -> {times, values} metadata: Dict[str, Any] = field(default_factory=dict) stage_plans: List[StagePlan] = field(default_factory=list) # Stage-level plans stage_order: List[str] = field(default_factory=list) # Topologically sorted stage names stage_levels: Dict[str, int] = field(default_factory=dict) # stage_name -> level effects_registry: Dict[str, Dict] = field(default_factory=dict) # effect_name -> {path, cid, ...} minimal_primitives: bool = False # If True, interpreter uses only core primitives def to_sexp(self) -> List: """Convert entire plan to S-expression.""" sexp = [Symbol("plan")] # Metadata - purely content-addressed sexp.extend([Keyword("id"), self.plan_id]) sexp.extend([Keyword("source-cid"), self.source_hash]) # CID of recipe source # Parameters if self.params: sexp.extend([Keyword("params-hash"), self.params_hash]) params_sexp = [Symbol("params")] for name, value in self.params.items(): params_sexp.append([Symbol(name), value]) sexp.append(params_sexp) # Input bindings if self.inputs: inputs_sexp = [Symbol("inputs")] for name, hash_val in self.inputs.items(): inputs_sexp.append([Symbol(name), hash_val]) sexp.append(inputs_sexp) # Analysis data (for effect parameter bindings) if self.analysis: analysis_sexp = [Symbol("analysis")] for name, data in self.analysis.items(): track_sexp = [Symbol(name)] if isinstance(data, dict) and "_cache_id" in data: track_sexp.extend([Keyword("cache-id"), data["_cache_id"]]) else: if "times" in data: track_sexp.extend([Keyword("times"), data["times"]]) if "values" in data: track_sexp.extend([Keyword("values"), data["values"]]) analysis_sexp.append(track_sexp) sexp.append(analysis_sexp) # Stage information if self.stage_plans: stages_sexp = [Symbol("stages")] for stage_plan in self.stage_plans: stage_sexp = [ Keyword("name"), stage_plan.stage_name, Keyword("level"), stage_plan.level, ] if stage_plan.requires: 
stage_sexp.extend([Keyword("requires"), stage_plan.requires]) if stage_plan.output_bindings: outputs_sexp = [] for name, cache_id in stage_plan.output_bindings.items(): outputs_sexp.append([Symbol(name), Keyword("cache-id"), cache_id]) stage_sexp.extend([Keyword("outputs"), outputs_sexp]) stages_sexp.append(stage_sexp) sexp.append(stages_sexp) # Effects registry - for loading explicitly declared effects if self.effects_registry: registry_sexp = [Symbol("effects-registry")] for name, info in self.effects_registry.items(): effect_sexp = [Symbol(name)] if info.get("path"): effect_sexp.extend([Keyword("path"), info["path"]]) if info.get("cid"): effect_sexp.extend([Keyword("cid"), info["cid"]]) registry_sexp.append(effect_sexp) sexp.append(registry_sexp) # Minimal primitives flag if self.minimal_primitives: sexp.extend([Keyword("minimal-primitives"), True]) # Steps for step in self.steps: sexp.append(step.to_sexp()) # Output reference sexp.extend([Keyword("output"), self.output_step_id]) return sexp def to_string(self, pretty: bool = True) -> str: """Serialize plan to S-expression string.""" return serialize(self.to_sexp(), pretty=pretty) def _expand_list_inputs(nodes: List[Dict]) -> List[Dict]: """ Expand LIST node inputs in SEQUENCE nodes. When a SEQUENCE has a LIST as input, replace it with all the LIST's inputs. LIST nodes that are referenced by non-SEQUENCE nodes (e.g., EFFECT chains) are promoted to SEQUENCE nodes so they produce a concatenated output. Unreferenced LIST nodes are removed. 
""" nodes_by_id = {n["id"]: n for n in nodes} list_nodes = {n["id"]: n for n in nodes if n["type"] == "LIST"} if not list_nodes: return nodes # Determine which LIST nodes are referenced by SEQUENCE vs other node types list_consumed_by_seq = set() list_referenced_by_other = set() for node in nodes: if node["type"] == "LIST": continue for inp in node.get("inputs", []): if inp in list_nodes: if node["type"] == "SEQUENCE": list_consumed_by_seq.add(inp) else: list_referenced_by_other.add(inp) result = [] for node in nodes: if node["type"] == "LIST": if node["id"] in list_referenced_by_other: # Promote to SEQUENCE — non-SEQUENCE nodes reference this LIST result.append({ "id": node["id"], "type": "SEQUENCE", "config": node.get("config", {}), "inputs": node.get("inputs", []), }) # Otherwise skip (consumed by SEQUENCE expansion or unreferenced) continue if node["type"] == "SEQUENCE": # Expand any LIST inputs new_inputs = [] for inp in node.get("inputs", []): if inp in list_nodes: # Replace LIST with its contents new_inputs.extend(list_nodes[inp].get("inputs", [])) else: new_inputs.append(inp) # Create updated node result.append({ **node, "inputs": new_inputs, }) else: result.append(node) return result def _collapse_effect_chains(nodes: List[Dict], registry: Dict = None) -> List[Dict]: """ Collapse sequential effect chains into single COMPOUND nodes. A chain is a sequence of single-input collapsible nodes where: - Each node has exactly one input - No node in the chain is referenced by multiple other nodes - The chain ends at a boundary or multi-ref node - No node in the chain is marked as temporal Effects can declare :temporal true to prevent collapsing (e.g., reverse). Returns a new node list with chains collapsed. 
""" if not nodes: return nodes registry = registry or {} nodes_by_id = {n["id"]: n for n in nodes} # Build reference counts: how many nodes reference each node as input ref_count = {n["id"]: 0 for n in nodes} for node in nodes: for inp in node.get("inputs", []): if inp in ref_count: ref_count[inp] += 1 # Track which nodes are consumed by chains consumed = set() compound_nodes = [] def is_temporal(node: Dict) -> bool: """Check if a node is temporal (needs complete input).""" config = node.get("config", {}) # Check node-level temporal flag if config.get("temporal"): return True # Check effect registry for temporal flag if node["type"] == "EFFECT": effect_name = config.get("effect") if effect_name: effect_meta = registry.get("effects", {}).get(effect_name, {}) if effect_meta.get("temporal"): return True return False def is_collapsible(node_id: str) -> bool: """Check if a node can be part of a chain.""" if node_id in consumed: return False node = nodes_by_id.get(node_id) if not node: return False if node["type"] not in COLLAPSIBLE_TYPES: return False # Temporal effects can't be collapsed if is_temporal(node): return False # Effects CAN be collapsed if they have an FFmpeg mapping # Only fall back to Python interpreter if no mapping exists config = node.get("config", {}) if node["type"] == "EFFECT": effect_name = config.get("effect") # Import here to avoid circular imports from .ffmpeg_compiler import FFmpegCompiler compiler = FFmpegCompiler() if compiler.get_mapping(effect_name): return True # Has FFmpeg mapping, can collapse elif config.get("effect_path"): return False # No FFmpeg mapping, has Python path, can't collapse return True def is_chain_boundary(node_id: str) -> bool: """Check if a node is a chain boundary (can't be collapsed into).""" node = nodes_by_id.get(node_id) if not node: return True # Unknown node is a boundary # Boundary if: it's a boundary type, or referenced by multiple nodes return node["type"] in BOUNDARY_TYPES or ref_count.get(node_id, 0) > 1 
def collect_chain(start_id: str) -> List[str]: """Collect a chain of collapsible nodes starting from start_id.""" chain = [start_id] current = start_id while True: node = nodes_by_id[current] inputs = node.get("inputs", []) # Must have exactly one input if len(inputs) != 1: break next_id = inputs[0] # Stop if next is a boundary or already consumed if is_chain_boundary(next_id) or not is_collapsible(next_id): break # Stop if next is referenced by others besides current if ref_count.get(next_id, 0) > 1: break chain.append(next_id) current = next_id return chain # Process nodes in reverse order (from outputs toward inputs) # This ensures we find complete chains starting from their end # First, topologically sort to get dependency order sorted_ids = [] visited = set() def topo_visit(node_id: str): if node_id in visited: return visited.add(node_id) node = nodes_by_id.get(node_id) if node: for inp in node.get("inputs", []): topo_visit(inp) sorted_ids.append(node_id) for node in nodes: topo_visit(node["id"]) # Process in reverse topological order (outputs first) result_nodes = [] for node_id in reversed(sorted_ids): node = nodes_by_id[node_id] if node_id in consumed: continue if not is_collapsible(node_id): # Keep boundary nodes as-is result_nodes.append(node) continue # Check if this node is the start of a chain (output end) # A node is a chain start if it's collapsible and either: # - Referenced by a boundary node # - Referenced by multiple nodes # - Is the output node # For now, collect chain going backwards from this node chain = collect_chain(node_id) if len(chain) == 1: # Single node, no collapse needed result_nodes.append(node) continue # Collapse the chain into a COMPOUND node # Chain is [end, ..., start] order (backwards from output) # The compound node: # - Has the same ID as the chain end (for reference stability) # - Takes input from what the chain start originally took # - Has a filter_chain config with all the nodes in order chain_start = chain[-1] # First 
to execute chain_end = chain[0] # Last to execute start_node = nodes_by_id[chain_start] end_node = nodes_by_id[chain_end] # Build filter chain config (in execution order: start to end) filter_chain = [] for chain_node_id in reversed(chain): chain_node = nodes_by_id[chain_node_id] filter_chain.append({ "type": chain_node["type"], "config": chain_node.get("config", {}), }) compound_node = { "id": chain_end, # Keep the end ID for reference stability "type": "COMPOUND", "config": { "filter_chain": filter_chain, # Include effects registry so executor can load only declared effects "effects_registry": registry.get("effects", {}), }, "inputs": start_node.get("inputs", []), "name": f"compound_{len(filter_chain)}_effects", } result_nodes.append(compound_node) # Mark all chain nodes as consumed for chain_node_id in chain: consumed.add(chain_node_id) return result_nodes def _expand_slice_on( node: Dict, analysis_data: Dict[str, Any], registry: Dict, sources: Dict[str, str] = None, cluster_key: str = None, encoding: Dict = None, named_analysis: Dict = None, ) -> List[Dict]: """ Expand a SLICE_ON node into primitive SEGMENT + EFFECT + SEQUENCE nodes. Supports two modes: 1. Legacy: :effect and :pattern parameters 2. Lambda: :fn parameter with reducer function Lambda syntax: (slice-on analysis :times times :init 0 :fn (lambda [acc i start end] {:source video :effects (if (odd? i) [invert] []) :acc (inc acc)})) When all beats produce composition-mode results (layers + compositor) with the same layer structure, consecutive beats are automatically merged into fewer compositions with time-varying parameter bindings. This can reduce thousands of nodes to a handful. 
Args: node: The SLICE_ON node to expand analysis_data: Analysis results containing times array registry: Recipe registry with effect definitions sources: Map of source names to node IDs cluster_key: Optional cluster key for hashing named_analysis: Mutable dict to inject synthetic analysis tracks into Returns: List of expanded nodes (segments, effects, sequence) """ from .evaluator import evaluate, EvalError from .parser import Lambda, Symbol config = node.get("config", {}) node_inputs = node.get("inputs", []) sources = sources or {} # Extract times times_path = config.get("times_path", "times") times = analysis_data for key in times_path.split("."): times = times[key] if not times: raise ValueError(f"No times found at path '{times_path}' in analysis") # Default video input (first input after analysis) default_video = node_inputs[0] if node_inputs else None expanded_nodes = [] sequence_inputs = [] base_id = node["id"][:8] # Check for lambda-based reducer reducer_fn = config.get("fn") if isinstance(reducer_fn, Lambda): # Lambda mode - evaluate function for each slice acc = config.get("init", 0) slice_times = list(zip([0] + times[:-1], times)) # Frame-accurate timing calculation # Align ALL times to frame boundaries to prevent accumulating drift fps = (encoding or {}).get("fps", 30) frame_duration = 1.0 / fps # Get total duration from analysis data (beats analyzer includes this) # Falls back to config target_duration for backwards compatibility total_duration = analysis_data.get("duration") or config.get("target_duration") # Pre-compute frame-aligned cumulative times cumulative_frames = [0] # Start at frame 0 for t in times: # Round to nearest frame boundary frames = round(t * fps) cumulative_frames.append(frames) # If total duration known, ensure last segment extends to it exactly if total_duration is not None: target_frames = round(total_duration * fps) if target_frames > cumulative_frames[-1]: cumulative_frames[-1] = target_frames # Pre-compute frame-aligned start 
times and durations for each slice frame_aligned_starts = [] frame_aligned_durations = [] for i in range(len(cumulative_frames) - 1): start_frames = cumulative_frames[i] end_frames = cumulative_frames[i + 1] frame_aligned_starts.append(start_frames * frame_duration) frame_aligned_durations.append((end_frames - start_frames) * frame_duration) # Phase 1: Evaluate all lambdas upfront videos = config.get("videos", []) all_results = [] all_timings = [] # (seg_start, seg_duration) per valid beat original_indices = [] # original beat index for each result for i, (start, end) in enumerate(slice_times): if start >= end: continue # Build environment with sources, effects, and builtins env = dict(sources) # Add effect names so they can be referenced as symbols for effect_name in registry.get("effects", {}): env[effect_name] = effect_name # Make :videos list available to lambda if videos: env["videos"] = videos env["acc"] = acc env["i"] = i env["start"] = start env["end"] = end # Evaluate the reducer result = evaluate([reducer_fn, Symbol("acc"), Symbol("i"), Symbol("start"), Symbol("end")], env) if not isinstance(result, dict): raise ValueError(f"Reducer must return a dict, got {type(result)}") # Extract accumulator acc = result.get("acc", acc) # Segment timing: use frame-aligned values to prevent drift # Lambda can override with explicit start/duration/end if result.get("start") is not None or result.get("duration") is not None or result.get("end") is not None: # Explicit timing from lambda - use as-is seg_start = result.get("start", start) seg_duration = result.get("duration") if seg_duration is None: if result.get("end") is not None: seg_duration = result["end"] - seg_start else: seg_duration = end - start else: # Default: use frame-aligned start and duration to prevent accumulated drift seg_start = frame_aligned_starts[i] if i < len(frame_aligned_starts) else start seg_duration = frame_aligned_durations[i] if i < len(frame_aligned_durations) else (end - start) 
all_results.append(result) all_timings.append((seg_start, seg_duration)) original_indices.append(i) # Phase 2: Merge or expand all_composition = ( len(all_results) > 1 and all("layers" in r for r in all_results) and named_analysis is not None ) if all_composition: # All beats are composition mode — try to merge consecutive # beats with the same layer structure _merge_composition_beats( all_results, all_timings, base_id, videos, registry, expanded_nodes, sequence_inputs, named_analysis, ) else: # Fallback: expand each beat individually for idx, result in enumerate(all_results): orig_i = original_indices[idx] seg_start, seg_duration = all_timings[idx] if "layers" in result: # COMPOSITION MODE — multi-source with per-layer effects + compositor _expand_composition_beat( result, orig_i, base_id, videos, registry, seg_start, seg_duration, expanded_nodes, sequence_inputs, ) else: # SINGLE-SOURCE MODE (existing behavior) source_name = result.get("source") effects = result.get("effects", []) # Resolve source to node ID if isinstance(source_name, Symbol): source_name = source_name.name valid_node_ids = set(sources.values()) if source_name in sources: video_input = sources[source_name] elif source_name in valid_node_ids: video_input = source_name else: video_input = default_video # Create SEGMENT node segment_id = f"{base_id}_seg_{orig_i:04d}" segment_node = { "id": segment_id, "type": "SEGMENT", "config": { "start": seg_start, "duration": seg_duration, }, "inputs": [video_input], } expanded_nodes.append(segment_node) # Apply effects chain current_input = segment_id for j, effect in enumerate(effects): effect_name, effect_params = _parse_effect_spec(effect) if not effect_name: continue effect_id = f"{base_id}_fx_{orig_i:04d}_{j}" effect_entry = registry.get("effects", {}).get(effect_name, {}) effect_config = { "effect": effect_name, "effect_path": effect_entry.get("path"), } effect_config.update(effect_params) effect_node = { "id": effect_id, "type": "EFFECT", "config": 
effect_config, "inputs": [current_input], } expanded_nodes.append(effect_node) current_input = effect_id sequence_inputs.append(current_input) else: # Legacy mode - :effect and :pattern effect_name = config.get("effect") effect_path = config.get("effect_path") pattern = config.get("pattern", "all") video_input = default_video if not video_input: raise ValueError("SLICE_ON requires video input") slice_times = list(zip([0] + times[:-1], times)) for i, (start, end) in enumerate(slice_times): if start >= end: continue # Determine if effect should be applied apply_effect = False if effect_name: if pattern == "all": apply_effect = True elif pattern == "odd": apply_effect = (i % 2 == 1) elif pattern == "even": apply_effect = (i % 2 == 0) elif pattern == "alternate": apply_effect = (i % 2 == 1) # Create SEGMENT node segment_id = f"{base_id}_seg_{i:04d}" segment_node = { "id": segment_id, "type": "SEGMENT", "config": { "start": start, "duration": end - start, }, "inputs": [video_input], } expanded_nodes.append(segment_node) if apply_effect: effect_id = f"{base_id}_fx_{i:04d}" effect_config = {"effect": effect_name} if effect_path: effect_config["effect_path"] = effect_path effect_node = { "id": effect_id, "type": "EFFECT", "config": effect_config, "inputs": [segment_id], } expanded_nodes.append(effect_node) sequence_inputs.append(effect_id) else: sequence_inputs.append(segment_id) # Create LIST node to hold all slices (user must explicitly sequence them) list_node = { "id": node["id"], # Keep original ID for reference stability "type": "LIST", "config": {}, "inputs": sequence_inputs, } expanded_nodes.append(list_node) return expanded_nodes def _parse_effect_spec(effect): """Parse an effect spec into (name, params) from Symbol, string, or dict.""" from .parser import Symbol effect_name = None effect_params = {} if isinstance(effect, Symbol): effect_name = effect.name elif isinstance(effect, str): effect_name = effect elif isinstance(effect, dict): effect_name = 
effect.get("effect") if isinstance(effect_name, Symbol): effect_name = effect_name.name for k, v in effect.items(): if k != "effect": effect_params[k] = v return effect_name, effect_params def _expand_composition_beat(result, beat_idx, base_id, videos, registry, seg_start, seg_duration, expanded_nodes, sequence_inputs): """ Expand a composition-mode beat into per-layer SEGMENT + EFFECT nodes and a single composition EFFECT node. Args: result: Lambda result dict with 'layers' and optional 'compose' beat_idx: Beat index for ID generation base_id: Base ID prefix videos: List of video node IDs from :videos config registry: Recipe registry with effect definitions seg_start: Segment start time seg_duration: Segment duration expanded_nodes: List to append generated nodes to sequence_inputs: List to append final composition node ID to """ layers = result["layers"] compose_spec = result.get("compose", {}) layer_outputs = [] for layer_idx, layer in enumerate(layers): # Resolve video: integer index into videos list, or node ID string video_ref = layer.get("video") if isinstance(video_ref, (int, float)): video_input = videos[int(video_ref)] else: video_input = str(video_ref) # SEGMENT for this layer segment_id = f"{base_id}_seg_{beat_idx:04d}_L{layer_idx}" expanded_nodes.append({ "id": segment_id, "type": "SEGMENT", "config": {"start": seg_start, "duration": seg_duration}, "inputs": [video_input], }) # Per-layer EFFECT chain current = segment_id for fx_idx, effect in enumerate(layer.get("effects", [])): effect_name, effect_params = _parse_effect_spec(effect) if not effect_name: continue effect_id = f"{base_id}_fx_{beat_idx:04d}_L{layer_idx}_{fx_idx}" effect_entry = registry.get("effects", {}).get(effect_name, {}) config = { "effect": effect_name, "effect_path": effect_entry.get("path"), } config.update(effect_params) expanded_nodes.append({ "id": effect_id, "type": "EFFECT", "config": config, "inputs": [current], }) current = effect_id layer_outputs.append(current) # 
Composition EFFECT node compose_name = compose_spec.get("effect", "blend_multi") compose_id = f"{base_id}_comp_{beat_idx:04d}" compose_entry = registry.get("effects", {}).get(compose_name, {}) compose_config = { "effect": compose_name, "effect_path": compose_entry.get("path"), "multi_input": True, } for k, v in compose_spec.items(): if k != "effect": compose_config[k] = v expanded_nodes.append({ "id": compose_id, "type": "EFFECT", "config": compose_config, "inputs": layer_outputs, }) sequence_inputs.append(compose_id) def _fingerprint_composition(result): """Create a hashable fingerprint of a composition beat's layer structure. Beats with the same fingerprint have the same video refs, effect names, and compositor type — only parameter values differ. Such beats can be merged into a single composition with time-varying bindings. """ layers = result.get("layers", []) compose = result.get("compose", {}) layer_fps = [] for layer in layers: video_ref = layer.get("video") effect_names = tuple( _parse_effect_spec(e)[0] for e in layer.get("effects", []) ) layer_fps.append((video_ref, effect_names)) compose_name = compose.get("effect", "blend_multi") # Include static compose params (excluding list-valued params like weights) static_compose = tuple(sorted( (k, v) for k, v in compose.items() if k not in ("effect", "weights") and isinstance(v, (str, int, float, bool)) )) return (len(layers), tuple(layer_fps), compose_name, static_compose) def _merge_composition_beats( all_results, all_timings, base_id, videos, registry, expanded_nodes, sequence_inputs, named_analysis, ): """Merge consecutive composition beats with the same layer structure. Groups consecutive beats by structural fingerprint. Groups of 2+ beats get merged into a single composition with synthetic analysis tracks for time-varying parameters. Single beats use standard per-beat expansion. 
""" import sys # Compute fingerprints fingerprints = [_fingerprint_composition(r) for r in all_results] # Group consecutive beats with the same fingerprint groups = [] # list of (start_idx, end_idx_exclusive) group_start = 0 for i in range(1, len(fingerprints)): if fingerprints[i] != fingerprints[group_start]: groups.append((group_start, i)) group_start = i groups.append((group_start, len(fingerprints))) print(f" Composition merging: {len(all_results)} beats -> {len(groups)} groups", file=sys.stderr) for group_idx, (g_start, g_end) in enumerate(groups): group_size = g_end - g_start if group_size == 1: # Single beat — use standard expansion result = all_results[g_start] seg_start, seg_duration = all_timings[g_start] _expand_composition_beat( result, g_start, base_id, videos, registry, seg_start, seg_duration, expanded_nodes, sequence_inputs, ) else: # Merge group into one composition with time-varying bindings _merge_composition_group( all_results, all_timings, list(range(g_start, g_end)), base_id, group_idx, videos, registry, expanded_nodes, sequence_inputs, named_analysis, ) def _merge_composition_group( all_results, all_timings, group_indices, base_id, group_idx, videos, registry, expanded_nodes, sequence_inputs, named_analysis, ): """Merge a group of same-structure composition beats into one composition. 
Creates: - One SEGMENT per layer (spanning full group duration) - One EFFECT per layer with time-varying params via synthetic analysis tracks - One compositor EFFECT with time-varying weights via synthetic tracks """ import sys first = all_results[group_indices[0]] layers = first["layers"] compose_spec = first.get("compose", {}) num_layers = len(layers) # Group timing first_start = all_timings[group_indices[0]][0] last_start, last_dur = all_timings[group_indices[-1]] group_duration = (last_start + last_dur) - first_start # Beat start times for synthetic tracks (absolute times) beat_times = [float(all_timings[i][0]) for i in group_indices] print(f" Group {group_idx}: {len(group_indices)} beats, " f"{first_start:.1f}s -> {first_start + group_duration:.1f}s " f"({num_layers} layers)", file=sys.stderr) # --- Per-layer segments and effects --- layer_outputs = [] for layer_idx in range(num_layers): layer = layers[layer_idx] # Resolve video input video_ref = layer.get("video") if isinstance(video_ref, (int, float)): video_input = videos[int(video_ref)] else: video_input = str(video_ref) # SEGMENT for this layer (full group duration) segment_id = f"{base_id}_seg_G{group_idx:03d}_L{layer_idx}" expanded_nodes.append({ "id": segment_id, "type": "SEGMENT", "config": {"start": first_start, "duration": group_duration}, "inputs": [video_input], }) # Per-layer EFFECT chain current = segment_id effects = layer.get("effects", []) for fx_idx, effect in enumerate(effects): effect_name, first_params = _parse_effect_spec(effect) if not effect_name: continue effect_id = f"{base_id}_fx_G{group_idx:03d}_L{layer_idx}_{fx_idx}" effect_entry = registry.get("effects", {}).get(effect_name, {}) fx_config = { "effect": effect_name, "effect_path": effect_entry.get("path"), } # For each param, check if it varies across beats for param_name, first_val in first_params.items(): values = [] for bi in group_indices: beat_layer = all_results[bi]["layers"][layer_idx] beat_effects = 
beat_layer.get("effects", []) if fx_idx < len(beat_effects): _, beat_params = _parse_effect_spec(beat_effects[fx_idx]) values.append(float(beat_params.get(param_name, first_val))) else: values.append(float(first_val)) # Check if all values are identical if all(v == values[0] for v in values): fx_config[param_name] = values[0] else: # Create synthetic analysis track # Prefix with 'syn_' to ensure valid S-expression symbol # (base_id may start with digits, which the parser splits) track_name = f"syn_{base_id}_L{layer_idx}_fx{fx_idx}_{param_name}" named_analysis[track_name] = { "times": beat_times, "values": values, } fx_config[param_name] = { "_binding": True, "source": track_name, "feature": "values", "range": [0.0, 1.0], # pass-through } expanded_nodes.append({ "id": effect_id, "type": "EFFECT", "config": fx_config, "inputs": [current], }) current = effect_id layer_outputs.append(current) # --- Compositor --- compose_name = compose_spec.get("effect", "blend_multi") compose_id = f"{base_id}_comp_G{group_idx:03d}" compose_entry = registry.get("effects", {}).get(compose_name, {}) compose_config = { "effect": compose_name, "effect_path": compose_entry.get("path"), "multi_input": True, } for k, v in compose_spec.items(): if k == "effect": continue if isinstance(v, list): # List param (e.g., weights) — check each element merged_list = [] for elem_idx in range(len(v)): elem_values = [] for bi in group_indices: beat_compose = all_results[bi].get("compose", {}) beat_v = beat_compose.get(k, v) if isinstance(beat_v, list) and elem_idx < len(beat_v): elem_values.append(float(beat_v[elem_idx])) else: elem_values.append(float(v[elem_idx])) if all(ev == elem_values[0] for ev in elem_values): merged_list.append(elem_values[0]) else: track_name = f"syn_{base_id}_comp_{k}_{elem_idx}" named_analysis[track_name] = { "times": beat_times, "values": elem_values, } merged_list.append({ "_binding": True, "source": track_name, "feature": "values", "range": [0.0, 1.0], }) compose_config[k] = 
merged_list elif isinstance(v, (int, float)): # Scalar param — check if it varies values = [] for bi in group_indices: beat_compose = all_results[bi].get("compose", {}) values.append(float(beat_compose.get(k, v))) if all(val == values[0] for val in values): compose_config[k] = values[0] else: track_name = f"syn_{base_id}_comp_{k}" named_analysis[track_name] = { "times": beat_times, "values": values, } compose_config[k] = { "_binding": True, "source": track_name, "feature": "values", "range": [0.0, 1.0], } else: # String or other — keep as-is compose_config[k] = v expanded_nodes.append({ "id": compose_id, "type": "EFFECT", "config": compose_config, "inputs": layer_outputs, }) sequence_inputs.append(compose_id) def _parse_construct_params(params_list: list) -> tuple: """ Parse :params block in a construct definition. Syntax: ( (param_name :type string :default "value" :desc "description") ) Returns: (param_names, param_defaults) where param_names is a list of strings and param_defaults is a dict of param_name -> default_value """ param_names = [] param_defaults = {} for param_def in params_list: if not isinstance(param_def, list) or len(param_def) < 1: continue # First element is the parameter name first = param_def[0] if isinstance(first, Symbol): param_name = first.name elif isinstance(first, str): param_name = first else: continue param_names.append(param_name) # Parse keyword arguments default = None i = 1 while i < len(param_def): item = param_def[i] if isinstance(item, Keyword): if i + 1 >= len(param_def): break kw_value = param_def[i + 1] if item.name == "default": default = kw_value # We could also parse :type, :range, :choices, :desc here i += 2 else: i += 1 param_defaults[param_name] = default return param_names, param_defaults def _expand_construct( node: Dict, registry: Dict, sources: Dict[str, str], analysis_data: Dict[str, Dict], recipe_dir: Path, cluster_key: str = None, encoding: Dict = None, ) -> List[Dict]: """ Expand a user-defined CONSTRUCT node. 
    Loads the construct definition from a .sexp file, evaluates it with the
    provided arguments, and converts the result into segment nodes.

    Args:
        node: The CONSTRUCT node to expand
        registry: Recipe registry
        sources: Map of source names to node IDs
        analysis_data: Analysis results (analysis_id -> {times, values})
        recipe_dir: Recipe directory for resolving paths
        cluster_key: Optional cluster key for hashing
        encoding: Encoding config

    Returns:
        List of expanded nodes (segments, effects, list)
    """
    from .parser import parse_all, Symbol
    from .evaluator import evaluate

    config = node.get("config", {})
    construct_name = config.get("construct_name")
    construct_path = config.get("construct_path")
    args = config.get("args", [])

    # Load construct definition
    full_path = recipe_dir / construct_path
    if not full_path.exists():
        raise ValueError(f"Construct file not found: {full_path}")

    print(f" Loading construct: {construct_name} from {construct_path}", file=sys.stderr)
    construct_text = full_path.read_text()
    construct_sexp = parse_all(construct_text)

    # Parse define-construct: (define-construct name "desc" (params...) body)
    if not isinstance(construct_sexp, list):
        construct_sexp = [construct_sexp]

    # Process imports (effect, construct declarations) in the construct file
    # These extend the registry for this construct's scope
    local_registry = dict(registry)  # Copy parent registry
    construct_def = None
    for expr in construct_sexp:
        if isinstance(expr, list) and expr and isinstance(expr[0], Symbol):
            form_name = expr[0].name
            if form_name == "effect":
                # (effect name :path "...")
                effect_name = expr[1].name if isinstance(expr[1], Symbol) else expr[1]
                # Parse kwargs
                i = 2
                kwargs = {}
                while i < len(expr):
                    if isinstance(expr[i], Keyword):
                        kwargs[expr[i].name] = expr[i + 1] if i + 1 < len(expr) else None
                        i += 2
                    else:
                        i += 1
                local_registry.setdefault("effects", {})[effect_name] = {
                    "path": kwargs.get("path"),
                    "cid": kwargs.get("cid"),
                }
                print(f" Construct imports effect: {effect_name}", file=sys.stderr)
            elif form_name == "define-construct":
                # Last define-construct in the file wins.
                construct_def = expr

    if not construct_def:
        raise ValueError(f"No define-construct found in {construct_path}")

    # Use local_registry instead of registry from here
    # (rebinds the parameter name; the caller's registry dict is not mutated)
    registry = local_registry

    # Parse define-construct - requires :params syntax:
    # (define-construct name
    #   :params (
    #     (param1 :type string :default "value" :desc "description")
    #   )
    #   body)
    #
    # Legacy syntax (define-construct name "desc" (param1 param2) body) is not supported.
    def_name = construct_def[1].name if isinstance(construct_def[1], Symbol) else construct_def[1]
    params = []  # List of param names
    param_defaults = {}  # param_name -> default value
    body = None
    found_params = False  # NOTE(review): set below but never read afterwards

    idx = 2
    while idx < len(construct_def):
        item = construct_def[idx]
        if isinstance(item, Keyword) and item.name == "params":
            # :params syntax
            if idx + 1 >= len(construct_def):
                raise ValueError(f"Construct '{def_name}': Missing params list after :params keyword")
            params_list = construct_def[idx + 1]
            params, param_defaults = _parse_construct_params(params_list)
            found_params = True
            idx += 2
        elif isinstance(item, Keyword):
            # Skip other keywords (like :desc)
            idx += 2
        elif isinstance(item, str):
            # Skip description strings (but warn about legacy format)
            print(f" Warning: Description strings in define-construct are deprecated", file=sys.stderr)
            idx += 1
        elif body is None:
            # First non-keyword, non-string item is the body
            if isinstance(item, list) and item:
                first_elem = item[0]
                # Check for legacy params syntax and reject it
                if isinstance(first_elem, Symbol) and first_elem.name not in ("let", "let*", "if", "when", "do", "begin", "->", "map", "filter", "fn", "reduce", "nth"):
                    # Could be legacy params if all items are just symbols
                    if all(isinstance(p, Symbol) for p in item):
                        raise ValueError(
                            f"Construct '{def_name}': Legacy parameter syntax (param1 param2) is not supported. "
                            f"Use :params block instead."
                        )
            body = item
            idx += 1
        else:
            idx += 1

    if body is None:
        raise ValueError(f"No body found in define-construct {def_name}")

    # Build environment with sources and analysis data
    env = dict(sources)

    # Add bindings from compiler (video-a, video-b, etc.)
    if "bindings" in config:
        env.update(config["bindings"])

    # Add effect names so they can be referenced as symbols
    for effect_name in registry.get("effects", {}):
        env[effect_name] = effect_name

    # Map analysis node IDs to their data with :times and :values
    for analysis_id, data in analysis_data.items():
        # Find the name this analysis was bound to
        for name, node_id in sources.items():
            # NOTE(review): the `or name.endswith("-data")` arm binds *every*
            # "-data" name to this analysis regardless of node_id match, so
            # with multiple analyses the last one wins — confirm intended.
            if node_id == analysis_id or name.endswith("-data"):
                env[name] = data
        env[analysis_id] = data

    # Apply param defaults first (for :params syntax)
    for param_name, default_value in param_defaults.items():
        if default_value is not None:
            env[param_name] = default_value

    # Bind positional args to params (overrides defaults)
    param_names = [p.name if isinstance(p, Symbol) else p for p in params]
    for i, param in enumerate(param_names):
        if i < len(args):
            arg = args[i]
            # Resolve node IDs to their data if it's analysis
            if isinstance(arg, str) and arg in analysis_data:
                env[param] = analysis_data[arg]
            else:
                env[param] = arg

    # Helper to resolve node IDs to analysis data recursively
    def resolve_value(val):
        """Resolve node IDs and symbols in a value, including inside dicts/lists."""
        if isinstance(val, str) and val in analysis_data:
            return analysis_data[val]
        elif isinstance(val, str) and val in env:
            return env[val]
        elif isinstance(val, Symbol):
            if val.name in env:
                return env[val.name]
            return val
        elif isinstance(val, dict):
            return {k: resolve_value(v) for k, v in val.items()}
        elif isinstance(val, list):
            return [resolve_value(v) for v in val]
        return val

    # Validate and bind keyword arguments from the config (excluding internal keys)
    # These may be S-expressions that need evaluation (e.g., lambdas)
    # or Symbols that need resolution from bindings
    internal_keys = {"construct_name", "construct_path", "args", "bindings"}
    known_params = set(param_names) | set(param_defaults.keys())
    for key, value in config.items():
        if key not in internal_keys:
            # Convert key to valid identifier (replace - with _) for checking
            param_key = key.replace("-", "_")
            if param_key not in known_params:
                raise ValueError(
                    f"Construct '{def_name}': Unknown parameter '{key}'. "
                    f"Valid parameters are: {', '.join(sorted(known_params)) if known_params else '(none)'}"
                )
            # Evaluate if it's an expression (list starting with Symbol)
            if isinstance(value, list) and value and isinstance(value[0], Symbol):
                env[param_key] = evaluate(value, env)
            elif isinstance(value, Symbol):
                # Resolve Symbol from env/bindings, then resolve any node IDs in the value
                if value.name in env:
                    env[param_key] = resolve_value(env[value.name])
                else:
                    raise ValueError(f"Undefined symbol in construct arg: {value.name}")
            else:
                # Resolve node IDs inside dicts/lists
                env[param_key] = resolve_value(value)

    # Evaluate construct body
    print(f" Evaluating construct with params: {param_names}", file=sys.stderr)
    segments = evaluate(body, env)

    if not isinstance(segments, list):
        raise ValueError(f"Construct must return a list of segments, got {type(segments)}")

    print(f" Construct produced {len(segments)} segments", file=sys.stderr)

    # Convert segment descriptors to plan nodes
    expanded_nodes = []
    sequence_inputs = []
    base_id = node["id"][:8]

    for i, seg in enumerate(segments):
        if not isinstance(seg, dict):
            continue

        source_ref = seg.get("source")
        start = seg.get("start", 0)
        print(f" DEBUG segment {i}: source={str(source_ref)[:20]}... start={start}", file=sys.stderr)
        end = seg.get("end")
        # NOTE(review): `or` treats duration == 0 (and end == 0) as missing —
        # a zero-length segment falls back to 1.0s; confirm intended.
        duration = seg.get("duration") or (end - start if end else 1.0)
        effects = seg.get("effects", [])

        # Resolve source reference to node ID
        source_id = sources.get(source_ref, source_ref) if isinstance(source_ref, str) else source_ref

        # Create segment node
        segment_id = f"{base_id}_seg_{i:04d}"
        segment_node = {
            "id": segment_id,
            "type": "SEGMENT",
            "config": {
                "start": start,
                "duration": duration,
            },
            "inputs": [source_id] if source_id else [],
        }
        expanded_nodes.append(segment_node)

        # Add effects if specified
        if effects:
            prev_id = segment_id
            for j, eff in enumerate(effects):
                effect_name = eff.get("effect") if isinstance(eff, dict) else eff
                effect_id = f"{base_id}_fx_{i:04d}_{j:02d}"
                # Look up effect_path from registry (prevents collapsing Python effects)
                effect_entry = registry.get("effects", {}).get(effect_name, {})
                effect_config = {
                    "effect": effect_name,
                    **{k: v for k, v in (eff.items() if isinstance(eff, dict) else []) if k != "effect"},
                }
                if effect_entry.get("path"):
                    effect_config["effect_path"] = effect_entry["path"]
                effect_node = {
                    "id": effect_id,
                    "type": "EFFECT",
                    "config": effect_config,
                    "inputs": [prev_id],
                }
                expanded_nodes.append(effect_node)
                prev_id = effect_id
            sequence_inputs.append(prev_id)
        else:
            sequence_inputs.append(segment_id)

    # Create LIST node (reuses the CONSTRUCT node's own id as its output id)
    list_node = {
        "id": node["id"],
        "type": "LIST",
        "config": {},
        "inputs": sequence_inputs,
    }
    expanded_nodes.append(list_node)

    return expanded_nodes


def _expand_nodes(
    nodes: List[Dict],
    registry: Dict,
    recipe_dir: Path,
    source_paths: Dict[str, Path],
    work_dir: Optional[Path] = None,
    cluster_key: Optional[str] = None,
    on_analysis: Optional[Callable[[str, Dict], None]] = None,
    encoding: Optional[Dict] = None,
    pre_analysis: Optional[Dict[str, Dict]] = None,
) -> tuple:
    """
    Expand dynamic nodes (SLICE_ON) by running analyzers.

    Processes nodes in dependency order:
    1. SOURCE nodes: resolve file paths
    2. SEGMENT nodes: pre-execute if needed for analysis
    3.
       ANALYZE nodes: run analyzers (or use pre_analysis), store results
    4. SLICE_ON nodes: expand using analysis results

    Args:
        nodes: List of compiled nodes
        registry: Recipe registry
        recipe_dir: Directory for resolving relative paths
        source_paths: Resolved source paths (id -> path)
        work_dir: Working directory for temporary files (created if None)
        cluster_key: Optional cluster key
        on_analysis: Callback when analysis completes (node_id, results)
        pre_analysis: Pre-computed analysis data (name -> results)

    Returns:
        Tuple of (expanded_nodes, named_analysis) where:
        - expanded_nodes: List with SLICE_ON replaced by primitives
        - named_analysis: Dict of analyzer_name -> {times, values}
    """
    import tempfile

    nodes_by_id = {n["id"]: n for n in nodes}
    sorted_ids = _topological_sort(nodes)

    # Create work directory if needed
    if work_dir is None:
        work_dir = Path(tempfile.mkdtemp(prefix="artdag_plan_"))

    # Track outputs and analysis results
    outputs = {}  # node_id -> output path or analysis data
    analysis_results = {}  # node_id -> analysis dict
    named_analysis = {}  # analyzer_name -> analysis dict (for effect bindings)
    pre_executed = set()  # nodes pre-executed during planning

    expanded = []
    expanded_ids = set()

    for node_id in sorted_ids:
        node = nodes_by_id[node_id]
        node_type = node["type"]

        if node_type == "SOURCE":
            # Resolve source path
            config = node.get("config", {})
            if "path" in config:
                path = recipe_dir / config["path"]
                outputs[node_id] = path.resolve()
                source_paths[node_id] = outputs[node_id]
            expanded.append(node)
            expanded_ids.add(node_id)

        elif node_type == "SEGMENT":
            # Check if this segment's input is resolved
            inputs = node.get("inputs", [])
            if inputs and inputs[0] in outputs:
                input_path = outputs[inputs[0]]
                if isinstance(input_path, Path):
                    # Skip pre-execution if config contains unresolved bindings
                    seg_config = node.get("config", {})
                    has_binding = any(
                        isinstance(v, Binding) or (isinstance(v, dict) and v.get("_binding"))
                        for v in [seg_config.get("start"), seg_config.get("duration"), seg_config.get("end")]
                        if v is not None
                    )
                    if not has_binding:
                        # Pre-execute segment to get output path
                        # This is needed if ANALYZE depends on this segment
                        import sys
                        print(f" Pre-executing segment: {node_id[:16]}...", file=sys.stderr)
                        output_path = _pre_execute_segment(node, input_path, work_dir)
                        outputs[node_id] = output_path
                        pre_executed.add(node_id)
            # SEGMENT node stays in the plan whether or not it was pre-executed.
            expanded.append(node)
            expanded_ids.add(node_id)

        elif node_type == "ANALYZE":
            # Get or run analysis
            config = node.get("config", {})
            analysis_name = node.get("name") or config.get("analyzer")

            # Check for pre-computed analysis first
            if pre_analysis and analysis_name and analysis_name in pre_analysis:
                import sys
                print(f" Using pre-computed analysis: {analysis_name}", file=sys.stderr)
                results = pre_analysis[analysis_name]
            else:
                # Run analyzer to get concrete data
                analyzer_path = config.get("analyzer_path")
                node_inputs = node.get("inputs", [])
                if not node_inputs:
                    raise ValueError(f"ANALYZE node {node_id} has no inputs")

                # Get input path - could be SOURCE or pre-executed SEGMENT
                input_id = node_inputs[0]
                input_path = outputs.get(input_id)
                if input_path is None:
                    raise ValueError(
                        f"ANALYZE input {input_id} not resolved. "
                        "Check that input SOURCE or SEGMENT exists."
                    )
                if not isinstance(input_path, Path):
                    raise ValueError(
                        f"ANALYZE input {input_id} is not a file path: {type(input_path)}"
                    )

                if analyzer_path:
                    full_path = recipe_dir / analyzer_path
                    # Everything except bookkeeping keys is passed to the analyzer.
                    params = {k: v for k, v in config.items() if k not in ("analyzer", "analyzer_path", "cid")}
                    import sys
                    print(f" Running analyzer: {config.get('analyzer', 'unknown')}", file=sys.stderr)
                    results = _run_analyzer(full_path, input_path, params)
                else:
                    raise ValueError(f"ANALYZE node {node_id} missing analyzer_path")

            analysis_results[node_id] = results
            outputs[node_id] = results

            # Store by name for effect binding resolution
            if analysis_name:
                named_analysis[analysis_name] = results

            if on_analysis:
                on_analysis(node_id, results)

            # Keep ANALYZE node in plan (it produces a JSON artifact)
            expanded.append(node)
            expanded_ids.add(node_id)

        elif node_type == "SLICE_ON":
            # Expand into primitives using analysis results
            inputs = node.get("inputs", [])
            config = node.get("config", {})

            # Lambda mode can have just 1 input (analysis), legacy needs 2 (video + analysis)
            has_lambda = "fn" in config
            if has_lambda:
                if len(inputs) < 1:
                    raise ValueError(f"SLICE_ON {node_id} requires analysis input")
                analysis_id = inputs[0]  # First input is analysis
            else:
                if len(inputs) < 2:
                    raise ValueError(f"SLICE_ON {node_id} requires video and analysis inputs")
                analysis_id = inputs[1]

            if analysis_id not in analysis_results:
                raise ValueError(
                    f"SLICE_ON {node_id} analysis input {analysis_id} not found"
                )

            # Build sources map: name -> node_id
            # This lets the lambda reference videos by name
            sources = {}
            for n in nodes:
                if n.get("name"):
                    sources[n["name"]] = n["id"]

            analysis_data = analysis_results[analysis_id]
            slice_nodes = _expand_slice_on(node, analysis_data, registry, sources, cluster_key, encoding, named_analysis)
            for sn in slice_nodes:
                # De-duplicate: expansion may regenerate nodes already emitted.
                if sn["id"] not in expanded_ids:
                    expanded.append(sn)
                    expanded_ids.add(sn["id"])

        elif node_type == "CONSTRUCT":
            # Expand user-defined construct
            config = node.get("config", {})
            construct_name = config.get("construct_name")
            construct_path = config.get("construct_path")
            if not construct_path:
                raise ValueError(f"CONSTRUCT {node_id} missing path")

            # Build sources map
            sources = {}
            for n in nodes:
                if n.get("name"):
                    sources[n["name"]] = n["id"]

            # Get analysis data if referenced
            inputs = node.get("inputs", [])
            analysis_data = {}
            for inp in inputs:
                if inp in analysis_results:
                    analysis_data[inp] = analysis_results[inp]

            construct_nodes = _expand_construct(
                node, registry, sources, analysis_data, recipe_dir, cluster_key, encoding
            )
            for cn in construct_nodes:
                if cn["id"] not in expanded_ids:
                    expanded.append(cn)
                    expanded_ids.add(cn["id"])

        else:
            # Keep other nodes as-is
            expanded.append(node)
            expanded_ids.add(node_id)

    return expanded, named_analysis


def create_plan(
    recipe: CompiledRecipe,
    inputs: Optional[Dict[str, str]] = None,
    recipe_dir: Optional[Path] = None,
    cluster_key: Optional[str] = None,
    on_analysis: Optional[Callable[[str, Dict], None]] = None,
    pre_analysis: Optional[Dict[str, Dict]] = None,
) -> ExecutionPlanSexp:
    """
    Create an execution plan from a compiled recipe.
    Args:
        recipe: Compiled S-expression recipe
        inputs: Mapping of input names to content hashes
        recipe_dir: Directory for resolving relative paths (required for analyzers)
        cluster_key: Optional cluster key for cache isolation
        on_analysis: Callback when analysis completes (node_id, results)
        pre_analysis: Pre-computed analysis data (name -> results), skips running analyzers

    Returns:
        ExecutionPlanSexp with all cache IDs computed

    Example:
        >>> recipe = compile_string('(recipe "test" (-> (source cat) (effect identity)))')
        >>> plan = create_plan(recipe, inputs={}, recipe_dir=Path("."))
        >>> print(plan.to_string())
    """
    inputs = inputs or {}

    # Compute source hash as CID (SHA256 of raw bytes) - this IS the content address
    source_hash = hashlib.sha256(recipe.source_text.encode('utf-8')).hexdigest() if recipe.source_text else ""

    # Compute params hash (use JSON + SHA256 for consistency with cache.py)
    if recipe.resolved_params:
        import json
        params_str = json.dumps(recipe.resolved_params, sort_keys=True, default=str)
        params_hash = hashlib.sha256(params_str.encode()).hexdigest()
    else:
        params_hash = ""

    # Check if recipe has expandable nodes (SLICE_ON, etc.)
    has_expandable = any(n["type"] in EXPANDABLE_TYPES for n in recipe.nodes)

    named_analysis = {}
    if has_expandable:
        if recipe_dir is None:
            raise ValueError("recipe_dir required for recipes with SLICE_ON nodes")

        # Expand dynamic nodes (runs analyzers, expands SLICE_ON)
        source_paths = {}
        expanded_nodes, named_analysis = _expand_nodes(
            recipe.nodes, recipe.registry, recipe_dir, source_paths,
            cluster_key=cluster_key, on_analysis=on_analysis,
            encoding=recipe.encoding, pre_analysis=pre_analysis,
        )

        # Expand LIST inputs in SEQUENCE nodes
        expanded_nodes = _expand_list_inputs(expanded_nodes)

        # Collapse effect chains after expansion
        collapsed_nodes = _collapse_effect_chains(expanded_nodes, recipe.registry)
    else:
        # No expansion needed
        collapsed_nodes = _collapse_effect_chains(recipe.nodes, recipe.registry)

    # Build node lookup from collapsed nodes
    nodes_by_id = {node["id"]: node for node in collapsed_nodes}

    # Topological sort
    sorted_ids = _topological_sort(collapsed_nodes)

    # Create steps with resolved hashes
    steps = []
    cache_ids = {}  # step_id -> cache_id

    for node_id in sorted_ids:
        node = nodes_by_id[node_id]
        step = _create_step(
            node, recipe.registry, inputs, cache_ids, cluster_key,
        )
        steps.append(step)
        cache_ids[node_id] = step.cache_id

    # Compute levels
    _compute_levels(steps, nodes_by_id)

    # Handle stage-aware planning if recipe has stages
    stage_plans = []
    stage_order = []
    stage_levels = {}
    if recipe.stages:
        # Build mapping from node_id to stage
        node_to_stage = {}
        for stage in recipe.stages:
            for node_id in stage.node_ids:
                node_to_stage[node_id] = stage.name

        # Compute stage levels (for parallel execution)
        stage_levels = _compute_stage_levels(recipe.stages)

        # Tag each step with stage info
        for step in steps:
            if step.step_id in node_to_stage:
                step.stage = node_to_stage[step.step_id]

        # Build stage plans
        for stage_name in recipe.stage_order:
            stage = next(s for s in recipe.stages if s.name == stage_name)
            stage_steps = [s for s in steps if s.stage == stage_name]

            # Build output bindings with cache IDs
            output_cache_ids = {}
            for out_name, node_id in stage.output_bindings.items():
                if node_id in cache_ids:
                    output_cache_ids[out_name] = cache_ids[node_id]

            stage_plans.append(StagePlan(
                stage_name=stage_name,
                steps=stage_steps,
                requires=stage.requires,
                output_bindings=output_cache_ids,
                level=stage_levels.get(stage_name, 0),
            ))

        stage_order = recipe.stage_order

    # Compute plan ID from source CID + steps
    plan_content = {
        "source_cid": source_hash,
        "steps": [{"id": s.step_id, "cache_id": s.cache_id} for s in steps],
        "inputs": inputs,
    }
    plan_id = _stable_hash(plan_content, cluster_key)

    return ExecutionPlanSexp(
        plan_id=plan_id,
        source_hash=source_hash,
        params=recipe.resolved_params,
        params_hash=params_hash,
        steps=steps,
        output_step_id=recipe.output_node_id,
        inputs=inputs,
        analysis=named_analysis,
        stage_plans=stage_plans,
        stage_order=stage_order,
        stage_levels=stage_levels,
        effects_registry=recipe.registry.get("effects", {}),
        minimal_primitives=recipe.minimal_primitives,
    )


def _topological_sort(nodes: List[Dict]) -> List[str]:
    """Sort nodes in dependency order (inputs before consumers), depth-first."""
    nodes_by_id = {n["id"]: n for n in nodes}
    visited = set()
    order = []

    def visit(node_id: str):
        # Post-order DFS: a node is appended after all of its inputs.
        if node_id in visited:
            return
        visited.add(node_id)
        node = nodes_by_id.get(node_id)
        if node:
            for input_id in node.get("inputs", []):
                visit(input_id)
            order.append(node_id)

    for node in nodes:
        visit(node["id"])

    return order


def _create_step(
    node: Dict,
    registry: Dict,
    inputs: Dict[str, str],
    cache_ids: Dict[str, str],
    cluster_key: Optional[str] = None,
) -> PlanStep:
    """Create a PlanStep from a node definition, computing its cache ID."""
    node_id = node["id"]
    node_type = node["type"]
    config = dict(node.get("config", {}))
    node_inputs = node.get("inputs", [])

    # Resolve registry references
    resolved_config = _resolve_config(config, registry, inputs)

    # Get input cache IDs (direct graph inputs)
    input_cache_ids = [cache_ids[inp] for inp in node_inputs if inp in cache_ids]

    # Also include analysis_refs as dependencies (for binding resolution); these are
    # implicit inputs that affect the computation result
    analysis_refs = resolved_config.get("analysis_refs", [])
    analysis_cache_ids = [cache_ids[ref] for ref in analysis_refs if ref in cache_ids]

    # Compute cache ID including both inputs and analysis dependencies
    # (sorted so the hash is independent of input ordering)
    cache_content = {
        "node_type": node_type,
        "config": resolved_config,
        "inputs": sorted(input_cache_ids + analysis_cache_ids),
    }
    cache_id = _stable_hash(cache_content, cluster_key)

    return PlanStep(
        step_id=node_id,
        node_type=node_type,
        config=resolved_config,
        inputs=node_inputs,
        cache_id=cache_id,
    )


def _resolve_config(
    config: Dict,
    registry: Dict,
    inputs: Dict[str, str],
) -> Dict:
    """Resolve registry references in config to content hashes.

    Returns a new dict; the input config is not mutated.
    """
    resolved = {}

    for key, value in config.items():
        if key == "filter_chain" and isinstance(value, list):
            # Resolve each filter in the chain (for COMPOUND nodes)
            resolved_chain = []
            for filter_item in value:
                filter_config = filter_item.get("config", {})
                resolved_filter_config = _resolve_config(filter_config, registry, inputs)
                resolved_chain.append({
                    "type": filter_item["type"],
                    "config": resolved_filter_config,
                })
            resolved["filter_chain"] = resolved_chain
        elif key == "asset" and isinstance(value, str):
            # Resolve asset reference - use CID from registry
            if value in registry.get("assets", {}):
                resolved["cid"] = registry["assets"][value]["cid"]
            else:
                resolved["asset"] = value  # Keep as-is if not in registry
        elif key == "effect" and isinstance(value, str):
            # Resolve effect reference - keep name AND add CID/path
            resolved["effect"] = value
            if value in registry.get("effects", {}):
                effect_entry = registry["effects"][value]
                if effect_entry.get("cid"):
                    resolved["cid"] = effect_entry["cid"]
                if effect_entry.get("path"):
                    resolved["effect_path"] = effect_entry["path"]
        elif key == "input" and value is True:
            # Variable input - resolve from inputs dict
            input_name = config.get("name", "input")
            if input_name in inputs:
                resolved["hash"] = inputs[input_name]
            else:
                # Unresolved: keep the marker so the caller can supply it later.
                resolved["input"] = True
                resolved["name"] = input_name
        elif key == "path":
            # Local file path - keep as-is for local execution
            resolved["path"] = value
        else:
            resolved[key] = value

    return resolved


def _compute_levels(steps: List[PlanStep], nodes_by_id: Dict) -> None:
    """Compute dependency levels for steps (mutates step.level in place).

    Considers both inputs (data dependencies) and analysis_refs (binding dependencies).
    """
    levels = {}

    def compute_level(step_id: str) -> int:
        # Memoized recursive longest-path depth from the roots.
        if step_id in levels:
            return levels[step_id]
        node = nodes_by_id.get(step_id)
        if not node:
            # Unknown dependency (e.g. external reference) sits at level 0.
            levels[step_id] = 0
            return 0

        # Collect all dependencies: inputs + analysis_refs
        deps = list(node.get("inputs", []))

        # Add analysis_refs as dependencies (for bindings to analysis data)
        config = node.get("config", {})
        analysis_refs = config.get("analysis_refs", [])
        deps.extend(analysis_refs)

        if not deps:
            levels[step_id] = 0
            return 0
        max_dep = max(compute_level(dep) for dep in deps)
        levels[step_id] = max_dep + 1
        return levels[step_id]

    for step in steps:
        step.level = compute_level(step.step_id)


def _compute_stage_levels(stages: List) -> Dict[str, int]:
    """
    Compute stage levels for parallel execution.

    Stages at the same level have no dependencies between them
    and can run in parallel.
    """
    from .compiler import CompiledStage

    levels = {}

    def compute_level(stage_name: str) -> int:
        # Memoized recursion over the stage `requires` graph.
        if stage_name in levels:
            return levels[stage_name]
        stage = next((s for s in stages if s.name == stage_name), None)
        if not stage or not stage.requires:
            levels[stage_name] = 0
            return 0
        max_req = max(compute_level(req) for req in stage.requires)
        levels[stage_name] = max_req + 1
        return levels[stage_name]

    for stage in stages:
        compute_level(stage.name)

    return levels


def step_to_task_sexp(step: PlanStep) -> List:
    """
    Convert a step to a minimal S-expression for Celery task.

    This is the S-expression that gets sent to a worker.
    The worker hashes this to verify cache_id.
""" sexp = [Symbol(step.node_type.lower())] # Add resolved config for key, value in step.config.items(): sexp.extend([Keyword(key), value]) # Add input cache IDs (not step IDs) if step.inputs: sexp.extend([Keyword("inputs"), step.inputs]) return sexp def task_cache_id(task_sexp: List, cluster_key: str = None) -> str: """ Compute cache ID from task S-expression. This allows workers to verify they're executing the right task. """ # Serialize S-expression to canonical form canonical = serialize(task_sexp) return _stable_hash({"sexp": canonical}, cluster_key)