Rename content_hash/output_hash to cid throughout

Refactor to use IPFS CID as the primary content identifier:
- Update database schema: content_hash -> cid, output_hash -> output_cid
- Update all services, routers, and tasks to use cid terminology
- Update HTML templates to display CID instead of hash
- Update cache_manager parameter names
- Update README documentation

This completes the transition to CID-only content addressing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gilesb
2026-01-12 08:02:44 +00:00
parent 494a2a8650
commit 92d26b2b72
22 changed files with 981 additions and 988 deletions

View File

@@ -81,8 +81,8 @@ def execute_step(
# Get L1 cache manager (IPFS-backed)
cache_mgr = get_cache_manager()
# Check if already cached (by cache_id as content_hash)
cached_path = cache_mgr.get_by_content_hash(step.cache_id)
# Check if already cached (by cache_id as cid)
cached_path = cache_mgr.get_by_cid(step.cache_id)
if cached_path:
logger.info(f"Step {step.step_id} already cached at {cached_path}")
@@ -141,14 +141,14 @@ def execute_step(
try:
# Handle SOURCE nodes
if step.node_type == "SOURCE":
content_hash = step.config.get("content_hash")
if not content_hash:
raise ValueError(f"SOURCE step missing content_hash")
cid = step.config.get("cid")
if not cid:
raise ValueError(f"SOURCE step missing cid")
# Look up in cache
path = cache_mgr.get_by_content_hash(content_hash)
path = cache_mgr.get_by_cid(cid)
if not path:
raise ValueError(f"SOURCE input not found in cache: {content_hash[:16]}...")
raise ValueError(f"SOURCE input not found in cache: {cid[:16]}...")
output_path = str(path)
complete_task(step.cache_id, worker_id, output_path)
@@ -165,7 +165,7 @@ def execute_step(
for item_id in step.config.get("items", []):
item_cache_id = input_cache_ids.get(item_id)
if item_cache_id:
path = cache_mgr.get_by_content_hash(item_cache_id)
path = cache_mgr.get_by_cid(item_cache_id)
if path:
item_paths.append(str(path))
@@ -190,7 +190,7 @@ def execute_step(
input_cache_id = input_cache_ids.get(input_step_id)
if not input_cache_id:
raise ValueError(f"No cache_id for input step: {input_step_id}")
path = cache_mgr.get_by_content_hash(input_cache_id)
path = cache_mgr.get_by_cid(input_cache_id)
if not path:
raise ValueError(f"Input not in cache: {input_cache_id[:16]}...")
input_paths.append(Path(path))
@@ -276,7 +276,7 @@ def execute_step(
"step_id": step.step_id,
"cache_id": step.cache_id,
"output_path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
"filter_count": len(filter_chain),
}
@@ -298,7 +298,7 @@ def execute_step(
if not input_cache_id:
raise ValueError(f"No cache_id for input step: {input_step_id}")
path = cache_mgr.get_by_content_hash(input_cache_id)
path = cache_mgr.get_by_cid(input_cache_id)
if not path:
raise ValueError(f"Input not in cache: {input_cache_id[:16]}...")
@@ -336,7 +336,7 @@ def execute_step(
"media_type": output_def.media_type,
"index": output_def.index,
"path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
})
else:
@@ -347,7 +347,7 @@ def execute_step(
"media_type": "video/mp4",
"index": 0,
"path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
})
@@ -362,7 +362,7 @@ def execute_step(
"name": step.name,
"cache_id": step.cache_id,
"output_path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
"outputs": outputs,
}

View File

@@ -140,7 +140,7 @@ def execute_step_sexp(
cache_mgr = get_cache_manager()
# Check if already cached
cached_path = cache_mgr.get_by_content_hash(cache_id)
cached_path = cache_mgr.get_by_cid(cache_id)
if cached_path:
logger.info(f"Step {step_id} already cached at {cached_path}")
@@ -202,7 +202,7 @@ def execute_step_sexp(
if not content_id:
raise ValueError("SOURCE step missing :cid or :hash")
path = cache_mgr.get_by_content_hash(content_id)
path = cache_mgr.get_by_cid(content_id)
if not path:
raise ValueError(f"SOURCE input not found: {content_id[:16]}...")
@@ -226,7 +226,7 @@ def execute_step_sexp(
input_paths = []
for inp in inputs:
inp_cache_id = input_cache_ids.get(inp, inp)
path = cache_mgr.get_by_content_hash(inp_cache_id)
path = cache_mgr.get_by_cid(inp_cache_id)
if not path:
raise ValueError(f"Input not found: {inp_cache_id[:16]}...")
input_paths.append(Path(path))
@@ -261,7 +261,7 @@ def execute_step_sexp(
input_paths = []
for inp in inputs:
inp_cache_id = input_cache_ids.get(inp, inp)
path = cache_mgr.get_by_content_hash(inp_cache_id)
path = cache_mgr.get_by_cid(inp_cache_id)
if not path:
raise ValueError(f"Input not found: {inp_cache_id[:16]}...")
input_paths.append(Path(path))
@@ -366,7 +366,7 @@ def execute_step_sexp(
"step_id": step_id,
"cache_id": cache_id,
"output_path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
"filter_count": len(filter_chain),
}
@@ -386,7 +386,7 @@ def execute_step_sexp(
input_paths = []
for inp in inputs:
inp_cache_id = input_cache_ids.get(inp, inp)
path = cache_mgr.get_by_content_hash(inp_cache_id)
path = cache_mgr.get_by_cid(inp_cache_id)
if not path:
raise ValueError(f"Input not found: {inp_cache_id[:16]}...")
input_paths.append(Path(path))
@@ -420,7 +420,7 @@ def execute_step_sexp(
"step_id": step_id,
"cache_id": cache_id,
"output_path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
}

View File

@@ -80,8 +80,8 @@ def run_plan(
cache_ids[step.step_id] = step.cache_id
# Also map input hashes
for name, content_hash in plan.input_hashes.items():
cache_ids[name] = content_hash
for name, cid in plan.input_hashes.items():
cache_ids[name] = cid
# Group steps by level
steps_by_level = plan.get_steps_by_level()
@@ -103,7 +103,7 @@ def run_plan(
for step in level_steps:
# Check if cached
cached_path = cache_mgr.get_by_content_hash(step.cache_id)
cached_path = cache_mgr.get_by_cid(step.cache_id)
if cached_path:
results_by_step[step.step_id] = {
"status": "cached",
@@ -171,7 +171,7 @@ def run_plan(
output_name = plan.output_name
if output_cache_id:
output_path = cache_mgr.get_by_content_hash(output_cache_id)
output_path = cache_mgr.get_by_cid(output_cache_id)
output_ipfs_cid = cache_mgr.get_ipfs_cid(output_cache_id)
# Build list of all outputs with their names and artifacts
@@ -183,7 +183,7 @@ def run_plan(
# If no outputs in result, build from step definition
if not step_outputs and step.outputs:
for output_def in step.outputs:
output_cache_path = cache_mgr.get_by_content_hash(output_def.cache_id)
output_cache_path = cache_mgr.get_by_cid(output_def.cache_id)
output_ipfs = cache_mgr.get_ipfs_cid(output_def.cache_id) if output_cache_path else None
all_outputs.append({
"name": output_def.name,
@@ -318,28 +318,28 @@ def run_recipe(
node_id = analysis_node["node_id"]
# Resolve input reference to content hash
content_hash = input_hashes.get(input_ref)
if not content_hash:
cid = input_hashes.get(input_ref)
if not cid:
logger.warning(f"Analysis node {node_id}: input '{input_ref}' not in input_hashes")
continue
path = cache_mgr.get_by_content_hash(content_hash)
path = cache_mgr.get_by_cid(cid)
if not path:
logger.warning(f"Analysis node {node_id}: content {content_hash[:16]}... not in cache")
logger.warning(f"Analysis node {node_id}: content {cid[:16]}... not in cache")
continue
try:
# Run analysis for the specific feature
features = [feature] if feature else ["beats", "energy"]
result = analyzer.analyze(
input_hash=content_hash,
input_hash=cid,
features=features,
input_path=Path(path),
)
# Store result keyed by node_id so plan can reference it
analysis_results[node_id] = result
# Also store by content_hash for compatibility
analysis_results[content_hash] = result
# Also store by cid for compatibility
analysis_results[cid] = result
logger.info(f"Analysis {node_id}: feature={feature}, tempo={result.tempo}")
except Exception as e:
logger.warning(f"Analysis failed for {node_id}: {e}")
@@ -380,7 +380,7 @@ def run_recipe(
# Store in cache (content-addressed, auto-pins to IPFS)
# Plan is just another node output - no special treatment needed
cached, plan_ipfs_cid = cache_mgr.put(tmp_path, node_type="plan", move=True)
logger.info(f"Plan cached: hash={cached.content_hash}, ipfs={plan_ipfs_cid}")
logger.info(f"Plan cached: hash={cached.cid}, ipfs={plan_ipfs_cid}")
# Phase 4: Execute
logger.info("Phase 4: Executing plan...")
@@ -392,7 +392,7 @@ def run_recipe(
"run_id": run_id,
"recipe": compiled.name,
"plan_id": plan.plan_id,
"plan_cache_id": cached.content_hash,
"plan_cache_id": cached.cid,
"plan_ipfs_cid": plan_ipfs_cid,
"output_path": result.get("output_path"),
"output_cache_id": result.get("output_cache_id"),
@@ -454,21 +454,21 @@ def generate_plan(
feature = analysis_node["feature"]
node_id = analysis_node["node_id"]
content_hash = input_hashes.get(input_ref)
if not content_hash:
cid = input_hashes.get(input_ref)
if not cid:
continue
path = cache_mgr.get_by_content_hash(content_hash)
path = cache_mgr.get_by_cid(cid)
if path:
try:
features = [feature] if feature else ["beats", "energy"]
result = analyzer.analyze(
input_hash=content_hash,
input_hash=cid,
features=features,
input_path=Path(path),
)
analysis_results[node_id] = result
analysis_results[content_hash] = result
analysis_results[cid] = result
except Exception as e:
logger.warning(f"Analysis failed for {node_id}: {e}")

View File

@@ -67,7 +67,7 @@ def register_input_cid(
input_path: Local path to the input file
Returns:
Dict with 'cid' and 'content_hash'
Dict with 'status', 'cid', and 'path'
"""
import hashlib
@@ -77,7 +77,7 @@ def register_input_cid(
# Compute content hash
with open(path, "rb") as f:
content_hash = hashlib.sha3_256(f.read()).hexdigest()
cid = hashlib.sha3_256(f.read()).hexdigest()
# Add to IPFS
cid = ipfs_client.add_file(path)
@@ -89,7 +89,7 @@ def register_input_cid(
return {
"status": "completed",
"cid": cid,
"content_hash": content_hash,
"cid": cid,
"path": str(path),
}
@@ -426,7 +426,7 @@ def run_from_local(
return {"status": "failed", "phase": "register_input", "input": name, "error": result.get("error")}
input_cids[name] = result["cid"]
input_hashes[name] = result["content_hash"]
input_hashes[name] = result["cid"]
# Run the pipeline
return run_recipe_cid.apply_async(