Rename content_hash/output_hash to cid throughout

Refactor to use IPFS CID as the primary content identifier:
- Update database schema: content_hash -> cid, output_hash -> output_cid
- Update all services, routers, and tasks to use cid terminology
- Update HTML templates to display CID instead of hash
- Update cache_manager parameter names
- Update README documentation

This completes the transition to CID-only content addressing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gilesb
2026-01-12 08:02:44 +00:00
parent 494a2a8650
commit 92d26b2b72
22 changed files with 981 additions and 988 deletions

View File

@@ -81,8 +81,8 @@ def execute_step(
# Get L1 cache manager (IPFS-backed)
cache_mgr = get_cache_manager()
# Check if already cached (by cache_id as content_hash)
cached_path = cache_mgr.get_by_content_hash(step.cache_id)
# Check if already cached (by cache_id as cid)
cached_path = cache_mgr.get_by_cid(step.cache_id)
if cached_path:
logger.info(f"Step {step.step_id} already cached at {cached_path}")
@@ -141,14 +141,14 @@ def execute_step(
try:
# Handle SOURCE nodes
if step.node_type == "SOURCE":
content_hash = step.config.get("content_hash")
if not content_hash:
raise ValueError(f"SOURCE step missing content_hash")
cid = step.config.get("cid")
if not cid:
raise ValueError(f"SOURCE step missing cid")
# Look up in cache
path = cache_mgr.get_by_content_hash(content_hash)
path = cache_mgr.get_by_cid(cid)
if not path:
raise ValueError(f"SOURCE input not found in cache: {content_hash[:16]}...")
raise ValueError(f"SOURCE input not found in cache: {cid[:16]}...")
output_path = str(path)
complete_task(step.cache_id, worker_id, output_path)
@@ -165,7 +165,7 @@ def execute_step(
for item_id in step.config.get("items", []):
item_cache_id = input_cache_ids.get(item_id)
if item_cache_id:
path = cache_mgr.get_by_content_hash(item_cache_id)
path = cache_mgr.get_by_cid(item_cache_id)
if path:
item_paths.append(str(path))
@@ -190,7 +190,7 @@ def execute_step(
input_cache_id = input_cache_ids.get(input_step_id)
if not input_cache_id:
raise ValueError(f"No cache_id for input step: {input_step_id}")
path = cache_mgr.get_by_content_hash(input_cache_id)
path = cache_mgr.get_by_cid(input_cache_id)
if not path:
raise ValueError(f"Input not in cache: {input_cache_id[:16]}...")
input_paths.append(Path(path))
@@ -276,7 +276,7 @@ def execute_step(
"step_id": step.step_id,
"cache_id": step.cache_id,
"output_path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
"filter_count": len(filter_chain),
}
@@ -298,7 +298,7 @@ def execute_step(
if not input_cache_id:
raise ValueError(f"No cache_id for input step: {input_step_id}")
path = cache_mgr.get_by_content_hash(input_cache_id)
path = cache_mgr.get_by_cid(input_cache_id)
if not path:
raise ValueError(f"Input not in cache: {input_cache_id[:16]}...")
@@ -336,7 +336,7 @@ def execute_step(
"media_type": output_def.media_type,
"index": output_def.index,
"path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
})
else:
@@ -347,7 +347,7 @@ def execute_step(
"media_type": "video/mp4",
"index": 0,
"path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
})
@@ -362,7 +362,7 @@ def execute_step(
"name": step.name,
"cache_id": step.cache_id,
"output_path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
"outputs": outputs,
}

View File

@@ -140,7 +140,7 @@ def execute_step_sexp(
cache_mgr = get_cache_manager()
# Check if already cached
cached_path = cache_mgr.get_by_content_hash(cache_id)
cached_path = cache_mgr.get_by_cid(cache_id)
if cached_path:
logger.info(f"Step {step_id} already cached at {cached_path}")
@@ -202,7 +202,7 @@ def execute_step_sexp(
if not content_id:
raise ValueError("SOURCE step missing :cid or :hash")
path = cache_mgr.get_by_content_hash(content_id)
path = cache_mgr.get_by_cid(content_id)
if not path:
raise ValueError(f"SOURCE input not found: {content_id[:16]}...")
@@ -226,7 +226,7 @@ def execute_step_sexp(
input_paths = []
for inp in inputs:
inp_cache_id = input_cache_ids.get(inp, inp)
path = cache_mgr.get_by_content_hash(inp_cache_id)
path = cache_mgr.get_by_cid(inp_cache_id)
if not path:
raise ValueError(f"Input not found: {inp_cache_id[:16]}...")
input_paths.append(Path(path))
@@ -261,7 +261,7 @@ def execute_step_sexp(
input_paths = []
for inp in inputs:
inp_cache_id = input_cache_ids.get(inp, inp)
path = cache_mgr.get_by_content_hash(inp_cache_id)
path = cache_mgr.get_by_cid(inp_cache_id)
if not path:
raise ValueError(f"Input not found: {inp_cache_id[:16]}...")
input_paths.append(Path(path))
@@ -366,7 +366,7 @@ def execute_step_sexp(
"step_id": step_id,
"cache_id": cache_id,
"output_path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
"filter_count": len(filter_chain),
}
@@ -386,7 +386,7 @@ def execute_step_sexp(
input_paths = []
for inp in inputs:
inp_cache_id = input_cache_ids.get(inp, inp)
path = cache_mgr.get_by_content_hash(inp_cache_id)
path = cache_mgr.get_by_cid(inp_cache_id)
if not path:
raise ValueError(f"Input not found: {inp_cache_id[:16]}...")
input_paths.append(Path(path))
@@ -420,7 +420,7 @@ def execute_step_sexp(
"step_id": step_id,
"cache_id": cache_id,
"output_path": str(cached_file.path),
"content_hash": cached_file.content_hash,
"cid": cached_file.cid,
"ipfs_cid": ipfs_cid,
}

View File

@@ -80,8 +80,8 @@ def run_plan(
cache_ids[step.step_id] = step.cache_id
# Also map input hashes
for name, content_hash in plan.input_hashes.items():
cache_ids[name] = content_hash
for name, cid in plan.input_hashes.items():
cache_ids[name] = cid
# Group steps by level
steps_by_level = plan.get_steps_by_level()
@@ -103,7 +103,7 @@ def run_plan(
for step in level_steps:
# Check if cached
cached_path = cache_mgr.get_by_content_hash(step.cache_id)
cached_path = cache_mgr.get_by_cid(step.cache_id)
if cached_path:
results_by_step[step.step_id] = {
"status": "cached",
@@ -171,7 +171,7 @@ def run_plan(
output_name = plan.output_name
if output_cache_id:
output_path = cache_mgr.get_by_content_hash(output_cache_id)
output_path = cache_mgr.get_by_cid(output_cache_id)
output_ipfs_cid = cache_mgr.get_ipfs_cid(output_cache_id)
# Build list of all outputs with their names and artifacts
@@ -183,7 +183,7 @@ def run_plan(
# If no outputs in result, build from step definition
if not step_outputs and step.outputs:
for output_def in step.outputs:
output_cache_path = cache_mgr.get_by_content_hash(output_def.cache_id)
output_cache_path = cache_mgr.get_by_cid(output_def.cache_id)
output_ipfs = cache_mgr.get_ipfs_cid(output_def.cache_id) if output_cache_path else None
all_outputs.append({
"name": output_def.name,
@@ -318,28 +318,28 @@ def run_recipe(
node_id = analysis_node["node_id"]
# Resolve input reference to content hash
content_hash = input_hashes.get(input_ref)
if not content_hash:
cid = input_hashes.get(input_ref)
if not cid:
logger.warning(f"Analysis node {node_id}: input '{input_ref}' not in input_hashes")
continue
path = cache_mgr.get_by_content_hash(content_hash)
path = cache_mgr.get_by_cid(cid)
if not path:
logger.warning(f"Analysis node {node_id}: content {content_hash[:16]}... not in cache")
logger.warning(f"Analysis node {node_id}: content {cid[:16]}... not in cache")
continue
try:
# Run analysis for the specific feature
features = [feature] if feature else ["beats", "energy"]
result = analyzer.analyze(
input_hash=content_hash,
input_hash=cid,
features=features,
input_path=Path(path),
)
# Store result keyed by node_id so plan can reference it
analysis_results[node_id] = result
# Also store by content_hash for compatibility
analysis_results[content_hash] = result
# Also store by cid for compatibility
analysis_results[cid] = result
logger.info(f"Analysis {node_id}: feature={feature}, tempo={result.tempo}")
except Exception as e:
logger.warning(f"Analysis failed for {node_id}: {e}")
@@ -380,7 +380,7 @@ def run_recipe(
# Store in cache (content-addressed, auto-pins to IPFS)
# Plan is just another node output - no special treatment needed
cached, plan_ipfs_cid = cache_mgr.put(tmp_path, node_type="plan", move=True)
logger.info(f"Plan cached: hash={cached.content_hash}, ipfs={plan_ipfs_cid}")
logger.info(f"Plan cached: hash={cached.cid}, ipfs={plan_ipfs_cid}")
# Phase 4: Execute
logger.info("Phase 4: Executing plan...")
@@ -392,7 +392,7 @@ def run_recipe(
"run_id": run_id,
"recipe": compiled.name,
"plan_id": plan.plan_id,
"plan_cache_id": cached.content_hash,
"plan_cache_id": cached.cid,
"plan_ipfs_cid": plan_ipfs_cid,
"output_path": result.get("output_path"),
"output_cache_id": result.get("output_cache_id"),
@@ -454,21 +454,21 @@ def generate_plan(
feature = analysis_node["feature"]
node_id = analysis_node["node_id"]
content_hash = input_hashes.get(input_ref)
if not content_hash:
cid = input_hashes.get(input_ref)
if not cid:
continue
path = cache_mgr.get_by_content_hash(content_hash)
path = cache_mgr.get_by_cid(cid)
if path:
try:
features = [feature] if feature else ["beats", "energy"]
result = analyzer.analyze(
input_hash=content_hash,
input_hash=cid,
features=features,
input_path=Path(path),
)
analysis_results[node_id] = result
analysis_results[content_hash] = result
analysis_results[cid] = result
except Exception as e:
logger.warning(f"Analysis failed for {node_id}: {e}")

View File

@@ -67,7 +67,7 @@ def register_input_cid(
input_path: Local path to the input file
Returns:
Dict with 'cid' and 'content_hash'
Dict with 'status', 'cid', and 'path'
"""
import hashlib
@@ -77,7 +77,7 @@ def register_input_cid(
# Compute content hash
with open(path, "rb") as f:
content_hash = hashlib.sha3_256(f.read()).hexdigest()
cid = hashlib.sha3_256(f.read()).hexdigest()
# Add to IPFS
cid = ipfs_client.add_file(path)
@@ -89,7 +89,7 @@ def register_input_cid(
return {
"status": "completed",
"cid": cid,
"content_hash": content_hash,
"cid": cid,
"path": str(path),
}
@@ -426,7 +426,7 @@ def run_from_local(
return {"status": "failed", "phase": "register_input", "input": name, "error": result.get("error")}
input_cids[name] = result["cid"]
input_hashes[name] = result["content_hash"]
input_hashes[name] = result["cid"]
# Run the pipeline
return run_recipe_cid.apply_async(