Rename content_hash/output_hash to cid throughout

Refactor to use IPFS CID as the primary content identifier:
- Update database schema: content_hash -> cid, output_hash -> output_cid
- Update all services, routers, and tasks to use cid terminology
- Update HTML templates to display CID instead of hash
- Update cache_manager parameter names
- Update README documentation

This completes the transition to CID-only content addressing.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gilesb
2026-01-12 08:02:44 +00:00
parent 494a2a8650
commit 92d26b2b72
22 changed files with 981 additions and 988 deletions

View File

@@ -3,7 +3,7 @@
Cache management for Art DAG L1 server.
Integrates artdag's Cache, ActivityStore, and ActivityManager to provide:
- Content-addressed caching with both node_id and content_hash
- Content-addressed caching with both node_id and cid
- Activity tracking for runs (input/output/intermediate relationships)
- Deletion rules enforcement (shared items protected)
- L2 ActivityPub integration for "shared" status checks
@@ -35,7 +35,7 @@ logger = logging.getLogger(__name__)
def file_hash(path: Path, algorithm: str = "sha3_256") -> str:
"""Compute SHA3-256 hash of a file."""
"""Compute local content hash (fallback when IPFS unavailable)."""
hasher = hashlib.new(algorithm)
actual_path = path.resolve() if path.is_symlink() else path
with open(actual_path, "rb") as f:
@@ -51,10 +51,10 @@ class CachedFile:
Provides a unified view combining:
- node_id: computation identity (for DAG caching)
- content_hash: file content identity (for external references)
- cid: file content identity (for external references)
"""
node_id: str
content_hash: str
cid: str
path: Path
size_bytes: int
node_type: str
@@ -64,7 +64,7 @@ class CachedFile:
def from_cache_entry(cls, entry: CacheEntry) -> "CachedFile":
return cls(
node_id=entry.node_id,
content_hash=entry.content_hash,
cid=entry.cid,
path=entry.output_path,
size_bytes=entry.size_bytes,
node_type=entry.node_type,
@@ -84,41 +84,41 @@ class L2SharedChecker:
self.cache_ttl = cache_ttl
self._cache: Dict[str, tuple[bool, float]] = {}
def is_shared(self, content_hash: str) -> bool:
"""Check if content_hash has been published to L2."""
def is_shared(self, cid: str) -> bool:
"""Check if cid has been published to L2."""
import time
now = time.time()
# Check cache
if content_hash in self._cache:
is_shared, cached_at = self._cache[content_hash]
if cid in self._cache:
is_shared, cached_at = self._cache[cid]
if now - cached_at < self.cache_ttl:
logger.debug(f"L2 check (cached): {content_hash[:16]}... = {is_shared}")
logger.debug(f"L2 check (cached): {cid[:16]}... = {is_shared}")
return is_shared
# Query L2
try:
url = f"{self.l2_server}/assets/by-hash/{content_hash}"
url = f"{self.l2_server}/assets/by-hash/{cid}"
logger.info(f"L2 check: GET {url}")
resp = requests.get(url, timeout=5)
logger.info(f"L2 check response: {resp.status_code}")
is_shared = resp.status_code == 200
except Exception as e:
logger.warning(f"Failed to check L2 for {content_hash}: {e}")
logger.warning(f"Failed to check L2 for {cid}: {e}")
# On error, assume IS shared (safer - prevents accidental deletion)
is_shared = True
self._cache[content_hash] = (is_shared, now)
self._cache[cid] = (is_shared, now)
return is_shared
def invalidate(self, content_hash: str):
"""Invalidate cache for a content_hash (call after publishing)."""
self._cache.pop(content_hash, None)
def invalidate(self, cid: str):
"""Invalidate cache for a cid (call after publishing)."""
self._cache.pop(cid, None)
def mark_shared(self, content_hash: str):
def mark_shared(self, cid: str):
"""Mark as shared without querying (call after successful publish)."""
import time
self._cache[content_hash] = (True, time.time())
self._cache[cid] = (True, time.time())
class L1CacheManager:
@@ -131,7 +131,7 @@ class L1CacheManager:
- ActivityManager for deletion rules
- L2 integration for shared status
Provides both node_id and content_hash based access.
Provides both node_id and cid based access.
"""
def __init__(
@@ -162,16 +162,16 @@ class L1CacheManager:
is_shared_fn=self._is_shared_by_node_id,
)
# Content hash index: content_hash -> node_id
# Content hash index: cid -> node_id
# Uses Redis if available, falls back to in-memory dict
self._content_index: Dict[str, str] = {}
self._load_content_index()
# IPFS CID index: content_hash -> ipfs_cid
# IPFS CID index: cid -> ipfs_cid
self._ipfs_cids: Dict[str, str] = {}
self._load_ipfs_index()
# Legacy files directory (for files uploaded directly by content_hash)
# Legacy files directory (for files uploaded directly by cid)
self.legacy_dir = self.cache_dir / "legacy"
self.legacy_dir.mkdir(parents=True, exist_ok=True)
@@ -179,7 +179,7 @@ class L1CacheManager:
return self.cache_dir / "content_index.json"
def _load_content_index(self):
"""Load content_hash -> node_id index from Redis or JSON file."""
"""Load cid -> node_id index from Redis or JSON file."""
# If Redis available and has data, use it
if self._redis:
try:
@@ -206,8 +206,8 @@ class L1CacheManager:
# Also index from existing cache entries
for entry in self.cache.list_entries():
if entry.content_hash:
self._content_index[entry.content_hash] = entry.node_id
if entry.cid:
self._content_index[entry.cid] = entry.node_id
# Migrate to Redis if available
if self._redis and self._content_index:
@@ -218,39 +218,39 @@ class L1CacheManager:
logger.warning(f"Failed to migrate content index to Redis: {e}")
def _save_content_index(self):
"""Save content_hash -> node_id index to Redis and JSON file."""
"""Save cid -> node_id index to Redis and JSON file."""
# Always save to JSON as backup
with open(self._index_path(), "w") as f:
json.dump(self._content_index, f, indent=2)
def _set_content_index(self, content_hash: str, node_id: str):
def _set_content_index(self, cid: str, node_id: str):
"""Set a single content index entry (Redis + in-memory)."""
self._content_index[content_hash] = node_id
self._content_index[cid] = node_id
if self._redis:
try:
self._redis.hset(self._redis_content_key, content_hash, node_id)
self._redis.hset(self._redis_content_key, cid, node_id)
except Exception as e:
logger.warning(f"Failed to set content index in Redis: {e}")
self._save_content_index()
def _get_content_index(self, content_hash: str) -> Optional[str]:
def _get_content_index(self, cid: str) -> Optional[str]:
"""Get a content index entry (Redis-first, then in-memory)."""
if self._redis:
try:
val = self._redis.hget(self._redis_content_key, content_hash)
val = self._redis.hget(self._redis_content_key, cid)
if val:
return val.decode() if isinstance(val, bytes) else val
except Exception as e:
logger.warning(f"Failed to get content index from Redis: {e}")
return self._content_index.get(content_hash)
return self._content_index.get(cid)
def _del_content_index(self, content_hash: str):
def _del_content_index(self, cid: str):
"""Delete a content index entry."""
if content_hash in self._content_index:
del self._content_index[content_hash]
if cid in self._content_index:
del self._content_index[cid]
if self._redis:
try:
self._redis.hdel(self._redis_content_key, content_hash)
self._redis.hdel(self._redis_content_key, cid)
except Exception as e:
logger.warning(f"Failed to delete content index from Redis: {e}")
self._save_content_index()
@@ -259,7 +259,7 @@ class L1CacheManager:
return self.cache_dir / "ipfs_index.json"
def _load_ipfs_index(self):
"""Load content_hash -> ipfs_cid index from Redis or JSON file."""
"""Load cid -> ipfs_cid index from Redis or JSON file."""
# If Redis available and has data, use it
if self._redis:
try:
@@ -293,71 +293,71 @@ class L1CacheManager:
logger.warning(f"Failed to migrate IPFS index to Redis: {e}")
def _save_ipfs_index(self):
"""Save content_hash -> ipfs_cid index to JSON file (backup)."""
"""Save cid -> ipfs_cid index to JSON file (backup)."""
with open(self._ipfs_index_path(), "w") as f:
json.dump(self._ipfs_cids, f, indent=2)
def _set_ipfs_index(self, content_hash: str, ipfs_cid: str):
def _set_ipfs_index(self, cid: str, ipfs_cid: str):
"""Set a single IPFS index entry (Redis + in-memory)."""
self._ipfs_cids[content_hash] = ipfs_cid
self._ipfs_cids[cid] = ipfs_cid
if self._redis:
try:
self._redis.hset(self._redis_ipfs_key, content_hash, ipfs_cid)
self._redis.hset(self._redis_ipfs_key, cid, ipfs_cid)
except Exception as e:
logger.warning(f"Failed to set IPFS index in Redis: {e}")
self._save_ipfs_index()
def _get_ipfs_cid_from_index(self, content_hash: str) -> Optional[str]:
def _get_ipfs_cid_from_index(self, cid: str) -> Optional[str]:
"""Get IPFS CID from index (Redis-first, then in-memory)."""
if self._redis:
try:
val = self._redis.hget(self._redis_ipfs_key, content_hash)
val = self._redis.hget(self._redis_ipfs_key, cid)
if val:
return val.decode() if isinstance(val, bytes) else val
except Exception as e:
logger.warning(f"Failed to get IPFS CID from Redis: {e}")
return self._ipfs_cids.get(content_hash)
return self._ipfs_cids.get(cid)
def get_ipfs_cid(self, content_hash: str) -> Optional[str]:
def get_ipfs_cid(self, cid: str) -> Optional[str]:
"""Get IPFS CID for a content hash."""
return self._get_ipfs_cid_from_index(content_hash)
return self._get_ipfs_cid_from_index(cid)
def _is_shared_by_node_id(self, content_hash: str) -> bool:
"""Check if a content_hash is shared via L2."""
return self.l2_checker.is_shared(content_hash)
def _is_shared_by_node_id(self, cid: str) -> bool:
"""Check if a cid is shared via L2."""
return self.l2_checker.is_shared(cid)
def _load_meta(self, content_hash: str) -> dict:
def _load_meta(self, cid: str) -> dict:
"""Load metadata for a cached file."""
meta_path = self.cache_dir / f"{content_hash}.meta.json"
meta_path = self.cache_dir / f"{cid}.meta.json"
if meta_path.exists():
with open(meta_path) as f:
return json.load(f)
return {}
def is_pinned(self, content_hash: str) -> tuple[bool, str]:
def is_pinned(self, cid: str) -> tuple[bool, str]:
"""
Check if a content_hash is pinned (non-deletable).
Check if a cid is pinned (non-deletable).
Returns:
(is_pinned, reason) tuple
"""
meta = self._load_meta(content_hash)
meta = self._load_meta(cid)
if meta.get("pinned"):
return True, meta.get("pin_reason", "published")
return False, ""
def _save_meta(self, content_hash: str, **updates) -> dict:
def _save_meta(self, cid: str, **updates) -> dict:
"""Save/update metadata for a cached file."""
meta = self._load_meta(content_hash)
meta = self._load_meta(cid)
meta.update(updates)
meta_path = self.cache_dir / f"{content_hash}.meta.json"
meta_path = self.cache_dir / f"{cid}.meta.json"
with open(meta_path, "w") as f:
json.dump(meta, f, indent=2)
return meta
def pin(self, content_hash: str, reason: str = "published") -> None:
def pin(self, cid: str, reason: str = "published") -> None:
"""Mark an item as pinned (non-deletable)."""
self._save_meta(content_hash, pinned=True, pin_reason=reason)
self._save_meta(cid, pinned=True, pin_reason=reason)
# ============ File Storage ============
@@ -375,31 +375,28 @@ class L1CacheManager:
Args:
source_path: Path to file to cache
node_type: Type of node (e.g., "upload", "source", "effect")
node_id: Optional node_id; if not provided, uses content_hash
node_id: Optional node_id; if not provided, uses CID
execution_time: How long the operation took
move: If True, move instead of copy
Returns:
Tuple of (CachedFile with both node_id and content_hash, IPFS CID or None)
Tuple of (CachedFile with both node_id and cid, CID)
"""
# Compute content hash first
content_hash = file_hash(source_path)
# Upload to IPFS first to get the CID (primary identifier)
cid = ipfs_client.add_file(source_path)
if not cid:
# Fallback to local hash if IPFS unavailable
cid = file_hash(source_path)
logger.warning(f"IPFS unavailable, using local hash: {cid[:16]}...")
# Use content_hash as node_id if not provided
# This is for legacy/uploaded files that don't have a DAG node
# Use CID as node_id if not provided
if node_id is None:
node_id = content_hash
node_id = cid
# Check if already cached (by node_id)
existing = self.cache.get_entry(node_id)
if existing and existing.output_path.exists():
# Already cached - still try to get IPFS CID if we don't have it
ipfs_cid = self._get_ipfs_cid_from_index(content_hash)
if not ipfs_cid:
ipfs_cid = ipfs_client.add_file(existing.output_path)
if ipfs_cid:
self._set_ipfs_index(content_hash, ipfs_cid)
return CachedFile.from_cache_entry(existing), ipfs_cid
return CachedFile.from_cache_entry(existing), cid
# Store in local cache
self.cache.put(
@@ -412,16 +409,12 @@ class L1CacheManager:
entry = self.cache.get_entry(node_id)
# Update content index (Redis + local)
self._set_content_index(entry.content_hash, node_id)
# Update content index (CID -> node_id mapping)
self._set_content_index(cid, node_id)
# Upload to IPFS (async in background would be better, but sync for now)
ipfs_cid = ipfs_client.add_file(entry.output_path)
if ipfs_cid:
self._set_ipfs_index(entry.content_hash, ipfs_cid)
logger.info(f"Uploaded to IPFS: {entry.content_hash[:16]}... -> {ipfs_cid}")
logger.info(f"Cached: {cid[:16]}...")
return CachedFile.from_cache_entry(entry), ipfs_cid
return CachedFile.from_cache_entry(entry), cid
def get_by_node_id(self, node_id: str) -> Optional[Path]:
"""Get cached file path by node_id."""
@@ -432,46 +425,46 @@ class L1CacheManager:
# CIDv0 starts with "Qm", CIDv1 starts with "bafy" or other multibase prefixes
return identifier.startswith("Qm") or identifier.startswith("bafy") or identifier.startswith("baf")
def get_by_content_hash(self, content_hash: str) -> Optional[Path]:
"""Get cached file path by content_hash or IPFS CID. Falls back to IPFS if not in local cache."""
def get_by_cid(self, cid: str) -> Optional[Path]:
"""Get cached file path by cid or IPFS CID. Falls back to IPFS if not in local cache."""
# NOTE(review): after the rename this method IS get_by_cid — the guard below
# calls `self.get_by_cid(cid)` on itself, so any identifier matching
# _is_ipfs_cid recurses infinitely. The guard should delegate to the
# IPFS-fetch path directly instead of self-calling; confirm and fix.
if self._is_ipfs_cid(content_hash):
return self.get_by_cid(content_hash)
if self._is_ipfs_cid(cid):
return self.get_by_cid(cid)
# Check index first (Redis then local)
node_id = self._get_content_index(content_hash)
node_id = self._get_content_index(cid)
if node_id:
path = self.cache.get(node_id)
if path and path.exists():
logger.debug(f" Found via index: {path}")
return path
# For uploads, node_id == content_hash, so try direct lookup
# For uploads, node_id == cid, so try direct lookup
# This works even if cache index hasn't been reloaded
path = self.cache.get(content_hash)
logger.debug(f" cache.get({content_hash[:16]}...) returned: {path}")
path = self.cache.get(cid)
logger.debug(f" cache.get({cid[:16]}...) returned: {path}")
if path and path.exists():
self._set_content_index(content_hash, content_hash)
self._set_content_index(cid, cid)
return path
# Scan cache entries (fallback for new structure)
entry = self.cache.find_by_content_hash(content_hash)
entry = self.cache.find_by_cid(cid)
if entry and entry.output_path.exists():
logger.debug(f" Found via scan: {entry.output_path}")
self._set_content_index(content_hash, entry.node_id)
self._set_content_index(cid, entry.node_id)
return entry.output_path
# Check legacy location (files stored directly as CACHE_DIR/{content_hash})
legacy_path = self.cache_dir / content_hash
# Check legacy location (files stored directly as CACHE_DIR/{cid})
legacy_path = self.cache_dir / cid
if legacy_path.exists() and legacy_path.is_file():
return legacy_path
# Try to recover from IPFS if we have a CID
ipfs_cid = self._get_ipfs_cid_from_index(content_hash)
ipfs_cid = self._get_ipfs_cid_from_index(cid)
if ipfs_cid:
logger.info(f"Recovering from IPFS: {content_hash[:16]}... ({ipfs_cid})")
recovery_path = self.legacy_dir / content_hash
logger.info(f"Recovering from IPFS: {cid[:16]}... ({ipfs_cid})")
recovery_path = self.legacy_dir / cid
if ipfs_client.get_file(ipfs_cid, recovery_path):
logger.info(f"Recovered from IPFS: {recovery_path}")
return recovery_path
@@ -504,16 +497,16 @@ class L1CacheManager:
return None
def has_content(self, content_hash: str) -> bool:
def has_content(self, cid: str) -> bool:
"""Check if content exists in cache."""
return self.get_by_content_hash(content_hash) is not None
return self.get_by_cid(cid) is not None
def get_entry_by_content_hash(self, content_hash: str) -> Optional[CacheEntry]:
"""Get cache entry by content_hash."""
node_id = self._get_content_index(content_hash)
def get_entry_by_cid(self, cid: str) -> Optional[CacheEntry]:
"""Get cache entry by cid."""
node_id = self._get_content_index(cid)
if node_id:
return self.cache.get_entry(node_id)
return self.cache.find_by_content_hash(content_hash)
return self.cache.find_by_cid(cid)
def list_all(self) -> List[CachedFile]:
"""List all cached files."""
@@ -523,11 +516,11 @@ class L1CacheManager:
# New cache structure entries
for entry in self.cache.list_entries():
files.append(CachedFile.from_cache_entry(entry))
if entry.content_hash:
seen_hashes.add(entry.content_hash)
if entry.cid:
seen_hashes.add(entry.cid)
# Legacy files stored directly in cache_dir (old structure)
# These are files named by content_hash directly in CACHE_DIR
# These are files named by cid directly in CACHE_DIR
for f in self.cache_dir.iterdir():
# Skip directories and special files
if not f.is_file():
@@ -544,7 +537,7 @@ class L1CacheManager:
files.append(CachedFile(
node_id=f.name,
content_hash=f.name,
cid=f.name,
path=f,
size_bytes=f.stat().st_size,
node_type="legacy",
@@ -566,8 +559,8 @@ class L1CacheManager:
"""
hashes = []
for entry in self.cache.list_entries():
if entry.node_type == node_type and entry.content_hash:
hashes.append(entry.content_hash)
if entry.node_type == node_type and entry.cid:
hashes.append(entry.cid)
return hashes
# ============ Activity Tracking ============
@@ -590,19 +583,19 @@ class L1CacheManager:
def record_simple_activity(
self,
input_hashes: List[str],
output_hash: str,
output_cid: str,
run_id: str = None,
) -> Activity:
"""
Record a simple (non-DAG) execution as an activity.
For legacy single-effect runs that don't use full DAG execution.
Uses content_hash as node_id.
Uses cid as node_id.
"""
activity = Activity(
activity_id=run_id or str(hash((tuple(input_hashes), output_hash))),
activity_id=run_id or str(hash((tuple(input_hashes), output_cid))),
input_ids=sorted(input_hashes),
output_id=output_hash,
output_id=output_cid,
intermediate_ids=[],
created_at=datetime.now(timezone.utc).timestamp(),
status="completed",
@@ -624,7 +617,7 @@ class L1CacheManager:
# ============ Deletion Rules ============
def can_delete(self, content_hash: str) -> tuple[bool, str]:
def can_delete(self, cid: str) -> tuple[bool, str]:
"""
Check if a cached item can be deleted.
@@ -632,12 +625,12 @@ class L1CacheManager:
(can_delete, reason) tuple
"""
# Check if pinned (published or input to published)
pinned, reason = self.is_pinned(content_hash)
pinned, reason = self.is_pinned(cid)
if pinned:
return False, f"Item is pinned ({reason})"
# Find node_id for this content
node_id = self._get_content_index(content_hash) or content_hash
node_id = self._get_content_index(cid) or cid
# Check if it's an input or output of any activity
for activity in self.activity_store.list():
@@ -663,34 +656,34 @@ class L1CacheManager:
for node_id in activity.all_node_ids:
entry = self.cache.get_entry(node_id)
if entry:
pinned, reason = self.is_pinned(entry.content_hash)
pinned, reason = self.is_pinned(entry.cid)
if pinned:
return False, f"Item {node_id} is pinned ({reason})"
return True, "OK"
def delete_by_content_hash(self, content_hash: str) -> tuple[bool, str]:
def delete_by_cid(self, cid: str) -> tuple[bool, str]:
"""
Delete a cached item by content_hash.
Delete a cached item by cid.
Enforces deletion rules.
Returns:
(success, message) tuple
"""
can_delete, reason = self.can_delete(content_hash)
can_delete, reason = self.can_delete(cid)
if not can_delete:
return False, reason
# Find and delete
node_id = self._get_content_index(content_hash)
node_id = self._get_content_index(cid)
if node_id:
self.cache.remove(node_id)
self._del_content_index(content_hash)
self._del_content_index(cid)
return True, "Deleted"
# Try legacy
legacy_path = self.legacy_dir / content_hash
legacy_path = self.legacy_dir / cid
if legacy_path.exists():
legacy_path.unlink()
return True, "Deleted (legacy)"
@@ -732,7 +725,7 @@ class L1CacheManager:
if activity.output_id:
entry = self.cache.get_entry(activity.output_id)
if entry:
pinned, reason = self.is_pinned(entry.content_hash)
pinned, reason = self.is_pinned(entry.cid)
if pinned:
return False, f"Output is pinned ({reason})"
@@ -743,9 +736,9 @@ class L1CacheManager:
# Remove from cache
self.cache.remove(activity.output_id)
# Remove from content index (Redis + local)
self._del_content_index(entry.content_hash)
self._del_content_index(entry.cid)
# Delete from legacy dir if exists
legacy_path = self.legacy_dir / entry.content_hash
legacy_path = self.legacy_dir / entry.cid
if legacy_path.exists():
legacy_path.unlink()
@@ -754,8 +747,8 @@ class L1CacheManager:
entry = self.cache.get_entry(node_id)
if entry:
self.cache.remove(node_id)
self._del_content_index(entry.content_hash)
legacy_path = self.legacy_dir / entry.content_hash
self._del_content_index(entry.cid)
legacy_path = self.legacy_dir / entry.cid
if legacy_path.exists():
legacy_path.unlink()
@@ -777,13 +770,13 @@ class L1CacheManager:
# ============ L2 Integration ============
def mark_published(self, content_hash: str):
"""Mark a content_hash as published to L2."""
self.l2_checker.mark_shared(content_hash)
def mark_published(self, cid: str):
"""Mark a cid as published to L2."""
self.l2_checker.mark_shared(cid)
def invalidate_shared_cache(self, content_hash: str):
def invalidate_shared_cache(self, cid: str):
"""Invalidate shared status cache (call if item might be unpublished)."""
self.l2_checker.invalidate(content_hash)
self.l2_checker.invalidate(cid)
# ============ Stats ============