#!/usr/bin/env python3
"""
Unified content cache for artdag.

Design:
  - IPNS (cache_id) = computation hash, known BEFORE execution
      "What would be the result of running X with inputs Y?"
  - CID = content hash, known AFTER execution
      "What is this actual content?"

Structure:
  .cache/
    refs/                 # IPNS → CID mappings (computation → result)
      {cache_id}          # Text file containing the CID of the result
    content/              # Content-addressed storage
      {cid[:2]}/{cid}     # Actual content by CID
"""

import hashlib
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Optional, Dict, Any, Tuple

# Default cache location - can be overridden via ARTDAG_CACHE env var
DEFAULT_CACHE_DIR = Path(__file__).parent / ".cache"

# Read size used when hashing files, so large files are never loaded whole.
_HASH_CHUNK_SIZE = 1024 * 1024


def get_cache_dir() -> Path:
    """Return the cache root directory, creating it if needed.

    The location comes from the ARTDAG_CACHE environment variable. An unset
    or empty variable falls back to DEFAULT_CACHE_DIR (an empty value would
    otherwise resolve to the current working directory).
    """
    cache_dir = Path(os.environ.get("ARTDAG_CACHE") or DEFAULT_CACHE_DIR)
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir


def get_refs_dir() -> Path:
    """Return the refs directory (IPNS → CID mappings), creating it if needed."""
    refs_dir = get_cache_dir() / "refs"
    refs_dir.mkdir(parents=True, exist_ok=True)
    return refs_dir


def get_content_dir() -> Path:
    """Return the content directory (CID → content), creating it if needed."""
    content_dir = get_cache_dir() / "content"
    content_dir.mkdir(parents=True, exist_ok=True)
    return content_dir


# =============================================================================
# CID (Content Hash) Operations
# =============================================================================

def compute_cid(content: bytes) -> str:
    """Return the content ID (hex SHA256 digest) of ``content``."""
    return hashlib.sha256(content).hexdigest()


def compute_file_cid(file_path: Path) -> str:
    """Return the content ID of the file at ``file_path``.

    The file is hashed in fixed-size chunks, so the result equals
    ``compute_cid(file_bytes)`` without holding the whole file in memory.
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(_HASH_CHUNK_SIZE), b""):
            digest.update(chunk)
    return digest.hexdigest()


def compute_string_cid(text: str) -> str:
    """Return the content ID of ``text`` encoded as UTF-8."""
    return compute_cid(text.encode('utf-8'))


# =============================================================================
# Content Storage (by CID)
# =============================================================================

def _content_path(cid: str) -> Path:
    """Return the storage path for a CID: ``content/{cid[:2]}/{cid}``."""
    return get_content_dir() / cid[:2] / cid


def content_exists_by_cid(cid: str) -> Optional[Path]:
    """Return the path of stored content for ``cid``, or None if absent.

    Only a non-empty regular file counts as stored content. Checking
    ``is_file()`` (rather than mere existence) keeps an empty or malformed
    CID, which resolves to a directory, from being reported as content.
    """
    path = _content_path(cid)
    if path.is_file() and path.stat().st_size > 0:
        return path
    return None


def content_store_by_cid(cid: str, content: bytes) -> Path:
    """Write ``content`` to the storage path for ``cid`` and return that path.

    The caller is responsible for ``cid`` matching ``content``; it is not
    re-verified here.
    """
    path = _content_path(cid)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(content)
    return path


def content_store_file(file_path: Path) -> Tuple[str, Path]:
    """Store a file by its content hash. Returns (cid, path)."""
    content = file_path.read_bytes()
    cid = compute_cid(content)
    path = content_store_by_cid(cid, content)
    return cid, path


def content_store_string(text: str) -> Tuple[str, Path]:
    """Store a string (UTF-8 encoded) by its content hash. Returns (cid, path)."""
    content = text.encode('utf-8')
    cid = compute_cid(content)
    path = content_store_by_cid(cid, content)
    return cid, path


def content_get(cid: str) -> Optional[bytes]:
    """Return the stored bytes for ``cid``, or None if nothing is stored."""
    path = content_exists_by_cid(cid)
    if path is None:
        return None
    return path.read_bytes()


def content_get_string(cid: str) -> Optional[str]:
    """Return the stored content for ``cid`` decoded as UTF-8, or None."""
    content = content_get(cid)
    if content is None:
        return None
    return content.decode('utf-8')


# =============================================================================
# Refs (IPNS → CID mappings)
# =============================================================================

def _ref_path(cache_id: str) -> Path:
    """Return the path of the ref file for ``cache_id``."""
    return get_refs_dir() / cache_id


def ref_exists(cache_id: str) -> Optional[str]:
    """Return the CID recorded for ``cache_id``, or None if there is no ref.

    Surrounding whitespace in the ref file is stripped. ``is_file()`` is used
    so that an empty ``cache_id`` (which resolves to the refs directory
    itself) is reported as missing instead of raising on read.
    """
    path = _ref_path(cache_id)
    if path.is_file():
        return path.read_text().strip()
    return None


def ref_set(cache_id: str, cid: str) -> Path:
    """Set a ref (IPNS → CID mapping). Returns the ref file path."""
    path = _ref_path(cache_id)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(cid)
    return path


def ref_get_content(cache_id: str) -> Optional[bytes]:
    """Return content for ``cache_id`` (looks up the ref, then the content).

    Returns None when the ref is missing or empty, or when the content it
    points to is not stored.
    """
    cid = ref_exists(cache_id)
    if cid:
        return content_get(cid)
    return None


def ref_get_string(cache_id: str) -> Optional[str]:
    """Return content for ``cache_id`` decoded as UTF-8, or None."""
    content = ref_get_content(cache_id)
    if content is None:
        return None
    return content.decode('utf-8')


# =============================================================================
# High-level Cache Operations
# =============================================================================

def cache_store(cache_id: str, content: bytes) -> Tuple[str, Path]:
    """
    Store content with IPNS→CID indirection.

    Args:
        cache_id: Computation hash (IPNS address)
        content: Content to store

    Returns:
        (cid, path) tuple
    """
    cid = compute_cid(content)
    path = content_store_by_cid(cid, content)
    ref_set(cache_id, cid)
    return cid, path


def cache_store_file(cache_id: str, file_path: Path) -> Tuple[str, Path]:
    """Store a file with IPNS→CID indirection. Returns (cid, path)."""
    content = file_path.read_bytes()
    return cache_store(cache_id, content)


def cache_store_string(cache_id: str, text: str) -> Tuple[str, Path]:
    """Store a string (UTF-8) with IPNS→CID indirection. Returns (cid, path)."""
    return cache_store(cache_id, text.encode('utf-8'))


def cache_store_json(cache_id: str, data: Any) -> Tuple[str, Path]:
    """Store JSON data (serialised with indent=2) with IPNS→CID indirection."""
    text = json.dumps(data, indent=2)
    return cache_store_string(cache_id, text)


def cache_exists(cache_id: str) -> Optional[Path]:
    """Return the path of cached content for a computation, or None."""
    cid = ref_exists(cache_id)
    if cid:
        return content_exists_by_cid(cid)
    return None


def cache_get(cache_id: str) -> Optional[bytes]:
    """Get cached content by computation hash."""
    return ref_get_content(cache_id)


def cache_get_string(cache_id: str) -> Optional[str]:
    """Get cached string by computation hash."""
    return ref_get_string(cache_id)


def cache_get_json(cache_id: str) -> Optional[Any]:
    """Get cached JSON by computation hash, or None if nothing is cached."""
    text = cache_get_string(cache_id)
    if text is None:
        return None
    return json.loads(text)


def cache_get_path(cache_id: str) -> Optional[Path]:
    """Get path to cached content by computation hash (same as cache_exists)."""
    return cache_exists(cache_id)


# =============================================================================
# Plan Cache (convenience wrappers)
# =============================================================================

def _stable_hash_params(params: Dict[str, Any]) -> str:
    """Compute stable hash of params using JSON + SHA256 (consistent with CID).

    Keys are sorted so dict ordering does not affect the hash; values that
    are not JSON-serialisable are stringified via ``default=str``.
    """
    params_str = json.dumps(params, sort_keys=True, default=str)
    return hashlib.sha256(params_str.encode()).hexdigest()


def plan_cache_id(source_cid: str, params: Optional[Dict[str, Any]] = None) -> str:
    """
    Compute the cache_id (IPNS address) for a plan.

    Based on source CID + params. Name/version are just metadata.
    ``None`` and an empty dict produce the same cache_id.
    """
    key = f"plan:{source_cid}"
    if params:
        params_hash = _stable_hash_params(params)
        key = f"{key}:{params_hash}"
    return hashlib.sha256(key.encode()).hexdigest()


def plan_exists(source_cid: str, params: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """Check if a cached plan exists. Returns CID if found."""
    cache_id = plan_cache_id(source_cid, params)
    return ref_exists(cache_id)


def plan_store(source_cid: str, params: Optional[Dict[str, Any]], content: str) -> Tuple[str, str, Path]:
    """
    Store a plan in the cache.

    Returns:
        (cache_id, cid, path) tuple
    """
    cache_id = plan_cache_id(source_cid, params)
    cid, path = cache_store_string(cache_id, content)
    return cache_id, cid, path


def plan_load(source_cid: str, params: Optional[Dict[str, Any]] = None) -> Optional[str]:
    """Load a plan from cache. Returns plan content string."""
    cache_id = plan_cache_id(source_cid, params)
    return cache_get_string(cache_id)


def plan_get_path(source_cid: str, params: Optional[Dict[str, Any]] = None) -> Optional[Path]:
    """Get path to cached plan."""
    cache_id = plan_cache_id(source_cid, params)
    return cache_get_path(cache_id)


# =============================================================================
# Cache Listing
# =============================================================================

def _format_size(size: int) -> str:
    """Format a byte count using decimal units (B, KB, MB, GB)."""
    if size >= 1_000_000_000:
        return f"{size / 1_000_000_000:.1f}GB"
    elif size >= 1_000_000:
        return f"{size / 1_000_000:.1f}MB"
    elif size >= 1000:
        return f"{size / 1000:.1f}KB"
    else:
        return f"{size}B"


def _file_info(path: Path) -> Dict[str, Any]:
    """Collect path, name, size, formatted size and mtime for a file."""
    stat = path.stat()
    return {
        "path": path,
        "name": path.name,
        "size": stat.st_size,
        "size_str": _format_size(stat.st_size),
        "mtime": datetime.fromtimestamp(stat.st_mtime),
    }


def list_cache(verbose: bool = False) -> Dict[str, Any]:
    """List all cached items.

    Args:
        verbose: Accepted for interface compatibility; it does not currently
            change the result.

    Returns:
        A dict with:
          - "refs": one entry per ref file (file info plus "cache_id", "cid",
            and "content_size"/"content_size_str" when the content exists)
          - "content": one entry per stored content file (file info + "cid")
          - "summary": "total_refs", "total_content", "total_items"
            (refs + content), "total_size" (bytes of content) and
            "total_size_str"
    """
    refs_dir = get_refs_dir()
    content_dir = get_content_dir()

    result = {
        "refs": [],
        "content": [],
        "summary": {"total_items": 0, "total_size": 0},
    }

    # Refs
    if refs_dir.exists():
        for f in sorted(refs_dir.iterdir()):
            if not f.is_file():
                continue
            info = _file_info(f)
            info["cache_id"] = f.name
            info["cid"] = f.read_text().strip()
            # Attach the size of the referenced content when it is present
            content_path = content_exists_by_cid(info["cid"])
            if content_path:
                info["content_size"] = content_path.stat().st_size
                info["content_size_str"] = _format_size(info["content_size"])
            result["refs"].append(info)

    # Content
    if content_dir.exists():
        for subdir in sorted(content_dir.iterdir()):
            if not subdir.is_dir():
                continue
            for f in sorted(subdir.iterdir()):
                if f.is_file():
                    info = _file_info(f)
                    info["cid"] = f.name
                    result["content"].append(info)

    # Summary
    summary = result["summary"]
    summary["total_refs"] = len(result["refs"])
    summary["total_content"] = len(result["content"])
    summary["total_items"] = summary["total_refs"] + summary["total_content"]
    summary["total_size"] = sum(i["size"] for i in result["content"])
    summary["total_size_str"] = _format_size(summary["total_size"])

    return result


def print_cache_listing(verbose: bool = False):
    """Print cache listing to stdout.

    Shows at most the first 20 refs and the first 20 content items, followed
    by a summary. With ``verbose`` a hint for clearing the cache is appended.
    """
    info = list_cache(verbose)
    cache_dir = get_cache_dir()

    print(f"\nCache directory: {cache_dir}\n")

    # Refs summary
    if info["refs"]:
        print(f"=== Refs ({len(info['refs'])}) ===")
        for ref in info["refs"][:20]:  # Show first 20
            if 'content_size_str' in ref:
                content_info = f" → {ref['content_size_str']}"
            else:
                content_info = ""
            print(f" {ref['cache_id'][:16]}... → {ref['cid'][:16]}...{content_info}")
        if len(info["refs"]) > 20:
            print(f" ... and {len(info['refs']) - 20} more")
        print()

    # Content listing
    if info["content"]:
        print(f"=== Content ({len(info['content'])} items, {info['summary']['total_size_str']}) ===")
        for item in info["content"][:20]:
            print(f" {item['cid'][:16]}... {item['size_str']:>8} {item['mtime'].strftime('%Y-%m-%d %H:%M')}")
        if len(info["content"]) > 20:
            print(f" ... and {len(info['content']) - 20} more")
        print()

    print("=== Summary ===")
    print(f" Refs: {info['summary']['total_refs']}")
    print(f" Content: {info['summary']['total_content']} ({info['summary']['total_size_str']})")

    if verbose:
        print(f"\nTo clear cache: rm -rf {cache_dir}/*")


if __name__ == "__main__":
    import sys

    verbose = "-v" in sys.argv or "--verbose" in sys.argv
    print_cache_listing(verbose)