465 lines
14 KiB
Python
465 lines
14 KiB
Python
# primitive/cache.py
|
|
"""
|
|
Content-addressed file cache for node outputs.
|
|
|
|
Each node's output is stored at: cache_dir / node_id / output_file
|
|
This enables automatic reuse when the same operation is requested.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import shutil
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _file_hash(path: Path, algorithm: str = "sha3_256") -> str:
|
|
"""
|
|
Compute content hash of a file.
|
|
|
|
Uses SHA-3 (Keccak) by default for quantum resistance.
|
|
"""
|
|
import hashlib
|
|
hasher = hashlib.new(algorithm)
|
|
with open(path, "rb") as f:
|
|
for chunk in iter(lambda: f.read(65536), b""):
|
|
hasher.update(chunk)
|
|
return hasher.hexdigest()
|
|
|
|
|
|
@dataclass
class CacheEntry:
    """Metadata about a cached output."""
    node_id: str
    output_path: Path
    created_at: float
    size_bytes: int
    node_type: str
    cid: str = ""  # Content identifier (IPFS CID or local hash)
    execution_time: float = 0.0

    def to_dict(self) -> Dict:
        """Serialize to a JSON-compatible dict (output_path as str)."""
        return dict(
            node_id=self.node_id,
            output_path=str(self.output_path),
            created_at=self.created_at,
            size_bytes=self.size_bytes,
            node_type=self.node_type,
            cid=self.cid,
            execution_time=self.execution_time,
        )

    @classmethod
    def from_dict(cls, data: Dict) -> "CacheEntry":
        """Build an entry from a dict; accepts the legacy "content_hash" key for cid."""
        legacy_hash = data.get("content_hash", "")
        return cls(
            node_id=data["node_id"],
            output_path=Path(data["output_path"]),
            created_at=data["created_at"],
            size_bytes=data["size_bytes"],
            node_type=data["node_type"],
            cid=data.get("cid") or legacy_hash,
            execution_time=data.get("execution_time", 0.0),
        )
|
|
|
|
|
|
@dataclass
class CacheStats:
    """Statistics about cache usage."""
    total_entries: int = 0
    total_size_bytes: int = 0
    hits: int = 0
    misses: int = 0
    hit_rate: float = 0.0

    def record_hit(self):
        """Count one cache hit and refresh the hit rate."""
        self.hits += 1
        self._update_rate()

    def record_miss(self):
        """Count one cache miss and refresh the hit rate."""
        self.misses += 1
        self._update_rate()

    def _update_rate(self):
        """Recompute hits / (hits + misses); 0.0 when nothing recorded yet."""
        attempts = self.hits + self.misses
        self.hit_rate = 0.0 if attempts == 0 else self.hits / attempts
|
|
|
|
|
|
class Cache:
|
|
"""
|
|
Code-addressed file cache.
|
|
|
|
The filesystem IS the index - no JSON index files needed.
|
|
Each node's hash is its directory name.
|
|
|
|
Structure:
|
|
cache_dir/
|
|
<hash>/
|
|
output.ext # Actual output file
|
|
metadata.json # Per-node metadata (optional)
|
|
"""
|
|
|
|
def __init__(self, cache_dir: Path | str):
|
|
self.cache_dir = Path(cache_dir)
|
|
self.cache_dir.mkdir(parents=True, exist_ok=True)
|
|
self.stats = CacheStats()
|
|
|
|
def _node_dir(self, node_id: str) -> Path:
|
|
"""Get the cache directory for a node."""
|
|
return self.cache_dir / node_id
|
|
|
|
def _find_output_file(self, node_dir: Path) -> Optional[Path]:
|
|
"""Find the output file in a node directory."""
|
|
if not node_dir.exists() or not node_dir.is_dir():
|
|
return None
|
|
for f in node_dir.iterdir():
|
|
if f.is_file() and f.name.startswith("output."):
|
|
return f
|
|
return None
|
|
|
|
def get(self, node_id: str) -> Optional[Path]:
|
|
"""
|
|
Get cached output path for a node.
|
|
|
|
Checks filesystem directly - no in-memory index.
|
|
Returns the output path if cached, None otherwise.
|
|
"""
|
|
node_dir = self._node_dir(node_id)
|
|
output_file = self._find_output_file(node_dir)
|
|
|
|
if output_file:
|
|
self.stats.record_hit()
|
|
logger.debug(f"Cache hit: {node_id[:16]}...")
|
|
return output_file
|
|
|
|
self.stats.record_miss()
|
|
return None
|
|
|
|
def put(self, node_id: str, source_path: Path, node_type: str,
|
|
execution_time: float = 0.0, move: bool = False) -> Path:
|
|
"""
|
|
Store a file in the cache.
|
|
|
|
Args:
|
|
node_id: The code-addressed node ID (hash)
|
|
source_path: Path to the file to cache
|
|
node_type: Type of the node (for metadata)
|
|
execution_time: How long the node took to execute
|
|
move: If True, move the file instead of copying
|
|
|
|
Returns:
|
|
Path to the cached file
|
|
"""
|
|
node_dir = self._node_dir(node_id)
|
|
node_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Preserve extension
|
|
ext = source_path.suffix or ".out"
|
|
output_path = node_dir / f"output{ext}"
|
|
|
|
# Copy or move file (skip if already in place)
|
|
source_resolved = Path(source_path).resolve()
|
|
output_resolved = output_path.resolve()
|
|
if source_resolved != output_resolved:
|
|
if move:
|
|
shutil.move(source_path, output_path)
|
|
else:
|
|
shutil.copy2(source_path, output_path)
|
|
|
|
# Compute content hash (IPFS CID of the result)
|
|
cid = _file_hash(output_path)
|
|
|
|
# Store per-node metadata (optional, for stats/debugging)
|
|
metadata = {
|
|
"node_id": node_id,
|
|
"output_path": str(output_path),
|
|
"created_at": time.time(),
|
|
"size_bytes": output_path.stat().st_size,
|
|
"node_type": node_type,
|
|
"cid": cid,
|
|
"execution_time": execution_time,
|
|
}
|
|
metadata_path = node_dir / "metadata.json"
|
|
with open(metadata_path, "w") as f:
|
|
json.dump(metadata, f, indent=2)
|
|
|
|
logger.debug(f"Cached: {node_id[:16]}... ({metadata['size_bytes']} bytes)")
|
|
return output_path
|
|
|
|
def has(self, node_id: str) -> bool:
|
|
"""Check if a node is cached (without affecting stats)."""
|
|
return self._find_output_file(self._node_dir(node_id)) is not None
|
|
|
|
def remove(self, node_id: str) -> bool:
|
|
"""Remove a node from the cache."""
|
|
node_dir = self._node_dir(node_id)
|
|
if node_dir.exists():
|
|
shutil.rmtree(node_dir)
|
|
return True
|
|
return False
|
|
|
|
def clear(self):
|
|
"""Clear all cached entries."""
|
|
for node_dir in self.cache_dir.iterdir():
|
|
if node_dir.is_dir() and not node_dir.name.startswith("_"):
|
|
shutil.rmtree(node_dir)
|
|
self.stats = CacheStats()
|
|
|
|
def get_stats(self) -> CacheStats:
|
|
"""Get cache statistics (scans filesystem)."""
|
|
stats = CacheStats()
|
|
for node_dir in self.cache_dir.iterdir():
|
|
if node_dir.is_dir() and not node_dir.name.startswith("_"):
|
|
output_file = self._find_output_file(node_dir)
|
|
if output_file:
|
|
stats.total_entries += 1
|
|
stats.total_size_bytes += output_file.stat().st_size
|
|
stats.hits = self.stats.hits
|
|
stats.misses = self.stats.misses
|
|
stats.hit_rate = self.stats.hit_rate
|
|
return stats
|
|
|
|
def list_entries(self) -> List[CacheEntry]:
|
|
"""List all cache entries (scans filesystem)."""
|
|
entries = []
|
|
for node_dir in self.cache_dir.iterdir():
|
|
if node_dir.is_dir() and not node_dir.name.startswith("_"):
|
|
entry = self._load_entry_from_disk(node_dir.name)
|
|
if entry:
|
|
entries.append(entry)
|
|
return entries
|
|
|
|
def _load_entry_from_disk(self, node_id: str) -> Optional[CacheEntry]:
|
|
"""Load entry metadata from disk."""
|
|
node_dir = self._node_dir(node_id)
|
|
metadata_path = node_dir / "metadata.json"
|
|
output_file = self._find_output_file(node_dir)
|
|
|
|
if not output_file:
|
|
return None
|
|
|
|
if metadata_path.exists():
|
|
try:
|
|
with open(metadata_path) as f:
|
|
data = json.load(f)
|
|
return CacheEntry.from_dict(data)
|
|
except (json.JSONDecodeError, KeyError):
|
|
pass
|
|
|
|
# Fallback: create entry from filesystem
|
|
return CacheEntry(
|
|
node_id=node_id,
|
|
output_path=output_file,
|
|
created_at=output_file.stat().st_mtime,
|
|
size_bytes=output_file.stat().st_size,
|
|
node_type="unknown",
|
|
cid=_file_hash(output_file),
|
|
)
|
|
|
|
def get_entry(self, node_id: str) -> Optional[CacheEntry]:
|
|
"""Get cache entry metadata (without affecting stats)."""
|
|
return self._load_entry_from_disk(node_id)
|
|
|
|
def find_by_cid(self, cid: str) -> Optional[CacheEntry]:
|
|
"""Find a cache entry by its content hash (scans filesystem)."""
|
|
for entry in self.list_entries():
|
|
if entry.cid == cid:
|
|
return entry
|
|
return None
|
|
|
|
def prune(self, max_size_bytes: int = None, max_age_seconds: float = None) -> int:
|
|
"""
|
|
Prune cache based on size or age.
|
|
|
|
Args:
|
|
max_size_bytes: Remove oldest entries until under this size
|
|
max_age_seconds: Remove entries older than this
|
|
|
|
Returns:
|
|
Number of entries removed
|
|
"""
|
|
removed = 0
|
|
now = time.time()
|
|
entries = self.list_entries()
|
|
|
|
# Remove by age first
|
|
if max_age_seconds is not None:
|
|
for entry in entries:
|
|
if now - entry.created_at > max_age_seconds:
|
|
self.remove(entry.node_id)
|
|
removed += 1
|
|
|
|
# Then by size (remove oldest first)
|
|
if max_size_bytes is not None:
|
|
stats = self.get_stats()
|
|
if stats.total_size_bytes > max_size_bytes:
|
|
sorted_entries = sorted(entries, key=lambda e: e.created_at)
|
|
total_size = stats.total_size_bytes
|
|
for entry in sorted_entries:
|
|
if total_size <= max_size_bytes:
|
|
break
|
|
self.remove(entry.node_id)
|
|
total_size -= entry.size_bytes
|
|
removed += 1
|
|
|
|
return removed
|
|
|
|
def get_output_path(self, node_id: str, extension: str = ".mkv") -> Path:
|
|
"""Get the output path for a node (creates directory if needed)."""
|
|
node_dir = self._node_dir(node_id)
|
|
node_dir.mkdir(parents=True, exist_ok=True)
|
|
return node_dir / f"output{extension}"
|
|
|
|
# Effect storage methods
|
|
|
|
def _effects_dir(self) -> Path:
|
|
"""Get the effects subdirectory."""
|
|
effects_dir = self.cache_dir / "_effects"
|
|
effects_dir.mkdir(parents=True, exist_ok=True)
|
|
return effects_dir
|
|
|
|
def store_effect(self, source: str) -> str:
|
|
"""
|
|
Store an effect in the cache.
|
|
|
|
Args:
|
|
source: Effect source code
|
|
|
|
Returns:
|
|
Content hash (cache ID) of the effect
|
|
"""
|
|
import hashlib as _hashlib
|
|
|
|
# Compute content hash
|
|
cid = _hashlib.sha3_256(source.encode("utf-8")).hexdigest()
|
|
|
|
# Try to load full metadata if effects module available
|
|
try:
|
|
from .effects.loader import load_effect
|
|
loaded = load_effect(source)
|
|
meta_dict = loaded.meta.to_dict()
|
|
dependencies = loaded.dependencies
|
|
requires_python = loaded.requires_python
|
|
except ImportError:
|
|
# Fallback: store without parsed metadata
|
|
meta_dict = {}
|
|
dependencies = []
|
|
requires_python = ">=3.10"
|
|
|
|
effect_dir = self._effects_dir() / cid
|
|
effect_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# Store source
|
|
source_path = effect_dir / "effect.py"
|
|
source_path.write_text(source, encoding="utf-8")
|
|
|
|
# Store metadata
|
|
metadata = {
|
|
"cid": cid,
|
|
"meta": meta_dict,
|
|
"dependencies": dependencies,
|
|
"requires_python": requires_python,
|
|
"stored_at": time.time(),
|
|
}
|
|
metadata_path = effect_dir / "metadata.json"
|
|
with open(metadata_path, "w") as f:
|
|
json.dump(metadata, f, indent=2)
|
|
|
|
logger.info(f"Stored effect '{loaded.meta.name}' with hash {cid[:16]}...")
|
|
return cid
|
|
|
|
def get_effect(self, cid: str) -> Optional[str]:
|
|
"""
|
|
Get effect source by content hash.
|
|
|
|
Args:
|
|
cid: SHA3-256 hash of effect source
|
|
|
|
Returns:
|
|
Effect source code if found, None otherwise
|
|
"""
|
|
effect_dir = self._effects_dir() / cid
|
|
source_path = effect_dir / "effect.py"
|
|
|
|
if not source_path.exists():
|
|
return None
|
|
|
|
return source_path.read_text(encoding="utf-8")
|
|
|
|
def get_effect_path(self, cid: str) -> Optional[Path]:
|
|
"""
|
|
Get path to effect source file.
|
|
|
|
Args:
|
|
cid: SHA3-256 hash of effect source
|
|
|
|
Returns:
|
|
Path to effect.py if found, None otherwise
|
|
"""
|
|
effect_dir = self._effects_dir() / cid
|
|
source_path = effect_dir / "effect.py"
|
|
|
|
if not source_path.exists():
|
|
return None
|
|
|
|
return source_path
|
|
|
|
def get_effect_metadata(self, cid: str) -> Optional[dict]:
|
|
"""
|
|
Get effect metadata by content hash.
|
|
|
|
Args:
|
|
cid: SHA3-256 hash of effect source
|
|
|
|
Returns:
|
|
Metadata dict if found, None otherwise
|
|
"""
|
|
effect_dir = self._effects_dir() / cid
|
|
metadata_path = effect_dir / "metadata.json"
|
|
|
|
if not metadata_path.exists():
|
|
return None
|
|
|
|
try:
|
|
with open(metadata_path) as f:
|
|
return json.load(f)
|
|
except (json.JSONDecodeError, KeyError):
|
|
return None
|
|
|
|
def has_effect(self, cid: str) -> bool:
|
|
"""Check if an effect is cached."""
|
|
effect_dir = self._effects_dir() / cid
|
|
return (effect_dir / "effect.py").exists()
|
|
|
|
def list_effects(self) -> List[dict]:
|
|
"""List all cached effects with their metadata."""
|
|
effects = []
|
|
effects_dir = self._effects_dir()
|
|
|
|
if not effects_dir.exists():
|
|
return effects
|
|
|
|
for effect_dir in effects_dir.iterdir():
|
|
if effect_dir.is_dir():
|
|
metadata = self.get_effect_metadata(effect_dir.name)
|
|
if metadata:
|
|
effects.append(metadata)
|
|
|
|
return effects
|
|
|
|
def remove_effect(self, cid: str) -> bool:
|
|
"""Remove an effect from the cache."""
|
|
effect_dir = self._effects_dir() / cid
|
|
|
|
if not effect_dir.exists():
|
|
return False
|
|
|
|
shutil.rmtree(effect_dir)
|
|
logger.info(f"Removed effect {cid[:16]}...")
|
|
return True
|