Files
rose-ash/artdag/core/artdag/cache.py
giles 1a74d811f7
All checks were successful
Build and Deploy / build-and-deploy (push) Successful in 2m33s
Incorporate art-dag-mono repo into artdag/ subfolder
Merges full history from art-dag/mono.git into the monorepo
under the artdag/ directory. Contains: core (DAG engine),
l1 (Celery rendering server), l2 (ActivityPub registry),
common (shared templates/middleware), client (CLI), test (e2e).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

git-subtree-dir: artdag
git-subtree-mainline: 1a179de547
git-subtree-split: 4c2e716558
2026-02-27 09:07:23 +00:00

465 lines
14 KiB
Python

# primitive/cache.py
"""
Content-addressed file cache for node outputs.
Each node's output is stored at: cache_dir / node_id / output_file
This enables automatic reuse when the same operation is requested.
"""
import hashlib
import json
import logging
import shutil
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional
logger = logging.getLogger(__name__)
def _file_hash(path: Path, algorithm: str = "sha3_256") -> str:
"""
Compute content hash of a file.
Uses SHA-3 (Keccak) by default for quantum resistance.
"""
import hashlib
hasher = hashlib.new(algorithm)
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(65536), b""):
hasher.update(chunk)
return hasher.hexdigest()
@dataclass
class CacheEntry:
    """Metadata describing a single cached node output."""

    node_id: str
    output_path: Path
    created_at: float
    size_bytes: int
    node_type: str
    cid: str = ""  # Content identifier (IPFS CID or local hash)
    execution_time: float = 0.0

    def to_dict(self) -> Dict:
        """Serialize to a JSON-friendly dict (Path rendered as a string)."""
        return dict(
            node_id=self.node_id,
            output_path=str(self.output_path),
            created_at=self.created_at,
            size_bytes=self.size_bytes,
            node_type=self.node_type,
            cid=self.cid,
            execution_time=self.execution_time,
        )

    @classmethod
    def from_dict(cls, data: Dict) -> "CacheEntry":
        """Build an entry from a dict; accepts legacy "content_hash" as an alias for "cid"."""
        identifier = data.get("cid") or data.get("content_hash", "")
        return cls(
            node_id=data["node_id"],
            output_path=Path(data["output_path"]),
            created_at=data["created_at"],
            size_bytes=data["size_bytes"],
            node_type=data["node_type"],
            cid=identifier,
            execution_time=data.get("execution_time", 0.0),
        )
@dataclass
class CacheStats:
    """Running counters describing cache usage."""

    total_entries: int = 0
    total_size_bytes: int = 0
    hits: int = 0
    misses: int = 0
    hit_rate: float = 0.0

    def record_hit(self):
        """Count one cache hit and refresh the hit rate."""
        self.hits += 1
        self._update_rate()

    def record_miss(self):
        """Count one cache miss and refresh the hit rate."""
        self.misses += 1
        self._update_rate()

    def _update_rate(self):
        """Recompute hit_rate; stays 0.0 until the first lookup."""
        lookups = self.hits + self.misses
        if lookups:
            self.hit_rate = self.hits / lookups
        else:
            self.hit_rate = 0.0
class Cache:
"""
Code-addressed file cache.
The filesystem IS the index - no JSON index files needed.
Each node's hash is its directory name.
Structure:
cache_dir/
<hash>/
output.ext # Actual output file
metadata.json # Per-node metadata (optional)
"""
def __init__(self, cache_dir: Path | str):
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
self.stats = CacheStats()
def _node_dir(self, node_id: str) -> Path:
"""Get the cache directory for a node."""
return self.cache_dir / node_id
def _find_output_file(self, node_dir: Path) -> Optional[Path]:
"""Find the output file in a node directory."""
if not node_dir.exists() or not node_dir.is_dir():
return None
for f in node_dir.iterdir():
if f.is_file() and f.name.startswith("output."):
return f
return None
def get(self, node_id: str) -> Optional[Path]:
"""
Get cached output path for a node.
Checks filesystem directly - no in-memory index.
Returns the output path if cached, None otherwise.
"""
node_dir = self._node_dir(node_id)
output_file = self._find_output_file(node_dir)
if output_file:
self.stats.record_hit()
logger.debug(f"Cache hit: {node_id[:16]}...")
return output_file
self.stats.record_miss()
return None
def put(self, node_id: str, source_path: Path, node_type: str,
execution_time: float = 0.0, move: bool = False) -> Path:
"""
Store a file in the cache.
Args:
node_id: The code-addressed node ID (hash)
source_path: Path to the file to cache
node_type: Type of the node (for metadata)
execution_time: How long the node took to execute
move: If True, move the file instead of copying
Returns:
Path to the cached file
"""
node_dir = self._node_dir(node_id)
node_dir.mkdir(parents=True, exist_ok=True)
# Preserve extension
ext = source_path.suffix or ".out"
output_path = node_dir / f"output{ext}"
# Copy or move file (skip if already in place)
source_resolved = Path(source_path).resolve()
output_resolved = output_path.resolve()
if source_resolved != output_resolved:
if move:
shutil.move(source_path, output_path)
else:
shutil.copy2(source_path, output_path)
# Compute content hash (IPFS CID of the result)
cid = _file_hash(output_path)
# Store per-node metadata (optional, for stats/debugging)
metadata = {
"node_id": node_id,
"output_path": str(output_path),
"created_at": time.time(),
"size_bytes": output_path.stat().st_size,
"node_type": node_type,
"cid": cid,
"execution_time": execution_time,
}
metadata_path = node_dir / "metadata.json"
with open(metadata_path, "w") as f:
json.dump(metadata, f, indent=2)
logger.debug(f"Cached: {node_id[:16]}... ({metadata['size_bytes']} bytes)")
return output_path
def has(self, node_id: str) -> bool:
"""Check if a node is cached (without affecting stats)."""
return self._find_output_file(self._node_dir(node_id)) is not None
def remove(self, node_id: str) -> bool:
"""Remove a node from the cache."""
node_dir = self._node_dir(node_id)
if node_dir.exists():
shutil.rmtree(node_dir)
return True
return False
def clear(self):
"""Clear all cached entries."""
for node_dir in self.cache_dir.iterdir():
if node_dir.is_dir() and not node_dir.name.startswith("_"):
shutil.rmtree(node_dir)
self.stats = CacheStats()
def get_stats(self) -> CacheStats:
"""Get cache statistics (scans filesystem)."""
stats = CacheStats()
for node_dir in self.cache_dir.iterdir():
if node_dir.is_dir() and not node_dir.name.startswith("_"):
output_file = self._find_output_file(node_dir)
if output_file:
stats.total_entries += 1
stats.total_size_bytes += output_file.stat().st_size
stats.hits = self.stats.hits
stats.misses = self.stats.misses
stats.hit_rate = self.stats.hit_rate
return stats
def list_entries(self) -> List[CacheEntry]:
"""List all cache entries (scans filesystem)."""
entries = []
for node_dir in self.cache_dir.iterdir():
if node_dir.is_dir() and not node_dir.name.startswith("_"):
entry = self._load_entry_from_disk(node_dir.name)
if entry:
entries.append(entry)
return entries
def _load_entry_from_disk(self, node_id: str) -> Optional[CacheEntry]:
"""Load entry metadata from disk."""
node_dir = self._node_dir(node_id)
metadata_path = node_dir / "metadata.json"
output_file = self._find_output_file(node_dir)
if not output_file:
return None
if metadata_path.exists():
try:
with open(metadata_path) as f:
data = json.load(f)
return CacheEntry.from_dict(data)
except (json.JSONDecodeError, KeyError):
pass
# Fallback: create entry from filesystem
return CacheEntry(
node_id=node_id,
output_path=output_file,
created_at=output_file.stat().st_mtime,
size_bytes=output_file.stat().st_size,
node_type="unknown",
cid=_file_hash(output_file),
)
def get_entry(self, node_id: str) -> Optional[CacheEntry]:
"""Get cache entry metadata (without affecting stats)."""
return self._load_entry_from_disk(node_id)
def find_by_cid(self, cid: str) -> Optional[CacheEntry]:
"""Find a cache entry by its content hash (scans filesystem)."""
for entry in self.list_entries():
if entry.cid == cid:
return entry
return None
def prune(self, max_size_bytes: int = None, max_age_seconds: float = None) -> int:
"""
Prune cache based on size or age.
Args:
max_size_bytes: Remove oldest entries until under this size
max_age_seconds: Remove entries older than this
Returns:
Number of entries removed
"""
removed = 0
now = time.time()
entries = self.list_entries()
# Remove by age first
if max_age_seconds is not None:
for entry in entries:
if now - entry.created_at > max_age_seconds:
self.remove(entry.node_id)
removed += 1
# Then by size (remove oldest first)
if max_size_bytes is not None:
stats = self.get_stats()
if stats.total_size_bytes > max_size_bytes:
sorted_entries = sorted(entries, key=lambda e: e.created_at)
total_size = stats.total_size_bytes
for entry in sorted_entries:
if total_size <= max_size_bytes:
break
self.remove(entry.node_id)
total_size -= entry.size_bytes
removed += 1
return removed
def get_output_path(self, node_id: str, extension: str = ".mkv") -> Path:
"""Get the output path for a node (creates directory if needed)."""
node_dir = self._node_dir(node_id)
node_dir.mkdir(parents=True, exist_ok=True)
return node_dir / f"output{extension}"
# Effect storage methods
def _effects_dir(self) -> Path:
"""Get the effects subdirectory."""
effects_dir = self.cache_dir / "_effects"
effects_dir.mkdir(parents=True, exist_ok=True)
return effects_dir
def store_effect(self, source: str) -> str:
"""
Store an effect in the cache.
Args:
source: Effect source code
Returns:
Content hash (cache ID) of the effect
"""
import hashlib as _hashlib
# Compute content hash
cid = _hashlib.sha3_256(source.encode("utf-8")).hexdigest()
# Try to load full metadata if effects module available
try:
from .effects.loader import load_effect
loaded = load_effect(source)
meta_dict = loaded.meta.to_dict()
dependencies = loaded.dependencies
requires_python = loaded.requires_python
except ImportError:
# Fallback: store without parsed metadata
meta_dict = {}
dependencies = []
requires_python = ">=3.10"
effect_dir = self._effects_dir() / cid
effect_dir.mkdir(parents=True, exist_ok=True)
# Store source
source_path = effect_dir / "effect.py"
source_path.write_text(source, encoding="utf-8")
# Store metadata
metadata = {
"cid": cid,
"meta": meta_dict,
"dependencies": dependencies,
"requires_python": requires_python,
"stored_at": time.time(),
}
metadata_path = effect_dir / "metadata.json"
with open(metadata_path, "w") as f:
json.dump(metadata, f, indent=2)
logger.info(f"Stored effect '{loaded.meta.name}' with hash {cid[:16]}...")
return cid
def get_effect(self, cid: str) -> Optional[str]:
"""
Get effect source by content hash.
Args:
cid: SHA3-256 hash of effect source
Returns:
Effect source code if found, None otherwise
"""
effect_dir = self._effects_dir() / cid
source_path = effect_dir / "effect.py"
if not source_path.exists():
return None
return source_path.read_text(encoding="utf-8")
def get_effect_path(self, cid: str) -> Optional[Path]:
"""
Get path to effect source file.
Args:
cid: SHA3-256 hash of effect source
Returns:
Path to effect.py if found, None otherwise
"""
effect_dir = self._effects_dir() / cid
source_path = effect_dir / "effect.py"
if not source_path.exists():
return None
return source_path
def get_effect_metadata(self, cid: str) -> Optional[dict]:
"""
Get effect metadata by content hash.
Args:
cid: SHA3-256 hash of effect source
Returns:
Metadata dict if found, None otherwise
"""
effect_dir = self._effects_dir() / cid
metadata_path = effect_dir / "metadata.json"
if not metadata_path.exists():
return None
try:
with open(metadata_path) as f:
return json.load(f)
except (json.JSONDecodeError, KeyError):
return None
def has_effect(self, cid: str) -> bool:
"""Check if an effect is cached."""
effect_dir = self._effects_dir() / cid
return (effect_dir / "effect.py").exists()
def list_effects(self) -> List[dict]:
"""List all cached effects with their metadata."""
effects = []
effects_dir = self._effects_dir()
if not effects_dir.exists():
return effects
for effect_dir in effects_dir.iterdir():
if effect_dir.is_dir():
metadata = self.get_effect_metadata(effect_dir.name)
if metadata:
effects.append(metadata)
return effects
def remove_effect(self, cid: str) -> bool:
"""Remove an effect from the cache."""
effect_dir = self._effects_dir() / cid
if not effect_dir.exists():
return False
shutil.rmtree(effect_dir)
logger.info(f"Removed effect {cid[:16]}...")
return True