- New streaming/ module for real-time video processing: - compositor.py: Main streaming compositor with cycle-crossfade - sexp_executor.py: Executes compiled sexp recipes in real-time - sexp_interp.py: Full S-expression interpreter for SLICE_ON Lambda - recipe_adapter.py: Bridges recipes to streaming compositor - sources.py: Video source with ffmpeg streaming - audio.py: Real-time audio analysis (energy, beats) - output.py: Preview (mpv) and file output with audio muxing - New templates/: - cycle-crossfade.sexp: Smooth zoom-based video cycling - process-pair.sexp: Dual-clip processing with effects - Key features: - Videos cycle in input-videos order (not definition order) - Cumulative whole-spin rotation - Zero-weight sources skip processing - Live audio-reactive effects - New effects: blend_multi for weighted layer compositing - Updated primitives and interpreter for streaming compatibility Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
405 lines
12 KiB
Python
405 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Unified content cache for artdag.
|
|
|
|
Design:
|
|
- IPNS (cache_id) = computation hash, known BEFORE execution
|
|
"What would be the result of running X with inputs Y?"
|
|
|
|
- CID = content hash, known AFTER execution
|
|
"What is this actual content?"
|
|
|
|
Structure:
|
|
.cache/
|
|
refs/ # IPNS → CID mappings (computation → result)
|
|
{cache_id} # Text file containing the CID of the result
|
|
content/ # Content-addressed storage
|
|
{cid[:2]}/{cid} # Actual content by CID
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any, Tuple
|
|
|
|
# Default cache location - can be overridden via ARTDAG_CACHE env var
DEFAULT_CACHE_DIR = Path(__file__).parent / ".cache"


def get_cache_dir() -> Path:
    """Return the cache root directory, creating it on first use.

    The ARTDAG_CACHE environment variable overrides the default
    location next to this module.
    """
    override = os.environ.get("ARTDAG_CACHE")
    cache_dir = Path(override) if override is not None else DEFAULT_CACHE_DIR
    cache_dir.mkdir(parents=True, exist_ok=True)
    return cache_dir
|
|
|
|
|
|
def get_refs_dir() -> Path:
    """Return the refs directory (IPNS → CID mappings), creating it if missing."""
    path = get_cache_dir() / "refs"
    path.mkdir(parents=True, exist_ok=True)
    return path
|
|
|
|
|
|
def get_content_dir() -> Path:
    """Return the content-addressed storage directory, creating it if missing."""
    path = get_cache_dir() / "content"
    path.mkdir(parents=True, exist_ok=True)
    return path
|
|
|
|
|
|
# =============================================================================
|
|
# CID (Content Hash) Operations
|
|
# =============================================================================
|
|
|
|
# Read size used when hashing files incrementally.
_HASH_CHUNK_SIZE = 1 << 20  # 1 MiB


def compute_cid(content: bytes) -> str:
    """Compute the content ID (hex SHA256 digest) for raw bytes."""
    return hashlib.sha256(content).hexdigest()


def compute_file_cid(file_path: Path) -> str:
    """Compute the content ID for a file.

    Hashes the file in fixed-size chunks so arbitrarily large files can
    be fingerprinted without loading them fully into memory; the result
    is identical to compute_cid(file_bytes).
    """
    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(_HASH_CHUNK_SIZE), b''):
            digest.update(chunk)
    return digest.hexdigest()


def compute_string_cid(text: str) -> str:
    """Compute the content ID for a string (UTF-8 encoded)."""
    return compute_cid(text.encode('utf-8'))
|
|
|
|
|
|
# =============================================================================
|
|
# Content Storage (by CID)
|
|
# =============================================================================
|
|
|
|
def _content_path(cid: str) -> Path:
    """Path where content with this CID lives (sharded by first two hex chars)."""
    shard = cid[:2]
    return get_content_dir() / shard / cid
|
|
|
|
|
|
def content_exists_by_cid(cid: str) -> Optional[Path]:
    """Return the path to the content for *cid*, or None if absent."""
    candidate = _content_path(cid)
    if not candidate.exists():
        return None
    if candidate.stat().st_size == 0:
        # Zero-byte entries read as absent — presumably a guard against
        # interrupted writes; confirm before ever caching empty content.
        return None
    return candidate
|
|
|
|
|
|
def content_store_by_cid(cid: str, content: bytes) -> Path:
    """Store content under its CID and return the resulting path.

    Writes to a temporary sibling file first, then moves it into place
    with os.replace(), so a reader never observes a partially written
    entry (a truncated/zero-byte file would otherwise look cached).
    """
    path = _content_path(cid)
    path.parent.mkdir(parents=True, exist_ok=True)
    # PID-suffixed temp name so concurrent writers don't trample each
    # other's temp file; both would write identical bytes anyway, since
    # storage is content-addressed.
    tmp = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    tmp.write_bytes(content)
    os.replace(tmp, path)
    return path
|
|
|
|
|
|
def content_store_file(file_path: Path) -> Tuple[str, Path]:
    """Store a file's bytes by content hash. Returns (cid, path)."""
    data = file_path.read_bytes()
    digest = compute_cid(data)
    return digest, content_store_by_cid(digest, data)
|
|
|
|
|
|
def content_store_string(text: str) -> Tuple[str, Path]:
    """Store a string (UTF-8 encoded) by content hash. Returns (cid, path)."""
    encoded = text.encode('utf-8')
    digest = compute_cid(encoded)
    return digest, content_store_by_cid(digest, encoded)
|
|
|
|
|
|
def content_get(cid: str) -> Optional[bytes]:
    """Return the bytes stored for *cid*, or None when not cached."""
    stored = content_exists_by_cid(cid)
    return stored.read_bytes() if stored is not None else None
|
|
|
|
|
|
def content_get_string(cid: str) -> Optional[str]:
    """Return the string stored for *cid*, or None when not cached.

    Checks `is not None` rather than truthiness so that empty content
    (should it ever be stored) decodes to "" instead of masquerading
    as a cache miss.
    """
    content = content_get(cid)
    if content is None:
        return None
    return content.decode('utf-8')
|
|
|
|
|
|
# =============================================================================
|
|
# Refs (IPNS → CID mappings)
|
|
# =============================================================================
|
|
|
|
def _ref_path(cache_id: str) -> Path:
    """Path of the ref file recording the CID for this cache_id."""
    return get_refs_dir().joinpath(cache_id)
|
|
|
|
|
|
def ref_exists(cache_id: str) -> Optional[str]:
    """Return the CID recorded for *cache_id*, or None if no ref exists."""
    ref = _ref_path(cache_id)
    if not ref.exists():
        return None
    return ref.read_text().strip()
|
|
|
|
|
|
def ref_set(cache_id: str, cid: str) -> Path:
    """Record the cache_id → CID mapping; returns the ref file path.

    Written via a temporary file + os.replace() so a concurrent reader
    never sees a truncated CID.
    """
    path = _ref_path(cache_id)
    path.parent.mkdir(parents=True, exist_ok=True)
    # PID-suffixed temp name avoids collisions between concurrent writers.
    tmp = path.with_name(f"{path.name}.{os.getpid()}.tmp")
    tmp.write_text(cid)
    os.replace(tmp, path)
    return path
|
|
|
|
|
|
def ref_get_content(cache_id: str) -> Optional[bytes]:
    """Resolve *cache_id* through its ref and return the content bytes, or None."""
    cid = ref_exists(cache_id)
    return content_get(cid) if cid else None
|
|
|
|
|
|
def ref_get_string(cache_id: str) -> Optional[str]:
    """Resolve *cache_id* and return the cached content as a UTF-8 string.

    Uses an `is not None` check so empty cached content (if it ever
    occurs) round-trips as "" rather than reading as a cache miss.
    """
    content = ref_get_content(cache_id)
    if content is None:
        return None
    return content.decode('utf-8')
|
|
|
|
|
|
# =============================================================================
|
|
# High-level Cache Operations
|
|
# =============================================================================
|
|
|
|
def cache_store(cache_id: str, content: bytes) -> Tuple[str, Path]:
    """
    Store content with IPNS→CID indirection.

    The bytes land in content-addressed storage under their CID, and a
    ref is written so the computation hash resolves to them.

    Args:
        cache_id: Computation hash (IPNS address)
        content: Content to store

    Returns:
        (cid, path) tuple
    """
    cid = compute_cid(content)
    stored_at = content_store_by_cid(cid, content)
    ref_set(cache_id, cid)
    return cid, stored_at
|
|
|
|
|
|
def cache_store_file(cache_id: str, file_path: Path) -> Tuple[str, Path]:
    """Store a file's bytes with IPNS→CID indirection."""
    return cache_store(cache_id, file_path.read_bytes())
|
|
|
|
|
|
def cache_store_string(cache_id: str, text: str) -> Tuple[str, Path]:
    """Store a string (UTF-8 encoded) with IPNS→CID indirection."""
    encoded = text.encode('utf-8')
    return cache_store(cache_id, encoded)
|
|
|
|
|
|
def cache_store_json(cache_id: str, data: Any) -> Tuple[str, Path]:
    """Store JSON-serializable data with IPNS→CID indirection."""
    serialized = json.dumps(data, indent=2)
    return cache_store_string(cache_id, serialized)
|
|
|
|
|
|
def cache_exists(cache_id: str) -> Optional[Path]:
    """Return the content path for a cached computation, or None."""
    cid = ref_exists(cache_id)
    return content_exists_by_cid(cid) if cid else None
|
|
|
|
|
|
def cache_get(cache_id: str) -> Optional[bytes]:
    """Get cached content by computation hash."""
    # Thin alias: ref lookup + content fetch in one step.
    return ref_get_content(cache_id)
|
|
|
|
|
|
def cache_get_string(cache_id: str) -> Optional[str]:
    """Get cached string by computation hash."""
    # Thin alias: ref lookup + UTF-8 decode in one step.
    return ref_get_string(cache_id)
|
|
|
|
|
|
def cache_get_json(cache_id: str) -> Optional[Any]:
    """Get cached JSON by computation hash.

    Returns None on a cache miss. NOTE(review): a cached JSON value of
    null is indistinguishable from a miss here (json.loads("null") is
    None), and empty/falsy cached text is also treated as a miss by the
    truthiness check — confirm callers never cache such values.
    """
    text = cache_get_string(cache_id)
    if text:
        return json.loads(text)
    return None
|
|
|
|
|
|
def cache_get_path(cache_id: str) -> Optional[Path]:
    """Get path to cached content by computation hash.

    Identical semantics to cache_exists(); delegated so the ref→content
    lookup logic lives in one place instead of being duplicated.
    """
    return cache_exists(cache_id)
|
|
|
|
|
|
# =============================================================================
|
|
# Plan Cache (convenience wrappers)
|
|
# =============================================================================
|
|
|
|
def _stable_hash_params(params: Dict[str, Any]) -> str:
    """Stable SHA256 of params via canonical (key-sorted) JSON.

    default=str keeps non-JSON values hashable; their repr must itself
    be stable for the hash to be stable.
    """
    canonical = json.dumps(params, sort_keys=True, default=str)
    return hashlib.sha256(canonical.encode()).hexdigest()


def plan_cache_id(source_cid: str, params: Dict[str, Any] = None) -> str:
    """
    Compute the cache_id (IPNS address) for a plan.

    Keyed on source CID plus (if non-empty) a stable hash of params;
    name/version are just metadata. None and {} hash identically.
    """
    if params:
        key = f"plan:{source_cid}:{_stable_hash_params(params)}"
    else:
        key = f"plan:{source_cid}"
    return hashlib.sha256(key.encode()).hexdigest()
|
|
|
|
|
|
def plan_exists(source_cid: str, params: Dict[str, Any] = None) -> Optional[str]:
    """Return the result CID for a cached plan, or None if absent."""
    return ref_exists(plan_cache_id(source_cid, params))
|
|
|
|
|
|
def plan_store(source_cid: str, params: Dict[str, Any], content: str) -> Tuple[str, str, Path]:
    """
    Store a plan in the cache.

    Returns:
        (cache_id, cid, path) tuple
    """
    cache_id = plan_cache_id(source_cid, params)
    stored_cid, stored_path = cache_store_string(cache_id, content)
    return cache_id, stored_cid, stored_path
|
|
|
|
|
|
def plan_load(source_cid: str, params: Dict[str, Any] = None) -> Optional[str]:
    """Load a cached plan's content string, or None on a miss."""
    return cache_get_string(plan_cache_id(source_cid, params))
|
|
|
|
|
|
def plan_get_path(source_cid: str, params: Dict[str, Any] = None) -> Optional[Path]:
    """Return the path of a cached plan's content, or None on a miss."""
    return cache_get_path(plan_cache_id(source_cid, params))
|
|
|
|
|
|
# =============================================================================
|
|
# Cache Listing
|
|
# =============================================================================
|
|
|
|
def list_cache(verbose: bool = False) -> Dict[str, Any]:
    """List all cached items.

    Walks the refs/ and content/ trees and returns a dict with keys:
    "refs" (one entry per ref file, including the CID it points to and,
    when present, the referenced content's size), "content" (one entry
    per stored blob), and "summary" (counts and total content size).

    NOTE(review): `verbose` is currently unused here; only
    print_cache_listing() acts on it.
    """
    from datetime import datetime

    # NOTE(review): cache_dir is not read below — kept, presumably, for
    # get_cache_dir()'s mkdir side effect; confirm before removing.
    cache_dir = get_cache_dir()
    refs_dir = get_refs_dir()
    content_dir = get_content_dir()

    def format_size(size):
        # Human-readable size using decimal (SI) thresholds.
        if size >= 1_000_000_000:
            return f"{size / 1_000_000_000:.1f}GB"
        elif size >= 1_000_000:
            return f"{size / 1_000_000:.1f}MB"
        elif size >= 1000:
            return f"{size / 1000:.1f}KB"
        else:
            return f"{size}B"

    def get_file_info(path: Path) -> Dict:
        # Per-file metadata shared by ref and content entries.
        stat = path.stat()
        return {
            "path": path,
            "name": path.name,
            "size": stat.st_size,
            "size_str": format_size(stat.st_size),
            "mtime": datetime.fromtimestamp(stat.st_mtime),
        }

    result = {
        "refs": [],
        "content": [],
        "summary": {"total_items": 0, "total_size": 0},
    }

    # Refs
    if refs_dir.exists():
        for f in sorted(refs_dir.iterdir()):
            if f.is_file():
                info = get_file_info(f)
                # Ref filename is the cache_id; file body is the CID.
                info["cache_id"] = f.name
                info["cid"] = f.read_text().strip()
                # Try to determine type from content
                cid = info["cid"]
                content_path = content_exists_by_cid(cid)
                if content_path:
                    # Size of the blob the ref points to (when it exists).
                    info["content_size"] = content_path.stat().st_size
                    info["content_size_str"] = format_size(info["content_size"])
                result["refs"].append(info)

    # Content
    if content_dir.exists():
        # Blobs are sharded into two-character subdirectories by CID prefix.
        for subdir in sorted(content_dir.iterdir()):
            if subdir.is_dir():
                for f in sorted(subdir.iterdir()):
                    if f.is_file():
                        info = get_file_info(f)
                        info["cid"] = f.name
                        result["content"].append(info)

    # Summary
    result["summary"]["total_refs"] = len(result["refs"])
    result["summary"]["total_content"] = len(result["content"])
    result["summary"]["total_size"] = sum(i["size"] for i in result["content"])
    result["summary"]["total_size_str"] = format_size(result["summary"]["total_size"])

    return result
|
|
|
|
|
|
def print_cache_listing(verbose: bool = False):
    """Print cache listing to stdout.

    Shows up to 20 refs and 20 content entries (with truncation notes),
    then a summary; verbose=True appends a hint for clearing the cache.
    """
    info = list_cache(verbose)
    cache_dir = get_cache_dir()

    print(f"\nCache directory: {cache_dir}\n")

    # Refs summary
    if info["refs"]:
        print(f"=== Refs ({len(info['refs'])}) ===")
        for ref in info["refs"][:20]:  # Show first 20
            # Suffix with the referenced blob's size when it was resolvable.
            content_info = f" → {ref.get('content_size_str', '?')}" if 'content_size_str' in ref else ""
            print(f" {ref['cache_id'][:16]}... → {ref['cid'][:16]}...{content_info}")
        if len(info["refs"]) > 20:
            print(f" ... and {len(info['refs']) - 20} more")
        print()

    # Content by type
    if info["content"]:
        # Group by first 2 chars (subdirectory)
        print(f"=== Content ({len(info['content'])} items, {info['summary']['total_size_str']}) ===")
        for item in info["content"][:20]:
            print(f" {item['cid'][:16]}... {item['size_str']:>8} {item['mtime'].strftime('%Y-%m-%d %H:%M')}")
        if len(info["content"]) > 20:
            print(f" ... and {len(info['content']) - 20} more")
        print()

    print(f"=== Summary ===")
    print(f" Refs: {info['summary']['total_refs']}")
    print(f" Content: {info['summary']['total_content']} ({info['summary']['total_size_str']})")

    if verbose:
        print(f"\nTo clear cache: rm -rf {cache_dir}/*")
|
|
|
|
|
|
if __name__ == "__main__":
    import sys
    # CLI entry point: print the cache listing; -v/--verbose adds hints.
    verbose = "-v" in sys.argv or "--verbose" in sys.argv
    print_cache_listing(verbose)
|