Refactor storage: remove Redis duplication, use proper data tiers

- Recipes: Now content-addressed only (cache + IPFS), removed Redis storage
- Runs: Completed runs stored in PostgreSQL, Redis only for task_id mapping
- Add list_runs_by_actor() to database.py for paginated run queries
- Add list_by_type() to cache_manager for filtering by node_type
- Fix upload endpoint to return size and filename fields
- Fix recipe run endpoint with proper DAG input binding
- Fix get_run_service() dependency to pass database module

Storage architecture:
- Redis: Ephemeral only (sessions, task mappings with TTL)
- PostgreSQL: Permanent records (completed runs, metadata)
- Cache: Content-addressed files (recipes, media, outputs)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-01-11 14:05:31 +00:00
parent 8591faf0fc
commit 854396680f
8 changed files with 965 additions and 264 deletions

View File

@@ -2,10 +2,81 @@
Cache Service - business logic for cache and media management.
"""
import asyncio
import json
import os
import subprocess
from pathlib import Path
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Tuple
from artdag_common.utils.media import detect_media_type, get_mime_type
import httpx
def detect_media_type(cache_path: Path) -> str:
    """Classify a file as "image", "video", or "audio" from its magic bytes.

    Reads the first 32 bytes and compares them against well-known container
    signatures. Returns "unknown" when the file cannot be read or no
    signature matches.
    """
    try:
        with open(cache_path, "rb") as f:
            header = f.read(32)
    except OSError:
        # Missing/unreadable file: report unknown rather than raise.
        return "unknown"
    # Video signatures
    if header[:4] == b'\x1a\x45\xdf\xa3':  # EBML magic: WebM / Matroska
        return "video"
    if len(header) > 8 and header[4:8] == b'ftyp':  # ISO BMFF: MP4 / MOV
        # NOTE(review): M4A audio also starts with an ftyp box and will be
        # classified as video here — confirm that is acceptable upstream.
        return "video"
    if header[:4] == b'RIFF' and len(header) > 12 and header[8:12] == b'AVI ':  # AVI
        return "video"
    # Image signatures
    if header[:8] == b'\x89PNG\r\n\x1a\n':  # PNG
        return "image"
    if header[:2] == b'\xff\xd8':  # JPEG SOI marker
        return "image"
    if header[:6] in (b'GIF87a', b'GIF89a'):  # GIF
        return "image"
    if header[:4] == b'RIFF' and len(header) > 12 and header[8:12] == b'WEBP':  # WebP
        return "image"
    # Audio signatures
    if header[:4] == b'RIFF' and len(header) > 12 and header[8:12] == b'WAVE':  # WAV
        return "audio"
    # MP3: ID3v2 tag, or a bare MPEG audio frame sync. Fix: also accept the
    # MPEG-2/2.5 Layer III sync bytes (0xFFF3 / 0xFFF2), not just MPEG-1's
    # 0xFFFB — untagged low-bitrate MP3s previously came back "unknown".
    if header[:3] == b'ID3' or header[:2] in (b'\xff\xfb', b'\xff\xf3', b'\xff\xf2'):
        return "audio"
    if header[:4] == b'fLaC':  # FLAC
        return "audio"
    return "unknown"
def get_mime_type(path: Path) -> str:
    """Return a MIME type for *path* derived from its magic bytes.

    Falls back to a per-category default when the exact format cannot be
    read ("video/mp4", "image/jpeg", "audio/mpeg") and to
    "application/octet-stream" for anything unrecognised.
    """
    category = detect_media_type(path)

    if category == "video":
        # Matroska/WebM gets its own type; everything else is served as MP4.
        try:
            with open(path, "rb") as fh:
                magic = fh.read(12)
        except Exception:
            return "video/mp4"
        return "video/x-matroska" if magic[:4] == b'\x1a\x45\xdf\xa3' else "video/mp4"

    if category == "image":
        try:
            with open(path, "rb") as fh:
                magic = fh.read(8)
        except Exception:
            return "image/jpeg"
        if magic[:8] == b'\x89PNG\r\n\x1a\n':
            return "image/png"
        if magic[:6] in (b'GIF87a', b'GIF89a'):
            return "image/gif"
        # JPEG, WebP and anything else in the image bucket defaults to JPEG.
        return "image/jpeg"

    if category == "audio":
        return "audio/mpeg"

    return "application/octet-stream"
class CacheService:
@@ -15,18 +86,24 @@ class CacheService:
Handles content retrieval, metadata, and media type detection.
"""
def __init__(self, cache_manager, database):
self.cache = cache_manager
def __init__(self, database, cache_manager):
self.db = database
self.cache = cache_manager
self.cache_dir = Path(os.environ.get("CACHE_DIR", "/tmp/artdag-cache"))
async def get_item(self, content_hash: str) -> Optional[Dict[str, Any]]:
"""Get cached item by content hash."""
path = self.cache.get_by_content_hash(content_hash)
async def get_cache_item(self, content_hash: str) -> Optional[Dict[str, Any]]:
"""Get cached item with full metadata for display."""
# Check if content exists
if not self.cache.has_content(content_hash):
return None
path = self.cache.get_content_path(content_hash)
if not path or not path.exists():
return None
# Get metadata from database
meta = await self.db.get_cache_item(content_hash)
meta = await self.db.load_item_metadata(content_hash, None)
cache_item = await self.db.get_cache_item(content_hash)
media_type = detect_media_type(path)
mime_type = get_mime_type(path)
@@ -38,76 +115,370 @@ class CacheService:
"media_type": media_type,
"mime_type": mime_type,
"size": size,
"name": meta.get("name") if meta else None,
"description": meta.get("description") if meta else None,
"tags": meta.get("tags", []) if meta else [],
"ipfs_cid": meta.get("ipfs_cid") if meta else None,
"ipfs_cid": cache_item.get("ipfs_cid") if cache_item else None,
"meta": meta,
}
async def get_path(self, content_hash: str) -> Optional[Path]:
    """Look up the local filesystem path holding this content hash."""
    cached_path = self.cache.get_by_content_hash(content_hash)
    return cached_path
async def check_access(self, content_hash: str, actor_id: str, username: str) -> bool:
    """Return True when the content hash is among those the user may see."""
    accessible = await self._get_user_cache_hashes(username, actor_id)
    return content_hash in accessible
async def list_items(
self,
actor_id: str = None,
media_type: str = None,
page: int = 1,
limit: int = 20,
) -> Dict[str, Any]:
"""List cached items with filters and pagination."""
# Get items from database
items = await self.db.list_cache_items(
actor_id=actor_id,
media_type=media_type,
offset=(page - 1) * limit,
limit=limit,
)
async def _get_user_cache_hashes(self, username: str, actor_id: Optional[str] = None) -> set:
"""Get all cache hashes owned by or associated with a user."""
match_values = [username]
if actor_id:
match_values.append(actor_id)
total = await self.db.count_cache_items(actor_id=actor_id, media_type=media_type)
hashes = set()
return {
"items": items,
"pagination": {
"page": page,
"limit": limit,
"total": total,
"has_more": page * limit < total,
}
}
# Query database for items owned by user
if actor_id:
try:
db_items = await self.db.get_user_items(actor_id)
for item in db_items:
hashes.add(item["content_hash"])
except Exception:
pass
# Legacy: Files uploaded by user (JSON metadata)
if self.cache_dir.exists():
for f in self.cache_dir.iterdir():
if f.name.endswith('.meta.json'):
try:
with open(f, 'r') as mf:
meta = json.load(mf)
if meta.get("uploader") in match_values:
hashes.add(f.name.replace('.meta.json', ''))
except Exception:
pass
# Files from user's runs (inputs and outputs)
runs = await self._list_user_runs(username, actor_id)
for run in runs:
inputs = run.get("inputs", [])
if isinstance(inputs, dict):
inputs = list(inputs.values())
hashes.update(inputs)
if run.get("output_hash"):
hashes.add(run["output_hash"])
return hashes
async def _list_user_runs(self, username: str, actor_id: Optional[str]) -> List[Dict]:
    """Collect run records from Redis that belong to the given user.

    Scans every "artdag:run:*" key and keeps runs whose actor_id or
    username field matches either supplied identifier.
    """
    from ..dependencies import get_redis_client
    import json

    redis = get_redis_client()
    identities = (username, actor_id)
    matched: List[Dict] = []
    cursor = 0
    first_pass = True
    # SCAN terminates when the returned cursor wraps back to 0.
    while first_pass or cursor != 0:
        first_pass = False
        cursor, keys = redis.scan(cursor=cursor, match="artdag:run:*", count=100)
        for key in keys:
            raw = redis.get(key)
            if not raw:
                continue
            record = json.loads(raw)
            if record.get("actor_id") in identities or record.get("username") in identities:
                matched.append(record)
    return matched
async def get_raw_file(self, content_hash: str) -> Tuple[Optional[Path], Optional[str], Optional[str]]:
    """Resolve content for raw download.

    Returns a (path, mime_type, filename) triple, or (None, None, None)
    when the content is not present in the cache. The filename extension
    is inferred from the file's magic bytes.
    """
    if not self.cache.has_content(content_hash):
        return None, None, None
    path = self.cache.get_content_path(content_hash)
    if not path or not path.exists():
        return None, None, None

    media_type = detect_media_type(path)
    mime = get_mime_type(path)

    def _sniff(n: int) -> bytes:
        # Re-read the leading bytes to pick a precise extension.
        with open(path, "rb") as fh:
            return fh.read(n)

    extension = "bin"
    if media_type == "video":
        try:
            extension = "mkv" if _sniff(12)[:4] == b'\x1a\x45\xdf\xa3' else "mp4"
        except Exception:
            extension = "mp4"
    elif media_type == "image":
        try:
            extension = "png" if _sniff(8)[:8] == b'\x89PNG\r\n\x1a\n' else "jpg"
        except Exception:
            extension = "jpg"

    return path, mime, f"{content_hash}.{extension}"
async def get_as_mp4(self, content_hash: str) -> Tuple[Optional[Path], Optional[str]]:
    """Return an MP4-playable path for cached video content.

    Resolution order:
      1. fail fast if the hash is not cached or is not a video;
      2. reuse a previously transcoded "<hash>.mp4" from the cache dir;
      3. return the original file when ffprobe reports MP4/MOV;
      4. otherwise transcode with ffmpeg (H.264/AAC, faststart) and cache
         the result.

    Returns (path, None) on success or (None, error_message) on failure.

    NOTE(review): subprocess.run blocks the event loop for up to the
    timeout; consider asyncio.to_thread for the transcode step.
    """
    if not self.cache.has_content(content_hash):
        return None, f"Content {content_hash} not in cache"
    path = self.cache.get_content_path(content_hash)
    if not path or not path.exists():
        return None, f"Content {content_hash} not in cache"

    media_type = detect_media_type(path)
    if media_type != "video":
        return None, "Content is not a video"

    # Reuse a transcode produced by an earlier request.
    mp4_path = self.cache_dir / f"{content_hash}.mp4"
    if mp4_path.exists():
        return mp4_path, None

    # If the container is already MP4/MOV, serve the original file as-is.
    try:
        result = subprocess.run(
            ["ffprobe", "-v", "error", "-select_streams", "v:0",
             "-show_entries", "format=format_name", "-of", "csv=p=0", str(path)],
            capture_output=True, text=True, timeout=10
        )
        if "mp4" in result.stdout.lower() or "mov" in result.stdout.lower():
            return path, None
    except Exception:
        # ffprobe missing or failed: fall through and let ffmpeg decide.
        pass

    # Transcode into a temporary name, then rename, so readers never see a
    # partially written .mp4.
    transcode_path = self.cache_dir / f"{content_hash}.transcoding.mp4"
    try:
        result = subprocess.run(
            ["ffmpeg", "-y", "-i", str(path),
             "-c:v", "libx264", "-preset", "fast", "-crf", "23",
             "-c:a", "aac", "-b:a", "128k",
             "-movflags", "+faststart",
             str(transcode_path)],
            capture_output=True, text=True, timeout=600
        )
        if result.returncode != 0:
            # Fix: remove the partial output; previously a failed transcode
            # left a stale .transcoding.mp4 on disk (only the timeout and
            # exception paths cleaned up).
            if transcode_path.exists():
                transcode_path.unlink()
            return None, f"Transcoding failed: {result.stderr[:200]}"
        transcode_path.rename(mp4_path)
        return mp4_path, None
    except subprocess.TimeoutExpired:
        if transcode_path.exists():
            transcode_path.unlink()
        return None, "Transcoding timed out"
    except Exception as e:
        if transcode_path.exists():
            transcode_path.unlink()
        return None, f"Transcoding failed: {e}"
async def get_metadata(self, content_hash: str, actor_id: str) -> Optional[Dict[str, Any]]:
    """Fetch stored metadata for cached content, or None when not cached."""
    if self.cache.has_content(content_hash):
        return await self.db.load_item_metadata(content_hash, actor_id)
    return None
async def update_metadata(
self,
content_hash: str,
name: str = None,
actor_id: str,
title: str = None,
description: str = None,
tags: List[str] = None,
) -> bool:
"""Update item metadata."""
return await self.db.update_cache_metadata(
custom: Dict[str, Any] = None,
) -> Tuple[bool, Optional[str]]:
"""Update content metadata. Returns (success, error)."""
if not self.cache.has_content(content_hash):
return False, "Content not found"
# Build update dict
updates = {}
if title is not None:
updates["title"] = title
if description is not None:
updates["description"] = description
if tags is not None:
updates["tags"] = tags
if custom is not None:
updates["custom"] = custom
try:
await self.db.update_item_metadata(content_hash, actor_id, **updates)
return True, None
except Exception as e:
return False, str(e)
async def publish_to_l2(
self,
content_hash: str,
actor_id: str,
l2_server: str,
auth_token: str,
) -> Tuple[Optional[str], Optional[str]]:
"""Publish content to L2 and IPFS. Returns (ipfs_cid, error)."""
if not self.cache.has_content(content_hash):
return None, "Content not found"
# Get IPFS CID
cache_item = await self.db.get_cache_item(content_hash)
ipfs_cid = cache_item.get("ipfs_cid") if cache_item else None
# Get metadata for origin info
meta = await self.db.load_item_metadata(content_hash, actor_id)
origin = meta.get("origin") if meta else None
if not origin or "type" not in origin:
return None, "Origin must be set before publishing"
if not auth_token:
return None, "Authentication token required"
# Call L2 publish-cache endpoint
try:
async with httpx.AsyncClient(timeout=30) as client:
resp = await client.post(
f"{l2_server}/assets/publish-cache",
headers={"Authorization": f"Bearer {auth_token}"},
json={
"content_hash": content_hash,
"ipfs_cid": ipfs_cid,
"asset_name": meta.get("title") or content_hash[:16],
"asset_type": detect_media_type(self.cache.get_content_path(content_hash)),
"origin": origin,
"description": meta.get("description"),
"tags": meta.get("tags", []),
}
)
resp.raise_for_status()
l2_result = resp.json()
except httpx.HTTPStatusError as e:
error_detail = str(e)
try:
error_detail = e.response.json().get("detail", str(e))
except Exception:
pass
return None, f"L2 publish failed: {error_detail}"
except Exception as e:
return None, f"L2 publish failed: {e}"
# Update local metadata with publish status
await self.db.save_l2_share(
content_hash=content_hash,
name=name,
description=description,
tags=tags,
actor_id=actor_id,
l2_server=l2_server,
asset_name=meta.get("title") or content_hash[:16],
content_type=detect_media_type(self.cache.get_content_path(content_hash))
)
await self.db.update_item_metadata(
content_hash=content_hash,
actor_id=actor_id,
pinned=True,
pin_reason="published"
)
async def delete_item(self, content_hash: str) -> bool:
"""Delete a cached item."""
path = self.cache.get_by_content_hash(content_hash)
if path and path.exists():
path.unlink()
return l2_result.get("ipfs_cid") or ipfs_cid, None
# Remove from database
await self.db.delete_cache_item(content_hash)
return True
async def delete_content(self, content_hash: str, actor_id: str) -> Tuple[bool, Optional[str]]:
"""Delete content from cache. Returns (success, error)."""
if not self.cache.has_content(content_hash):
return False, "Content not found"
def has_content(self, content_hash: str) -> bool:
"""Check if content exists in cache."""
return self.cache.has_content(content_hash)
# Check if pinned
meta = await self.db.load_item_metadata(content_hash, actor_id)
if meta and meta.get("pinned"):
pin_reason = meta.get("pin_reason", "unknown")
return False, f"Cannot discard pinned item (reason: {pin_reason})"
def get_ipfs_cid(self, content_hash: str) -> Optional[str]:
"""Get IPFS CID for cached content."""
return self.cache.get_ipfs_cid(content_hash)
# Check deletion rules via cache_manager
can_delete, reason = self.cache.can_delete(content_hash)
if not can_delete:
return False, f"Cannot discard: {reason}"
# Delete via cache_manager
success, msg = self.cache.delete_by_content_hash(content_hash)
# Clean up legacy metadata files
meta_path = self.cache_dir / f"{content_hash}.meta.json"
if meta_path.exists():
meta_path.unlink()
mp4_path = self.cache_dir / f"{content_hash}.mp4"
if mp4_path.exists():
mp4_path.unlink()
return True, None
async def import_from_ipfs(self, ipfs_cid: str, actor_id: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch a file from IPFS by CID and register it in the local cache.

    Returns (content_hash, None) on success or (None, error_message).
    """
    try:
        import ipfs_client

        # Stage the download under the legacy directory before ingesting.
        staging_dir = self.cache_dir / "legacy"
        staging_dir.mkdir(parents=True, exist_ok=True)
        staged_file = staging_dir / f"import-{ipfs_cid[:16]}"

        if not ipfs_client.get_file(ipfs_cid, str(staged_file)):
            return None, f"Could not fetch CID {ipfs_cid} from IPFS"

        # Move the staged file into content-addressed storage.
        cached, _ = self.cache.put(staged_file, node_type="import", move=True)
        digest = cached.content_hash

        # Record the cache row plus display metadata.
        await self.db.create_cache_item(digest, ipfs_cid)
        await self.db.save_item_metadata(
            content_hash=digest,
            actor_id=actor_id,
            item_type="media",
            filename=f"ipfs-{ipfs_cid[:16]}"
        )
        return digest, None
    except Exception as e:
        return None, f"Import failed: {e}"
async def upload_content(
    self,
    content: bytes,
    filename: str,
    actor_id: str,
) -> Tuple[Optional[str], Optional[str]]:
    """Store raw uploaded bytes in the cache and record their metadata.

    Returns (content_hash, None) on success or (None, error_message).
    """
    import tempfile
    try:
        # Spill the payload to a temp file so the cache can ingest a path;
        # delete=False because cache.put(move=True) consumes the file.
        with tempfile.NamedTemporaryFile(delete=False) as spool:
            spool.write(content)
        spooled_path = Path(spool.name)

        cached, ipfs_cid = self.cache.put(spooled_path, node_type="upload", move=True)
        digest = cached.content_hash

        # Record the cache row plus display metadata.
        await self.db.create_cache_item(digest, ipfs_cid)
        await self.db.save_item_metadata(
            content_hash=digest,
            actor_id=actor_id,
            item_type="media",
            filename=filename
        )
        return digest, None
    except Exception as e:
        return None, f"Upload failed: {e}"
async def list_media(
self,
@@ -118,12 +489,20 @@ class CacheService:
media_type: str = None,
) -> List[Dict[str, Any]]:
"""List media items in cache."""
# Use list_items internally, converting offset to page
page = (offset // limit) + 1 if limit > 0 else 1
result = await self.list_items(
# Get items from database
items = await self.db.list_cache_items(
actor_id=actor_id or username,
media_type=media_type,
page=page,
offset=offset,
limit=limit,
)
return result.get("items", [])
return items
# Legacy compatibility methods
def has_content(self, content_hash: str) -> bool:
    """Legacy shim: delegate the existence check to the cache manager."""
    exists = self.cache.has_content(content_hash)
    return exists
def get_ipfs_cid(self, content_hash: str) -> Optional[str]:
    """Legacy shim: delegate the CID lookup to the cache manager."""
    cid = self.cache.get_ipfs_cid(content_hash)
    return cid