Refactor storage: remove Redis duplication, use proper data tiers

- Recipes: Now content-addressed only (cache + IPFS), removed Redis storage
- Runs: Completed runs stored in PostgreSQL, Redis only for task_id mapping
- Add list_runs_by_actor() to database.py for paginated run queries
- Add list_by_type() to cache_manager for filtering by node_type
- Fix upload endpoint to return size and filename fields
- Fix recipe run endpoint with proper DAG input binding
- Fix get_run_service() dependency to pass database module

Storage architecture:
- Redis: Ephemeral only (sessions, task mappings with TTL)
- PostgreSQL: Permanent records (completed runs, metadata)
- Cache: Content-addressed files (recipes, media, outputs)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
giles
2026-01-11 14:05:31 +00:00
parent 8591faf0fc
commit 854396680f
8 changed files with 965 additions and 264 deletions

View File

@@ -2,10 +2,81 @@
Cache Service - business logic for cache and media management.
"""
import asyncio
import json
import os
import subprocess
from pathlib import Path
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Tuple
from artdag_common.utils.media import detect_media_type, get_mime_type
import httpx
def detect_media_type(cache_path: Path) -> str:
    """Classify a file as "image", "video", or "audio" from its magic bytes.

    Reads the first 32 bytes and compares them against well-known container
    signatures. Returns "unknown" when the file cannot be read or no
    signature matches.
    """
    try:
        with open(cache_path, "rb") as f:
            header = f.read(32)
    except OSError:
        # Missing/unreadable file: report unknown rather than raise.
        return "unknown"
    # Video signatures
    if header[:4] == b'\x1a\x45\xdf\xa3':  # EBML magic: WebM / Matroska
        return "video"
    if len(header) > 8 and header[4:8] == b'ftyp':  # ISO BMFF: MP4 / MOV
        # NOTE(review): M4A audio also starts with an ftyp box and will be
        # classified as video here — confirm that is acceptable upstream.
        return "video"
    if header[:4] == b'RIFF' and len(header) > 12 and header[8:12] == b'AVI ':  # AVI
        return "video"
    # Image signatures
    if header[:8] == b'\x89PNG\r\n\x1a\n':  # PNG
        return "image"
    if header[:2] == b'\xff\xd8':  # JPEG SOI marker
        return "image"
    if header[:6] in (b'GIF87a', b'GIF89a'):  # GIF
        return "image"
    if header[:4] == b'RIFF' and len(header) > 12 and header[8:12] == b'WEBP':  # WebP
        return "image"
    # Audio signatures
    if header[:4] == b'RIFF' and len(header) > 12 and header[8:12] == b'WAVE':  # WAV
        return "audio"
    # MP3: ID3v2 tag, or a bare MPEG audio frame sync. Fix: also accept the
    # MPEG-2/2.5 Layer III sync bytes (0xFFF3 / 0xFFF2), not just MPEG-1's
    # 0xFFFB — untagged low-bitrate MP3s previously came back "unknown".
    if header[:3] == b'ID3' or header[:2] in (b'\xff\xfb', b'\xff\xf3', b'\xff\xf2'):
        return "audio"
    if header[:4] == b'fLaC':  # FLAC
        return "audio"
    return "unknown"
def get_mime_type(path: Path) -> str:
    """Return a MIME type for *path* derived from its magic bytes.

    Falls back to a per-category default when the exact format cannot be
    read ("video/mp4", "image/jpeg", "audio/mpeg") and to
    "application/octet-stream" for anything unrecognised.
    """
    category = detect_media_type(path)

    if category == "video":
        # Matroska/WebM gets its own type; everything else is served as MP4.
        try:
            with open(path, "rb") as fh:
                magic = fh.read(12)
        except Exception:
            return "video/mp4"
        return "video/x-matroska" if magic[:4] == b'\x1a\x45\xdf\xa3' else "video/mp4"

    if category == "image":
        try:
            with open(path, "rb") as fh:
                magic = fh.read(8)
        except Exception:
            return "image/jpeg"
        if magic[:8] == b'\x89PNG\r\n\x1a\n':
            return "image/png"
        if magic[:6] in (b'GIF87a', b'GIF89a'):
            return "image/gif"
        # JPEG, WebP and anything else in the image bucket defaults to JPEG.
        return "image/jpeg"

    if category == "audio":
        return "audio/mpeg"

    return "application/octet-stream"
class CacheService:
@@ -15,18 +86,24 @@ class CacheService:
Handles content retrieval, metadata, and media type detection.
"""
def __init__(self, cache_manager, database):
self.cache = cache_manager
def __init__(self, database, cache_manager):
self.db = database
self.cache = cache_manager
self.cache_dir = Path(os.environ.get("CACHE_DIR", "/tmp/artdag-cache"))
async def get_item(self, content_hash: str) -> Optional[Dict[str, Any]]:
"""Get cached item by content hash."""
path = self.cache.get_by_content_hash(content_hash)
async def get_cache_item(self, content_hash: str) -> Optional[Dict[str, Any]]:
"""Get cached item with full metadata for display."""
# Check if content exists
if not self.cache.has_content(content_hash):
return None
path = self.cache.get_content_path(content_hash)
if not path or not path.exists():
return None
# Get metadata from database
meta = await self.db.get_cache_item(content_hash)
meta = await self.db.load_item_metadata(content_hash, None)
cache_item = await self.db.get_cache_item(content_hash)
media_type = detect_media_type(path)
mime_type = get_mime_type(path)
@@ -38,76 +115,370 @@ class CacheService:
"media_type": media_type,
"mime_type": mime_type,
"size": size,
"name": meta.get("name") if meta else None,
"description": meta.get("description") if meta else None,
"tags": meta.get("tags", []) if meta else [],
"ipfs_cid": meta.get("ipfs_cid") if meta else None,
"ipfs_cid": cache_item.get("ipfs_cid") if cache_item else None,
"meta": meta,
}
async def get_path(self, content_hash: str) -> Optional[Path]:
    """Look up the local filesystem path holding this content hash."""
    cached_path = self.cache.get_by_content_hash(content_hash)
    return cached_path
async def check_access(self, content_hash: str, actor_id: str, username: str) -> bool:
    """Return True when the content hash is among those the user may see."""
    accessible = await self._get_user_cache_hashes(username, actor_id)
    return content_hash in accessible
async def list_items(
self,
actor_id: str = None,
media_type: str = None,
page: int = 1,
limit: int = 20,
) -> Dict[str, Any]:
"""List cached items with filters and pagination."""
# Get items from database
items = await self.db.list_cache_items(
actor_id=actor_id,
media_type=media_type,
offset=(page - 1) * limit,
limit=limit,
)
async def _get_user_cache_hashes(self, username: str, actor_id: Optional[str] = None) -> set:
"""Get all cache hashes owned by or associated with a user."""
match_values = [username]
if actor_id:
match_values.append(actor_id)
total = await self.db.count_cache_items(actor_id=actor_id, media_type=media_type)
hashes = set()
return {
"items": items,
"pagination": {
"page": page,
"limit": limit,
"total": total,
"has_more": page * limit < total,
}
}
# Query database for items owned by user
if actor_id:
try:
db_items = await self.db.get_user_items(actor_id)
for item in db_items:
hashes.add(item["content_hash"])
except Exception:
pass
# Legacy: Files uploaded by user (JSON metadata)
if self.cache_dir.exists():
for f in self.cache_dir.iterdir():
if f.name.endswith('.meta.json'):
try:
with open(f, 'r') as mf:
meta = json.load(mf)
if meta.get("uploader") in match_values:
hashes.add(f.name.replace('.meta.json', ''))
except Exception:
pass
# Files from user's runs (inputs and outputs)
runs = await self._list_user_runs(username, actor_id)
for run in runs:
inputs = run.get("inputs", [])
if isinstance(inputs, dict):
inputs = list(inputs.values())
hashes.update(inputs)
if run.get("output_hash"):
hashes.add(run["output_hash"])
return hashes
async def _list_user_runs(self, username: str, actor_id: Optional[str]) -> List[Dict]:
    """Collect run records from Redis that belong to the given user.

    Scans every "artdag:run:*" key and keeps runs whose actor_id or
    username field matches either supplied identifier.
    """
    from ..dependencies import get_redis_client
    import json

    redis = get_redis_client()
    identities = (username, actor_id)
    matched: List[Dict] = []
    cursor = 0
    first_pass = True
    # SCAN terminates when the returned cursor wraps back to 0.
    while first_pass or cursor != 0:
        first_pass = False
        cursor, keys = redis.scan(cursor=cursor, match="artdag:run:*", count=100)
        for key in keys:
            raw = redis.get(key)
            if not raw:
                continue
            record = json.loads(raw)
            if record.get("actor_id") in identities or record.get("username") in identities:
                matched.append(record)
    return matched
async def get_raw_file(self, content_hash: str) -> Tuple[Optional[Path], Optional[str], Optional[str]]:
    """Resolve content for raw download.

    Returns a (path, mime_type, filename) triple, or (None, None, None)
    when the content is not present in the cache. The filename extension
    is inferred from the file's magic bytes.
    """
    if not self.cache.has_content(content_hash):
        return None, None, None
    path = self.cache.get_content_path(content_hash)
    if not path or not path.exists():
        return None, None, None

    media_type = detect_media_type(path)
    mime = get_mime_type(path)

    def _sniff(n: int) -> bytes:
        # Re-read the leading bytes to pick a precise extension.
        with open(path, "rb") as fh:
            return fh.read(n)

    extension = "bin"
    if media_type == "video":
        try:
            extension = "mkv" if _sniff(12)[:4] == b'\x1a\x45\xdf\xa3' else "mp4"
        except Exception:
            extension = "mp4"
    elif media_type == "image":
        try:
            extension = "png" if _sniff(8)[:8] == b'\x89PNG\r\n\x1a\n' else "jpg"
        except Exception:
            extension = "jpg"

    return path, mime, f"{content_hash}.{extension}"
async def get_as_mp4(self, content_hash: str) -> Tuple[Optional[Path], Optional[str]]:
    """Return an MP4-playable path for cached video content.

    Resolution order:
      1. fail fast if the hash is not cached or is not a video;
      2. reuse a previously transcoded "<hash>.mp4" from the cache dir;
      3. return the original file when ffprobe reports MP4/MOV;
      4. otherwise transcode with ffmpeg (H.264/AAC, faststart) and cache
         the result.

    Returns (path, None) on success or (None, error_message) on failure.

    NOTE(review): subprocess.run blocks the event loop for up to the
    timeout; consider asyncio.to_thread for the transcode step.
    """
    if not self.cache.has_content(content_hash):
        return None, f"Content {content_hash} not in cache"
    path = self.cache.get_content_path(content_hash)
    if not path or not path.exists():
        return None, f"Content {content_hash} not in cache"

    media_type = detect_media_type(path)
    if media_type != "video":
        return None, "Content is not a video"

    # Reuse a transcode produced by an earlier request.
    mp4_path = self.cache_dir / f"{content_hash}.mp4"
    if mp4_path.exists():
        return mp4_path, None

    # If the container is already MP4/MOV, serve the original file as-is.
    try:
        result = subprocess.run(
            ["ffprobe", "-v", "error", "-select_streams", "v:0",
             "-show_entries", "format=format_name", "-of", "csv=p=0", str(path)],
            capture_output=True, text=True, timeout=10
        )
        if "mp4" in result.stdout.lower() or "mov" in result.stdout.lower():
            return path, None
    except Exception:
        # ffprobe missing or failed: fall through and let ffmpeg decide.
        pass

    # Transcode into a temporary name, then rename, so readers never see a
    # partially written .mp4.
    transcode_path = self.cache_dir / f"{content_hash}.transcoding.mp4"
    try:
        result = subprocess.run(
            ["ffmpeg", "-y", "-i", str(path),
             "-c:v", "libx264", "-preset", "fast", "-crf", "23",
             "-c:a", "aac", "-b:a", "128k",
             "-movflags", "+faststart",
             str(transcode_path)],
            capture_output=True, text=True, timeout=600
        )
        if result.returncode != 0:
            # Fix: remove the partial output; previously a failed transcode
            # left a stale .transcoding.mp4 on disk (only the timeout and
            # exception paths cleaned up).
            if transcode_path.exists():
                transcode_path.unlink()
            return None, f"Transcoding failed: {result.stderr[:200]}"
        transcode_path.rename(mp4_path)
        return mp4_path, None
    except subprocess.TimeoutExpired:
        if transcode_path.exists():
            transcode_path.unlink()
        return None, "Transcoding timed out"
    except Exception as e:
        if transcode_path.exists():
            transcode_path.unlink()
        return None, f"Transcoding failed: {e}"
async def get_metadata(self, content_hash: str, actor_id: str) -> Optional[Dict[str, Any]]:
    """Fetch stored metadata for cached content, or None when not cached."""
    if self.cache.has_content(content_hash):
        return await self.db.load_item_metadata(content_hash, actor_id)
    return None
async def update_metadata(
self,
content_hash: str,
name: str = None,
actor_id: str,
title: str = None,
description: str = None,
tags: List[str] = None,
) -> bool:
"""Update item metadata."""
return await self.db.update_cache_metadata(
custom: Dict[str, Any] = None,
) -> Tuple[bool, Optional[str]]:
"""Update content metadata. Returns (success, error)."""
if not self.cache.has_content(content_hash):
return False, "Content not found"
# Build update dict
updates = {}
if title is not None:
updates["title"] = title
if description is not None:
updates["description"] = description
if tags is not None:
updates["tags"] = tags
if custom is not None:
updates["custom"] = custom
try:
await self.db.update_item_metadata(content_hash, actor_id, **updates)
return True, None
except Exception as e:
return False, str(e)
async def publish_to_l2(
self,
content_hash: str,
actor_id: str,
l2_server: str,
auth_token: str,
) -> Tuple[Optional[str], Optional[str]]:
"""Publish content to L2 and IPFS. Returns (ipfs_cid, error)."""
if not self.cache.has_content(content_hash):
return None, "Content not found"
# Get IPFS CID
cache_item = await self.db.get_cache_item(content_hash)
ipfs_cid = cache_item.get("ipfs_cid") if cache_item else None
# Get metadata for origin info
meta = await self.db.load_item_metadata(content_hash, actor_id)
origin = meta.get("origin") if meta else None
if not origin or "type" not in origin:
return None, "Origin must be set before publishing"
if not auth_token:
return None, "Authentication token required"
# Call L2 publish-cache endpoint
try:
async with httpx.AsyncClient(timeout=30) as client:
resp = await client.post(
f"{l2_server}/assets/publish-cache",
headers={"Authorization": f"Bearer {auth_token}"},
json={
"content_hash": content_hash,
"ipfs_cid": ipfs_cid,
"asset_name": meta.get("title") or content_hash[:16],
"asset_type": detect_media_type(self.cache.get_content_path(content_hash)),
"origin": origin,
"description": meta.get("description"),
"tags": meta.get("tags", []),
}
)
resp.raise_for_status()
l2_result = resp.json()
except httpx.HTTPStatusError as e:
error_detail = str(e)
try:
error_detail = e.response.json().get("detail", str(e))
except Exception:
pass
return None, f"L2 publish failed: {error_detail}"
except Exception as e:
return None, f"L2 publish failed: {e}"
# Update local metadata with publish status
await self.db.save_l2_share(
content_hash=content_hash,
name=name,
description=description,
tags=tags,
actor_id=actor_id,
l2_server=l2_server,
asset_name=meta.get("title") or content_hash[:16],
content_type=detect_media_type(self.cache.get_content_path(content_hash))
)
await self.db.update_item_metadata(
content_hash=content_hash,
actor_id=actor_id,
pinned=True,
pin_reason="published"
)
async def delete_item(self, content_hash: str) -> bool:
"""Delete a cached item."""
path = self.cache.get_by_content_hash(content_hash)
if path and path.exists():
path.unlink()
return l2_result.get("ipfs_cid") or ipfs_cid, None
# Remove from database
await self.db.delete_cache_item(content_hash)
return True
async def delete_content(self, content_hash: str, actor_id: str) -> Tuple[bool, Optional[str]]:
"""Delete content from cache. Returns (success, error)."""
if not self.cache.has_content(content_hash):
return False, "Content not found"
def has_content(self, content_hash: str) -> bool:
"""Check if content exists in cache."""
return self.cache.has_content(content_hash)
# Check if pinned
meta = await self.db.load_item_metadata(content_hash, actor_id)
if meta and meta.get("pinned"):
pin_reason = meta.get("pin_reason", "unknown")
return False, f"Cannot discard pinned item (reason: {pin_reason})"
def get_ipfs_cid(self, content_hash: str) -> Optional[str]:
"""Get IPFS CID for cached content."""
return self.cache.get_ipfs_cid(content_hash)
# Check deletion rules via cache_manager
can_delete, reason = self.cache.can_delete(content_hash)
if not can_delete:
return False, f"Cannot discard: {reason}"
# Delete via cache_manager
success, msg = self.cache.delete_by_content_hash(content_hash)
# Clean up legacy metadata files
meta_path = self.cache_dir / f"{content_hash}.meta.json"
if meta_path.exists():
meta_path.unlink()
mp4_path = self.cache_dir / f"{content_hash}.mp4"
if mp4_path.exists():
mp4_path.unlink()
return True, None
async def import_from_ipfs(self, ipfs_cid: str, actor_id: str) -> Tuple[Optional[str], Optional[str]]:
    """Fetch a file from IPFS by CID and register it in the local cache.

    Returns (content_hash, None) on success or (None, error_message).
    """
    try:
        import ipfs_client

        # Stage the download under the legacy directory before ingesting.
        staging_dir = self.cache_dir / "legacy"
        staging_dir.mkdir(parents=True, exist_ok=True)
        staged_file = staging_dir / f"import-{ipfs_cid[:16]}"

        if not ipfs_client.get_file(ipfs_cid, str(staged_file)):
            return None, f"Could not fetch CID {ipfs_cid} from IPFS"

        # Move the staged file into content-addressed storage.
        cached, _ = self.cache.put(staged_file, node_type="import", move=True)
        digest = cached.content_hash

        # Record the cache row plus display metadata.
        await self.db.create_cache_item(digest, ipfs_cid)
        await self.db.save_item_metadata(
            content_hash=digest,
            actor_id=actor_id,
            item_type="media",
            filename=f"ipfs-{ipfs_cid[:16]}"
        )
        return digest, None
    except Exception as e:
        return None, f"Import failed: {e}"
async def upload_content(
    self,
    content: bytes,
    filename: str,
    actor_id: str,
) -> Tuple[Optional[str], Optional[str]]:
    """Store raw uploaded bytes in the cache and record their metadata.

    Returns (content_hash, None) on success or (None, error_message).
    """
    import tempfile
    try:
        # Spill the payload to a temp file so the cache can ingest a path;
        # delete=False because cache.put(move=True) consumes the file.
        with tempfile.NamedTemporaryFile(delete=False) as spool:
            spool.write(content)
        spooled_path = Path(spool.name)

        cached, ipfs_cid = self.cache.put(spooled_path, node_type="upload", move=True)
        digest = cached.content_hash

        # Record the cache row plus display metadata.
        await self.db.create_cache_item(digest, ipfs_cid)
        await self.db.save_item_metadata(
            content_hash=digest,
            actor_id=actor_id,
            item_type="media",
            filename=filename
        )
        return digest, None
    except Exception as e:
        return None, f"Upload failed: {e}"
async def list_media(
self,
@@ -118,12 +489,20 @@ class CacheService:
media_type: str = None,
) -> List[Dict[str, Any]]:
"""List media items in cache."""
# Use list_items internally, converting offset to page
page = (offset // limit) + 1 if limit > 0 else 1
result = await self.list_items(
# Get items from database
items = await self.db.list_cache_items(
actor_id=actor_id or username,
media_type=media_type,
page=page,
offset=offset,
limit=limit,
)
return result.get("items", [])
return items
# Legacy compatibility methods
def has_content(self, content_hash: str) -> bool:
    """Legacy shim: delegate the existence check to the cache manager."""
    exists = self.cache.has_content(content_hash)
    return exists
def get_ipfs_cid(self, content_hash: str) -> Optional[str]:
    """Legacy shim: delegate the CID lookup to the cache manager."""
    cid = self.cache.get_ipfs_cid(content_hash)
    return cid