Add hybrid state manager for distributed L1 coordination

Implements HybridStateManager providing fast local Redis operations
with background IPNS sync for eventual consistency across L1 nodes.

- hybrid_state.py: Centralized state management (cache, claims, analysis, plans, runs)
- Updated execute_cid.py, analyze_cid.py, orchestrate_cid.py to use state manager
- Background IPNS sync (configurable interval, disabled by default)
- Atomic claiming with Redis SETNX to prevent duplicate work

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gilesb
2026-01-11 09:35:50 +00:00
parent f11cec9d48
commit ca8bfd8705
4 changed files with 319 additions and 67 deletions

View File

@@ -8,33 +8,25 @@ Everything on IPFS:
- Step outputs (media files)
The entire pipeline just passes CIDs around.
Uses HybridStateManager for:
- Fast local Redis operations
- Background IPNS sync with other L1 nodes
"""
import json
import logging
import os
import shutil
import tempfile
from pathlib import Path
from typing import Dict, List, Optional
from celery import current_task, group
from celery import group
import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from celery_app import app
import ipfs_client
# Redis for caching
import redis
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/5")
_redis: Optional[redis.Redis] = None
def get_redis() -> redis.Redis:
    """Return the shared Redis client, creating it lazily on first call.

    The connection is built from REDIS_URL with decode_responses=True so
    callers receive str values rather than bytes. Subsequent calls reuse
    the cached module-level client.
    """
    global _redis
    if _redis is not None:
        return _redis
    _redis = redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
from hybrid_state import get_state_manager
# Import artdag modules
try:
@@ -53,11 +45,6 @@ from .execute_cid import execute_step_cid
logger = logging.getLogger(__name__)
# Redis keys
PLAN_CACHE_KEY = "artdag:plan_cid" # hash: plan_id → plan CID
RECIPE_CACHE_KEY = "artdag:recipe_cid" # hash: recipe_hash → recipe CID
RUN_CACHE_KEY = "artdag:run_cid" # hash: run_id → output CID
def compute_run_id(recipe_cid: str, input_cids: Dict[str, str]) -> str:
"""Compute deterministic run ID from recipe and inputs."""
@@ -203,7 +190,7 @@ def generate_plan_cid(
return {"status": "failed", "error": "Failed to store plan on IPFS"}
# Cache plan_id → plan_cid mapping
get_redis().hset(PLAN_CACHE_KEY, plan.plan_id, plan_cid)
get_state_manager().set_plan_cid(plan.plan_id, plan_cid)
logger.info(f"[CID] Generated plan: {plan.plan_id[:16]}... → {plan_cid}")
@@ -327,7 +314,7 @@ def run_recipe_cid(
run_id = compute_run_id(recipe_cid, input_cids)
# Check if run is already cached
cached_output = get_redis().hget(RUN_CACHE_KEY, run_id)
cached_output = get_state_manager().get_run_cid(run_id)
if cached_output:
logger.info(f"[CID] Run cache hit: {run_id[:16]}... → {cached_output}")
return {
@@ -385,7 +372,7 @@ def run_recipe_cid(
output_cid = exec_result["output_cid"]
# Cache the run
get_redis().hset(RUN_CACHE_KEY, run_id, output_cid)
get_state_manager().set_run_cid(run_id, output_cid)
logger.info(f"[CID] Run complete: {run_id[:16]}... → {output_cid}")