Add hybrid state manager for distributed L1 coordination
Implements HybridStateManager, which provides fast local Redis operations with background IPNS sync for eventual consistency across L1 nodes. Changes: hybrid_state.py adds centralized state management (cache, claims, analysis, plans, runs); execute_cid.py, analyze_cid.py, and orchestrate_cid.py are updated to use the state manager; background IPNS sync has a configurable interval and is disabled by default; atomic claiming uses Redis SETNX to prevent duplicate work. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -2,6 +2,10 @@
|
||||
IPFS-primary analysis tasks.
|
||||
|
||||
Fetches inputs from IPFS, stores analysis results on IPFS.
|
||||
|
||||
Uses HybridStateManager for:
|
||||
- Fast local Redis operations
|
||||
- Background IPNS sync with other L1 nodes
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -18,17 +22,7 @@ import sys
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from celery_app import app
|
||||
import ipfs_client
|
||||
|
||||
# Redis for caching analysis CIDs
|
||||
import redis
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/5")
|
||||
_redis: Optional[redis.Redis] = None
|
||||
|
||||
def get_redis() -> redis.Redis:
    """Return the shared Redis client, connecting lazily on first use.

    The connection string comes from REDIS_URL; responses are decoded
    to str so callers work with text, not bytes.
    """
    global _redis
    if _redis is not None:
        return _redis
    _redis = redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
|
||||
from hybrid_state import get_state_manager
|
||||
|
||||
# Import artdag analysis module
|
||||
try:
|
||||
@@ -39,26 +33,15 @@ except ImportError:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Redis key for analysis cache
|
||||
ANALYSIS_CACHE_KEY = "artdag:analysis_cid" # hash: input_hash:features → analysis CID
|
||||
|
||||
|
||||
def get_analysis_cache_key(input_hash: str, features: List[str]) -> str:
    """Build the deterministic cache key for an (input, features) pair.

    Features are sorted so that the same set always yields the same
    key regardless of the order the caller passes them in.
    """
    return f"{input_hash}:{','.join(sorted(features))}"
|
||||
|
||||
|
||||
def get_cached_analysis_cid(input_hash: str, features: List[str]) -> Optional[str]:
    """Return the cached analysis CID for this input/feature set, or None.

    Lookup goes through the HybridStateManager (local Redis with
    background IPNS sync) rather than hitting Redis directly.
    """
    # NOTE: the old direct-Redis return made the state-manager call
    # unreachable leftover diff residue; only the state manager is used now.
    return get_state_manager().get_analysis_cid(input_hash, features)
|
||||
|
||||
|
||||
def set_cached_analysis_cid(input_hash: str, features: List[str], cid: str) -> None:
    """Record the analysis CID for this input/feature set.

    Writes go through the HybridStateManager only; the previous direct
    Redis hset duplicated the write into a second, unsynced store.
    """
    get_state_manager().set_analysis_cid(input_hash, features, cid)
|
||||
|
||||
|
||||
@app.task(bind=True, name='tasks.analyze_input_cid')
|
||||
|
||||
@@ -3,6 +3,10 @@ Simplified step execution with IPFS-primary architecture.
|
||||
|
||||
Steps receive CIDs, produce CIDs. No file paths cross machine boundaries.
|
||||
IPFS nodes form a distributed cache automatically.
|
||||
|
||||
Uses HybridStateManager for:
|
||||
- Fast local Redis operations
|
||||
- Background IPNS sync with other L1 nodes
|
||||
"""
|
||||
|
||||
import logging
|
||||
@@ -19,17 +23,7 @@ import sys
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from celery_app import app
|
||||
import ipfs_client
|
||||
|
||||
# Redis for claiming and cache_id → CID mapping
|
||||
import redis
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/5")
|
||||
_redis: Optional[redis.Redis] = None
|
||||
|
||||
def get_redis() -> redis.Redis:
    """Lazily create and return the process-wide Redis client.

    Uses REDIS_URL for the connection and decodes responses to str.
    """
    global _redis
    if _redis is not None:
        return _redis
    _redis = redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
|
||||
from hybrid_state import get_state_manager
|
||||
|
||||
# Import artdag
|
||||
try:
|
||||
@@ -44,10 +38,6 @@ except ImportError:
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Redis keys
|
||||
CACHE_KEY = "artdag:cid_cache" # hash: cache_id → CID
|
||||
CLAIM_KEY_PREFIX = "artdag:claim:" # string: cache_id → worker_id
|
||||
|
||||
|
||||
def get_worker_id() -> str:
|
||||
"""Get unique worker identifier."""
|
||||
@@ -56,24 +46,22 @@ def get_worker_id() -> str:
|
||||
|
||||
def get_cached_cid(cache_id: str) -> Optional[str]:
    """Return the CID known for cache_id, or None if not yet cached.

    Delegates to the HybridStateManager; the old direct-Redis hget
    made the state-manager call unreachable (leftover diff residue).
    """
    return get_state_manager().get_cached_cid(cache_id)
|
||||
|
||||
|
||||
def set_cached_cid(cache_id: str, cid: str) -> None:
    """Store the cache_id → CID mapping.

    Writes go through the HybridStateManager only; the previous direct
    Redis hset duplicated the write into a second, unsynced store.
    """
    get_state_manager().set_cached_cid(cache_id, cid)
|
||||
|
||||
|
||||
def try_claim(cache_id: str, worker_id: str, ttl: int = 300) -> bool:
    """Atomically claim cache_id for this worker.

    Returns True if the claim was acquired, False if another worker
    already holds it. The claim auto-expires after ttl seconds so a
    crashed worker cannot hold it forever.
    """
    # The old direct Redis SETNX made the state-manager call unreachable
    # (leftover diff residue); claiming now goes through the manager.
    return get_state_manager().try_claim(cache_id, worker_id, ttl)
|
||||
|
||||
|
||||
def release_claim(cache_id: str) -> None:
    """Release the claim on cache_id so other workers may take it.

    Release goes through the HybridStateManager only; the previous
    direct Redis delete duplicated the operation.
    """
    get_state_manager().release_claim(cache_id)
|
||||
|
||||
|
||||
def wait_for_cid(cache_id: str, timeout: int = 600, poll_interval: float = 0.5) -> Optional[str]:
|
||||
|
||||
@@ -8,33 +8,25 @@ Everything on IPFS:
|
||||
- Step outputs (media files)
|
||||
|
||||
The entire pipeline just passes CIDs around.
|
||||
|
||||
Uses HybridStateManager for:
|
||||
- Fast local Redis operations
|
||||
- Background IPNS sync with other L1 nodes
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from celery import current_task, group
|
||||
from celery import group
|
||||
|
||||
import sys
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
from celery_app import app
|
||||
import ipfs_client
|
||||
|
||||
# Redis for caching
|
||||
import redis
|
||||
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/5")
|
||||
_redis: Optional[redis.Redis] = None
|
||||
|
||||
def get_redis() -> redis.Redis:
    """Return the module's cached Redis connection, building it on demand.

    Connection parameters come from REDIS_URL; decode_responses=True
    means values come back as str rather than bytes.
    """
    global _redis
    if _redis is not None:
        return _redis
    _redis = redis.from_url(REDIS_URL, decode_responses=True)
    return _redis
|
||||
from hybrid_state import get_state_manager
|
||||
|
||||
# Import artdag modules
|
||||
try:
|
||||
@@ -53,11 +45,6 @@ from .execute_cid import execute_step_cid
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Redis keys
|
||||
PLAN_CACHE_KEY = "artdag:plan_cid" # hash: plan_id → plan CID
|
||||
RECIPE_CACHE_KEY = "artdag:recipe_cid" # hash: recipe_hash → recipe CID
|
||||
RUN_CACHE_KEY = "artdag:run_cid" # hash: run_id → output CID
|
||||
|
||||
|
||||
def compute_run_id(recipe_cid: str, input_cids: Dict[str, str]) -> str:
|
||||
"""Compute deterministic run ID from recipe and inputs."""
|
||||
@@ -203,7 +190,7 @@ def generate_plan_cid(
|
||||
return {"status": "failed", "error": "Failed to store plan on IPFS"}
|
||||
|
||||
# Cache plan_id → plan_cid mapping
|
||||
get_redis().hset(PLAN_CACHE_KEY, plan.plan_id, plan_cid)
|
||||
get_state_manager().set_plan_cid(plan.plan_id, plan_cid)
|
||||
|
||||
logger.info(f"[CID] Generated plan: {plan.plan_id[:16]}... → {plan_cid}")
|
||||
|
||||
@@ -327,7 +314,7 @@ def run_recipe_cid(
|
||||
run_id = compute_run_id(recipe_cid, input_cids)
|
||||
|
||||
# Check if run is already cached
|
||||
cached_output = get_redis().hget(RUN_CACHE_KEY, run_id)
|
||||
cached_output = get_state_manager().get_run_cid(run_id)
|
||||
if cached_output:
|
||||
logger.info(f"[CID] Run cache hit: {run_id[:16]}... → {cached_output}")
|
||||
return {
|
||||
@@ -385,7 +372,7 @@ def run_recipe_cid(
|
||||
output_cid = exec_result["output_cid"]
|
||||
|
||||
# Cache the run
|
||||
get_redis().hset(RUN_CACHE_KEY, run_id, output_cid)
|
||||
get_state_manager().set_run_cid(run_id, output_cid)
|
||||
|
||||
logger.info(f"[CID] Run complete: {run_id[:16]}... → {output_cid}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user