Add hybrid state manager for distributed L1 coordination

Implements HybridStateManager providing fast local Redis operations
with background IPNS sync for eventual consistency across L1 nodes.

- hybrid_state.py: Centralized state management (cache, claims, analysis, plans, runs)
- Updated execute_cid.py, analyze_cid.py, orchestrate_cid.py to use state manager
- Background IPNS sync (configurable interval, disabled by default)
- Atomic claiming with Redis SETNX to prevent duplicate work

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gilesb
2026-01-11 09:35:50 +00:00
parent f11cec9d48
commit ca8bfd8705
4 changed files with 319 additions and 67 deletions

View File

@@ -3,6 +3,10 @@ Simplified step execution with IPFS-primary architecture.
Steps receive CIDs, produce CIDs. No file paths cross machine boundaries.
IPFS nodes form a distributed cache automatically.
Uses HybridStateManager for:
- Fast local Redis operations
- Background IPNS sync with other L1 nodes
"""
import logging
@@ -19,17 +23,7 @@ import sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from celery_app import app
import ipfs_client
# Redis for claiming and cache_id → CID mapping
import redis
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/5")
_redis: Optional[redis.Redis] = None
def get_redis() -> redis.Redis:
global _redis
if _redis is None:
_redis = redis.from_url(REDIS_URL, decode_responses=True)
return _redis
from hybrid_state import get_state_manager
# Import artdag
try:
@@ -44,10 +38,6 @@ except ImportError:
logger = logging.getLogger(__name__)
# Redis keys
CACHE_KEY = "artdag:cid_cache" # hash: cache_id → CID
CLAIM_KEY_PREFIX = "artdag:claim:" # string: cache_id → worker_id
def get_worker_id() -> str:
"""Get unique worker identifier."""
@@ -56,24 +46,22 @@ def get_worker_id() -> str:
def get_cached_cid(cache_id: str) -> Optional[str]:
"""Check if cache_id has a known CID."""
return get_redis().hget(CACHE_KEY, cache_id)
return get_state_manager().get_cached_cid(cache_id)
def set_cached_cid(cache_id: str, cid: str) -> None:
"""Store cache_id → CID mapping."""
get_redis().hset(CACHE_KEY, cache_id, cid)
get_state_manager().set_cached_cid(cache_id, cid)
def try_claim(cache_id: str, worker_id: str, ttl: int = 300) -> bool:
"""Try to claim a cache_id for execution. Returns True if claimed."""
key = f"{CLAIM_KEY_PREFIX}{cache_id}"
return get_redis().set(key, worker_id, nx=True, ex=ttl)
return get_state_manager().try_claim(cache_id, worker_id, ttl)
def release_claim(cache_id: str) -> None:
"""Release a claim."""
key = f"{CLAIM_KEY_PREFIX}{cache_id}"
get_redis().delete(key)
get_state_manager().release_claim(cache_id)
def wait_for_cid(cache_id: str, timeout: int = 600, poll_interval: float = 0.5) -> Optional[str]: