Files
rose-ash/artdag/activities.py
giles cc2dcbddd4 Squashed 'core/' content from commit 4957443
git-subtree-dir: core
git-subtree-split: 4957443184ae0eb6323635a90a19acffb3e01d07
2026-02-24 23:09:39 +00:00

372 lines
12 KiB
Python

# artdag/activities.py
"""
Persistent activity (job) tracking for cache management.
Activities represent executions of DAGs. They track:
- Input node IDs (sources)
- Output node ID (terminal node)
- Intermediate node IDs (everything in between)
This enables deletion rules:
- Shared items (ActivityPub published) cannot be deleted
- Inputs/outputs of activities cannot be deleted
- Intermediates can be deleted (reconstructible)
- Activities can only be discarded if no items are shared
"""
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Set
from .cache import Cache, CacheEntry
from .dag import DAG
logger = logging.getLogger(__name__)
def make_is_shared_fn(activitypub_store: "ActivityStore") -> Callable[[str], bool]:
    """
    Build a predicate that reports whether a cid has been published.

    Args:
        activitypub_store: The ActivityPub activity store
            (from artdag.activitypub.activity). Only its
            ``find_by_object_hash`` method is used.

    Returns:
        Function taking a cid and returning True when the store holds at
        least one activity for that cid whose ``activity_type`` is "Create".
    """
    def is_shared(cid: str) -> bool:
        # Walk the matching activities and stop at the first "Create".
        for published in activitypub_store.find_by_object_hash(cid):
            if published.activity_type == "Create":
                return True
        return False

    return is_shared
@dataclass
class Activity:
    """
    A recorded execution of a DAG.

    Tracks which cache entries are inputs, outputs, and intermediates
    to enforce deletion rules.
    """
    activity_id: str
    input_ids: List[str]  # Source node cache IDs
    output_id: str  # Terminal node cache ID
    intermediate_ids: List[str]  # Everything in between
    created_at: float  # time.time() value captured when built via from_dag
    status: str = "completed"  # pending|running|completed|failed
    dag_snapshot: Optional[Dict[str, Any]] = None  # Serialized DAG for reconstruction

    def to_dict(self) -> Dict[str, Any]:
        """Return the activity as a plain dict (the inverse of from_dict)."""
        return {
            "activity_id": self.activity_id,
            "input_ids": self.input_ids,
            "output_id": self.output_id,
            "intermediate_ids": self.intermediate_ids,
            "created_at": self.created_at,
            "status": self.status,
            "dag_snapshot": self.dag_snapshot,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Activity":
        """
        Rebuild an Activity from a dict produced by to_dict.

        "status" and "dag_snapshot" are optional and fall back to
        "completed" and None; every other key is required.

        Raises:
            KeyError: If a required key is missing.
        """
        return cls(
            activity_id=data["activity_id"],
            input_ids=data["input_ids"],
            output_id=data["output_id"],
            intermediate_ids=data["intermediate_ids"],
            created_at=data["created_at"],
            status=data.get("status", "completed"),
            dag_snapshot=data.get("dag_snapshot"),
        )

    @classmethod
    def from_dag(cls, dag: "DAG", activity_id: Optional[str] = None) -> "Activity":
        """
        Create an Activity from a DAG.

        Classifies nodes as inputs, output, or intermediates.

        Args:
            dag: The executed DAG. Its ``nodes`` mapping, ``output_id`` and
                ``to_dict()`` are read.
            activity_id: ID to assign; a random UUID4 string is generated
                when omitted.

        Returns:
            A "completed" Activity stamped with the current time. Input and
            intermediate IDs are stored sorted.
        """
        if activity_id is None:
            activity_id = str(uuid.uuid4())

        # Output is the terminal node
        output_id = dag.output_id

        # Input nodes are the sources: nodes with no inputs of their own.
        # A set keeps the membership test below O(1) per node.
        source_ids = {
            node_id for node_id, node in dag.nodes.items() if not node.inputs
        }

        # Intermediates are everything else
        intermediate_ids = [
            node_id
            for node_id in dag.nodes
            if node_id not in source_ids and node_id != output_id
        ]

        return cls(
            activity_id=activity_id,
            input_ids=sorted(source_ids),
            output_id=output_id,
            intermediate_ids=sorted(intermediate_ids),
            created_at=time.time(),
            status="completed",
            dag_snapshot=dag.to_dict(),
        )

    @property
    def all_node_ids(self) -> List[str]:
        """All node IDs involved in this activity (inputs, output, intermediates)."""
        return self.input_ids + [self.output_id] + self.intermediate_ids
class ActivityStore:
"""
Persistent storage for activities.
Provides methods to check deletion eligibility and perform deletions.
"""
def __init__(self, store_dir: Path | str):
self.store_dir = Path(store_dir)
self.store_dir.mkdir(parents=True, exist_ok=True)
self._activities: Dict[str, Activity] = {}
self._load()
def _index_path(self) -> Path:
return self.store_dir / "activities.json"
def _load(self):
"""Load activities from disk."""
index_path = self._index_path()
if index_path.exists():
try:
with open(index_path) as f:
data = json.load(f)
self._activities = {
a["activity_id"]: Activity.from_dict(a)
for a in data.get("activities", [])
}
except (json.JSONDecodeError, KeyError) as e:
logger.warning(f"Failed to load activities: {e}")
self._activities = {}
def _save(self):
"""Save activities to disk."""
data = {
"version": "1.0",
"activities": [a.to_dict() for a in self._activities.values()],
}
with open(self._index_path(), "w") as f:
json.dump(data, f, indent=2)
def add(self, activity: Activity) -> None:
"""Add an activity."""
self._activities[activity.activity_id] = activity
self._save()
def get(self, activity_id: str) -> Optional[Activity]:
"""Get an activity by ID."""
return self._activities.get(activity_id)
def remove(self, activity_id: str) -> bool:
"""Remove an activity record (does not delete cache entries)."""
if activity_id not in self._activities:
return False
del self._activities[activity_id]
self._save()
return True
def list(self) -> List[Activity]:
"""List all activities."""
return list(self._activities.values())
def find_by_input_ids(self, input_ids: List[str]) -> List[Activity]:
"""Find activities with the same inputs (for UI grouping)."""
sorted_inputs = sorted(input_ids)
return [
a for a in self._activities.values()
if sorted(a.input_ids) == sorted_inputs
]
def find_using_node(self, node_id: str) -> List[Activity]:
"""Find all activities that reference a node ID."""
return [
a for a in self._activities.values()
if node_id in a.all_node_ids
]
def __len__(self) -> int:
return len(self._activities)
class ActivityManager:
    """
    Manages activities and cache deletion with sharing rules.

    Deletion rules:
    1. Shared items (ActivityPub published) cannot be deleted
    2. Inputs/outputs of activities cannot be deleted
    3. Intermediates can be deleted (reconstructible)
    4. Activities can only be discarded if no items are shared
    """

    def __init__(
        self,
        cache: "Cache",
        activity_store: "ActivityStore",
        is_shared_fn: Callable[[str], bool],
    ):
        """
        Args:
            cache: The L1 cache
            activity_store: Activity persistence
            is_shared_fn: Function that checks if a cid is shared
                (published via ActivityPub)
        """
        self.cache = cache
        self.activities = activity_store
        self._is_shared = is_shared_fn

    def record_activity(self, dag: "DAG") -> "Activity":
        """Record a completed DAG execution as an activity and persist it."""
        activity = Activity.from_dag(dag)
        self.activities.add(activity)
        return activity

    def is_shared(self, node_id: str) -> bool:
        """
        Check if a cache entry is shared (published via ActivityPub).

        Returns False when the cache has no entry for ``node_id`` or the
        entry has no cid.
        """
        entry = self.cache.get_entry(node_id)
        if not entry or not entry.cid:
            return False
        return self._is_shared(entry.cid)

    def can_delete_cache_entry(self, node_id: str) -> bool:
        """
        Check if a cache entry can be deleted.

        Returns False if:
        - Entry is shared (ActivityPub published)
        - Entry is an input or output of any activity
        """
        # Check if shared
        if self.is_shared(node_id):
            return False
        # Check if it's an input or output of any activity
        for activity in self.activities.list():
            if node_id in activity.input_ids or node_id == activity.output_id:
                return False
        # It's either an intermediate or orphaned - can delete
        return True

    def can_discard_activity(self, activity_id: str) -> bool:
        """
        Check if an activity can be discarded.

        Returns False if the activity is unknown, or if any of its cache
        entries (input, output, or intermediate) is shared via ActivityPub.
        """
        activity = self.activities.get(activity_id)
        if not activity:
            return False
        # Check if any item is shared
        return not any(self.is_shared(node_id) for node_id in activity.all_node_ids)

    def discard_activity(self, activity_id: str) -> bool:
        """
        Discard an activity and delete the cache entries only it needed.

        Returns False if the activity cannot be discarded (unknown, or has
        shared items).

        When discarded:
        - The activity record is removed
        - Intermediate cache entries are deleted, unless another activity
          uses them as an input or output
        - Output is deleted if orphaned (not shared, not used elsewhere)
        - Inputs are deleted only if orphaned (not shared, not referenced
          by any remaining activity)
        """
        if not self.can_discard_activity(activity_id):
            return False
        activity = self.activities.get(activity_id)
        if not activity:
            return False

        output_id = activity.output_id
        input_ids = list(activity.input_ids)
        intermediate_ids = list(activity.intermediate_ids)

        # Remove the activity record first so it no longer counts as a
        # reference in the protection/orphan checks below.
        self.activities.remove(activity_id)

        # Delete intermediates
        for node_id in intermediate_ids:
            # The same cache entry may be an input or output of another
            # activity; rule 2 protects it in that case.
            if not self.can_delete_cache_entry(node_id):
                continue
            self.cache.remove(node_id)
            logger.debug("Deleted intermediate: %s", node_id)

        # Check if output is now orphaned
        if self._is_orphaned(output_id) and not self.is_shared(output_id):
            self.cache.remove(output_id)
            logger.debug("Deleted orphaned output: %s", output_id)

        # Inputs stay while another activity references them; delete the
        # ones that are orphaned now.
        for input_id in input_ids:
            if input_id == output_id:
                # Already handled as the output above; avoid a second remove.
                continue
            if self._is_orphaned(input_id) and not self.is_shared(input_id):
                self.cache.remove(input_id)
                logger.debug("Deleted orphaned input: %s", input_id)
        return True

    def _is_orphaned(self, node_id: str) -> bool:
        """Check if a node is not referenced by any activity."""
        return not any(
            node_id in activity.all_node_ids
            for activity in self.activities.list()
        )

    def get_deletable_entries(self) -> List["CacheEntry"]:
        """Get all cache entries that can be deleted."""
        return [
            entry for entry in self.cache.list_entries()
            if self.can_delete_cache_entry(entry.node_id)
        ]

    def get_discardable_activities(self) -> List["Activity"]:
        """Get all activities that can be discarded."""
        return [
            a for a in self.activities.list()
            if self.can_discard_activity(a.activity_id)
        ]

    def cleanup_intermediates(self) -> int:
        """
        Delete all deletable intermediate cache entries.

        Intermediates are safe to delete as they can be reconstructed
        from inputs using the DAG. An intermediate that is shared, or that
        another activity uses as an input or output, is left in place.

        Returns:
            Number of entries deleted
        """
        deleted = 0
        for activity in self.activities.list():
            for node_id in activity.intermediate_ids:
                if not self.cache.has(node_id):
                    continue
                # Rules 1 and 2 take precedence over "intermediates are
                # deletable": one activity's intermediate can be another's
                # input/output, or may have been published.
                if not self.can_delete_cache_entry(node_id):
                    continue
                self.cache.remove(node_id)
                deleted += 1
        return deleted