372 lines
12 KiB
Python
372 lines
12 KiB
Python
# artdag/activities.py
|
|
"""
|
|
Persistent activity (job) tracking for cache management.
|
|
|
|
Activities represent executions of DAGs. They track:
|
|
- Input node IDs (sources)
|
|
- Output node ID (terminal node)
|
|
- Intermediate node IDs (everything in between)
|
|
|
|
This enables deletion rules:
|
|
- Shared items (ActivityPub published) cannot be deleted
|
|
- Inputs/outputs of activities cannot be deleted
|
|
- Intermediates can be deleted (reconstructible)
|
|
- Activities can only be discarded if no items are shared
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import time
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any, Callable, Dict, List, Optional, Set
|
|
|
|
from .cache import Cache, CacheEntry
|
|
from .dag import DAG
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def make_is_shared_fn(activitypub_store: "ActivityStore") -> Callable[[str], bool]:
    """
    Build an ``is_shared`` predicate backed by an ActivityPub activity store.

    Args:
        activitypub_store: Object exposing ``find_by_object_hash(cid)`` that
            returns an iterable of activities, each with an
            ``activity_type`` attribute.
            NOTE(review): the annotation names ``ActivityStore``, which in
            this module is the persistent job store and has no
            ``find_by_object_hash``; the ActivityPub store class is
            presumably the intended type - confirm.

    Returns:
        A function taking a cid and returning True when at least one
        activity found for that cid has type ``"Create"``.
    """

    def is_shared(cid: str) -> bool:
        # A single "Create" activity is enough to treat the cid as published.
        for published in activitypub_store.find_by_object_hash(cid):
            if published.activity_type == "Create":
                return True
        return False

    return is_shared
|
|
|
|
|
|
@dataclass
class Activity:
    """
    A recorded execution of a DAG.

    Tracks which cache entries are inputs, outputs, and intermediates
    to enforce deletion rules.
    """

    activity_id: str
    input_ids: List[str]  # Source node cache IDs
    output_id: str  # Terminal node cache ID
    intermediate_ids: List[str]  # Everything in between
    created_at: float  # time.time() value when built via from_dag
    status: str = "completed"  # pending|running|completed|failed
    dag_snapshot: Optional[Dict[str, Any]] = None  # Serialized DAG for reconstruction

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize the activity to a plain dict.

        The ID lists are copied so that mutating the returned dict cannot
        alter this activity. ``dag_snapshot`` is passed through by reference.
        """
        return {
            "activity_id": self.activity_id,
            "input_ids": list(self.input_ids),
            "output_id": self.output_id,
            "intermediate_ids": list(self.intermediate_ids),
            "created_at": self.created_at,
            "status": self.status,
            "dag_snapshot": self.dag_snapshot,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "Activity":
        """
        Rebuild an activity from the dict produced by ``to_dict``.

        ``status`` and ``dag_snapshot`` are optional and default to
        ``"completed"`` and ``None``.

        Raises:
            KeyError: If a required key is missing from ``data``.
        """
        return cls(
            activity_id=data["activity_id"],
            # Copy so the activity does not share list objects with ``data``.
            input_ids=list(data["input_ids"]),
            output_id=data["output_id"],
            intermediate_ids=list(data["intermediate_ids"]),
            created_at=data["created_at"],
            status=data.get("status", "completed"),
            dag_snapshot=data.get("dag_snapshot"),
        )

    @classmethod
    def from_dag(cls, dag: DAG, activity_id: Optional[str] = None) -> "Activity":
        """
        Create an Activity from a DAG.

        Classifies nodes as inputs, output, or intermediates:
        nodes with no inputs are sources, ``dag.output_id`` is the output,
        and every remaining node is an intermediate.

        Args:
            dag: The DAG to record. Its ``nodes`` mapping, ``output_id`` and
                ``to_dict()`` are read.
            activity_id: ID to assign; a random UUID4 string is generated
                when omitted.

        Returns:
            A ``"completed"`` activity stamped with the current time and
            carrying a snapshot of the DAG.
        """
        if activity_id is None:
            activity_id = str(uuid.uuid4())

        nodes = dag.nodes

        # Sources are the nodes that consume nothing.
        input_ids = [node_id for node_id, node in nodes.items() if not node.inputs]

        # Output is the terminal node.
        output_id = dag.output_id

        # Everything that is neither a source nor the output is an
        # intermediate; a set keeps the membership test O(1) per node.
        classified = set(input_ids)
        classified.add(output_id)
        intermediate_ids = [node_id for node_id in nodes if node_id not in classified]

        return cls(
            activity_id=activity_id,
            input_ids=sorted(input_ids),
            output_id=output_id,
            intermediate_ids=sorted(intermediate_ids),
            created_at=time.time(),
            status="completed",
            dag_snapshot=dag.to_dict(),
        )

    @property
    def all_node_ids(self) -> List[str]:
        """All node IDs involved in this activity (inputs, output, intermediates)."""
        return self.input_ids + [self.output_id] + self.intermediate_ids
|
|
|
|
|
|
class ActivityStore:
|
|
"""
|
|
Persistent storage for activities.
|
|
|
|
Provides methods to check deletion eligibility and perform deletions.
|
|
"""
|
|
|
|
def __init__(self, store_dir: Path | str):
|
|
self.store_dir = Path(store_dir)
|
|
self.store_dir.mkdir(parents=True, exist_ok=True)
|
|
self._activities: Dict[str, Activity] = {}
|
|
self._load()
|
|
|
|
def _index_path(self) -> Path:
|
|
return self.store_dir / "activities.json"
|
|
|
|
def _load(self):
|
|
"""Load activities from disk."""
|
|
index_path = self._index_path()
|
|
if index_path.exists():
|
|
try:
|
|
with open(index_path) as f:
|
|
data = json.load(f)
|
|
self._activities = {
|
|
a["activity_id"]: Activity.from_dict(a)
|
|
for a in data.get("activities", [])
|
|
}
|
|
except (json.JSONDecodeError, KeyError) as e:
|
|
logger.warning(f"Failed to load activities: {e}")
|
|
self._activities = {}
|
|
|
|
def _save(self):
|
|
"""Save activities to disk."""
|
|
data = {
|
|
"version": "1.0",
|
|
"activities": [a.to_dict() for a in self._activities.values()],
|
|
}
|
|
with open(self._index_path(), "w") as f:
|
|
json.dump(data, f, indent=2)
|
|
|
|
def add(self, activity: Activity) -> None:
|
|
"""Add an activity."""
|
|
self._activities[activity.activity_id] = activity
|
|
self._save()
|
|
|
|
def get(self, activity_id: str) -> Optional[Activity]:
|
|
"""Get an activity by ID."""
|
|
return self._activities.get(activity_id)
|
|
|
|
def remove(self, activity_id: str) -> bool:
|
|
"""Remove an activity record (does not delete cache entries)."""
|
|
if activity_id not in self._activities:
|
|
return False
|
|
del self._activities[activity_id]
|
|
self._save()
|
|
return True
|
|
|
|
def list(self) -> List[Activity]:
|
|
"""List all activities."""
|
|
return list(self._activities.values())
|
|
|
|
def find_by_input_ids(self, input_ids: List[str]) -> List[Activity]:
|
|
"""Find activities with the same inputs (for UI grouping)."""
|
|
sorted_inputs = sorted(input_ids)
|
|
return [
|
|
a for a in self._activities.values()
|
|
if sorted(a.input_ids) == sorted_inputs
|
|
]
|
|
|
|
def find_using_node(self, node_id: str) -> List[Activity]:
|
|
"""Find all activities that reference a node ID."""
|
|
return [
|
|
a for a in self._activities.values()
|
|
if node_id in a.all_node_ids
|
|
]
|
|
|
|
def __len__(self) -> int:
|
|
return len(self._activities)
|
|
|
|
|
|
class ActivityManager:
    """
    Manages activities and cache deletion with sharing rules.

    Deletion rules:
    1. Shared items (ActivityPub published) cannot be deleted
    2. Inputs/outputs of activities cannot be deleted
    3. Intermediates can be deleted (reconstructible)
    4. Activities can only be discarded if no items are shared
    """

    def __init__(
        self,
        cache: Cache,
        activity_store: ActivityStore,
        is_shared_fn: Callable[[str], bool],
    ):
        """
        Args:
            cache: The L1 cache
            activity_store: Activity persistence
            is_shared_fn: Function that checks if a cid is shared
                (published via ActivityPub)
        """
        self.cache = cache
        self.activities = activity_store
        self._is_shared = is_shared_fn

    def record_activity(self, dag: DAG) -> Activity:
        """Record a completed DAG execution as an activity and persist it."""
        activity = Activity.from_dag(dag)
        self.activities.add(activity)
        return activity

    def is_shared(self, node_id: str) -> bool:
        """
        Check if a cache entry is shared (published via ActivityPub).

        Returns False when the node has no cache entry or the entry has no
        cid; otherwise defers to the injected ``is_shared_fn``.
        """
        entry = self.cache.get_entry(node_id)
        if not entry or not entry.cid:
            return False
        return self._is_shared(entry.cid)

    def can_delete_cache_entry(self, node_id: str) -> bool:
        """
        Check if a cache entry can be deleted.

        Returns False if:
        - Entry is shared (ActivityPub published)
        - Entry is an input or output of any activity

        Intermediates and entries not referenced by any activity are
        deletable.
        """
        if self.is_shared(node_id):
            return False

        return not any(
            node_id == activity.output_id or node_id in activity.input_ids
            for activity in self.activities.list()
        )

    def can_discard_activity(self, activity_id: str) -> bool:
        """
        Check if an activity can be discarded.

        Returns False if the activity does not exist or if any cache entry
        (input, output, or intermediate) is shared via ActivityPub.
        """
        activity = self.activities.get(activity_id)
        if not activity:
            return False

        return not any(self.is_shared(node_id) for node_id in activity.all_node_ids)

    def discard_activity(self, activity_id: str) -> bool:
        """
        Discard an activity and delete the cache entries only it needed.

        Returns False if the activity cannot be discarded (unknown ID or
        has shared items).

        When discarded:
        - The activity record is removed
        - Intermediate cache entries are deleted, except those that are an
          input or output of another remaining activity
        - Output is deleted if orphaned (not shared, not used elsewhere)
        - Inputs are deleted only if orphaned (not shared, not used by any
          remaining activity); otherwise they remain
        """
        if not self.can_discard_activity(activity_id):
            return False

        activity = self.activities.get(activity_id)
        if not activity:
            return False

        # Remove the record first so the checks below only see the
        # activities that remain.
        self.activities.remove(activity_id)

        # Nodes already removed in this call; a node can appear in more than
        # one role (e.g. a single-node DAG whose source is also its output).
        removed = set()

        for node_id in activity.intermediate_ids:
            if node_id in removed:
                continue
            if not self.can_delete_cache_entry(node_id):
                # Still an input/output of another activity (or shared):
                # deleting it would break deletion rules 1 and 2.
                logger.debug(f"Kept intermediate still in use: {node_id}")
                continue
            self.cache.remove(node_id)
            removed.add(node_id)
            logger.debug(f"Deleted intermediate: {node_id}")

        output_id = activity.output_id
        if (
            output_id not in removed
            and self._is_orphaned(output_id)
            and not self.is_shared(output_id)
        ):
            self.cache.remove(output_id)
            removed.add(output_id)
            logger.debug(f"Deleted orphaned output: {output_id}")

        for input_id in activity.input_ids:
            if input_id in removed:
                continue
            if self._is_orphaned(input_id) and not self.is_shared(input_id):
                self.cache.remove(input_id)
                removed.add(input_id)
                logger.debug(f"Deleted orphaned input: {input_id}")

        return True

    def _is_orphaned(self, node_id: str) -> bool:
        """Check if a node is not referenced by any activity."""
        return not any(
            node_id in activity.all_node_ids
            for activity in self.activities.list()
        )

    def get_deletable_entries(self) -> List[CacheEntry]:
        """Get all cache entries that can be deleted."""
        return [
            entry for entry in self.cache.list_entries()
            if self.can_delete_cache_entry(entry.node_id)
        ]

    def get_discardable_activities(self) -> List[Activity]:
        """Get all activities that can be discarded."""
        return [
            a for a in self.activities.list()
            if self.can_discard_activity(a.activity_id)
        ]

    def cleanup_intermediates(self) -> int:
        """
        Delete all deletable intermediate cache entries.

        Intermediates are safe to delete as they can be reconstructed
        from inputs using the DAG. An intermediate is skipped when it is
        shared or is also an input/output of some activity, since those
        entries are protected by the deletion rules.

        Returns:
            Number of entries deleted
        """
        activities = self.activities.list()

        # Every node that is an input or output of some activity is protected.
        protected = set()
        for activity in activities:
            protected.update(activity.input_ids)
            protected.add(activity.output_id)

        deleted = 0
        for activity in activities:
            for node_id in activity.intermediate_ids:
                if not self.cache.has(node_id):
                    continue
                if node_id in protected or self.is_shared(node_id):
                    continue
                self.cache.remove(node_id)
                deleted += 1
        return deleted
|