Add at-least-once delivery + idempotent federation handler

- EventProcessor now recovers stuck "processing" activities back to
  "pending" after 5 minutes (handles process crashes)
- New ap_delivery_log table records successful inbox deliveries
- Federation delivery handler checks the log before sending, so
  retries skip already-delivered inboxes
- Together these give at-least-once + idempotent semantics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-23 16:16:55 +00:00
parent 8951a62b90
commit 9cb8cf9e1d
4 changed files with 140 additions and 12 deletions

View File

@@ -15,10 +15,10 @@ from __future__ import annotations
import asyncio
import logging
import traceback
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
import asyncpg
from sqlalchemy import select
from sqlalchemy import select, update
from sqlalchemy.ext.asyncio import AsyncSession
from shared.db.session import get_session, DATABASE_URL
@@ -37,15 +37,18 @@ class EventProcessor:
app_name: str | None = None,
poll_interval: float = 2.0,
batch_size: int = 10,
stuck_timeout: float = 300.0,
):
self._app_name = app_name
self._poll_interval = poll_interval
self._batch_size = batch_size
self._stuck_timeout = stuck_timeout # seconds before "processing" → "pending"
self._task: asyncio.Task | None = None
self._listen_task: asyncio.Task | None = None
self._listen_conn: asyncpg.Connection | None = None
self._wake = asyncio.Event()
self._running = False
self._reap_counter = 0
# ------------------------------------------------------------------
# Lifecycle
@@ -119,6 +122,12 @@ class EventProcessor:
async def _poll_loop(self) -> None:
while self._running:
try:
# Periodically recover stuck activities (~every 30 cycles)
self._reap_counter += 1
if self._reap_counter >= 30:
self._reap_counter = 0
await self._recover_stuck()
# Clear before processing so any NOTIFY that arrives during
# _process_batch sets the event and we loop immediately.
self._wake.clear()
@@ -137,6 +146,37 @@ class EventProcessor:
traceback.print_exc()
await asyncio.sleep(self._poll_interval)
async def _recover_stuck(self) -> None:
"""Reset activities stuck in 'processing' back to 'pending'.
This handles the case where a process crashed mid-handler.
Combined with idempotent handlers, this gives at-least-once delivery.
"""
cutoff = datetime.now(timezone.utc) - timedelta(seconds=self._stuck_timeout)
try:
async with get_session() as session:
filters = [
APActivity.process_state == "processing",
APActivity.created_at < cutoff,
]
if self._app_name:
filters.append(APActivity.origin_app == self._app_name)
result = await session.execute(
update(APActivity)
.where(*filters)
.values(process_state="pending")
.returning(APActivity.id)
)
recovered = result.scalars().all()
await session.commit()
if recovered:
log.warning(
"Recovered %d stuck activities: %s",
len(recovered), recovered,
)
except Exception:
log.exception("Failed to recover stuck activities")
async def _process_batch(self) -> int:
"""Fetch and process a batch of pending activities. Returns count processed."""
processed = 0