Add at-least-once delivery + idempotent federation handler
- EventProcessor now recovers stuck "processing" activities back to "pending" after 5 minutes (handles process crashes) - New ap_delivery_log table records successful inbox deliveries - Federation delivery handler checks the log before sending, so retries skip already-delivered inboxes - Together these give at-least-once + idempotent semantics Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,10 +15,10 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import logging
|
||||
import traceback
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import asyncpg
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy import select, update
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from shared.db.session import get_session, DATABASE_URL
|
||||
@@ -37,15 +37,18 @@ class EventProcessor:
|
||||
app_name: str | None = None,
|
||||
poll_interval: float = 2.0,
|
||||
batch_size: int = 10,
|
||||
stuck_timeout: float = 300.0,
|
||||
):
|
||||
self._app_name = app_name
|
||||
self._poll_interval = poll_interval
|
||||
self._batch_size = batch_size
|
||||
self._stuck_timeout = stuck_timeout # seconds before "processing" → "pending"
|
||||
self._task: asyncio.Task | None = None
|
||||
self._listen_task: asyncio.Task | None = None
|
||||
self._listen_conn: asyncpg.Connection | None = None
|
||||
self._wake = asyncio.Event()
|
||||
self._running = False
|
||||
self._reap_counter = 0
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
@@ -119,6 +122,12 @@ class EventProcessor:
|
||||
async def _poll_loop(self) -> None:
|
||||
while self._running:
|
||||
try:
|
||||
# Periodically recover stuck activities (~every 30 cycles)
|
||||
self._reap_counter += 1
|
||||
if self._reap_counter >= 30:
|
||||
self._reap_counter = 0
|
||||
await self._recover_stuck()
|
||||
|
||||
# Clear before processing so any NOTIFY that arrives during
|
||||
# _process_batch sets the event and we loop immediately.
|
||||
self._wake.clear()
|
||||
@@ -137,6 +146,37 @@ class EventProcessor:
|
||||
traceback.print_exc()
|
||||
await asyncio.sleep(self._poll_interval)
|
||||
|
||||
async def _recover_stuck(self) -> None:
|
||||
"""Reset activities stuck in 'processing' back to 'pending'.
|
||||
|
||||
This handles the case where a process crashed mid-handler.
|
||||
Combined with idempotent handlers, this gives at-least-once delivery.
|
||||
"""
|
||||
cutoff = datetime.now(timezone.utc) - timedelta(seconds=self._stuck_timeout)
|
||||
try:
|
||||
async with get_session() as session:
|
||||
filters = [
|
||||
APActivity.process_state == "processing",
|
||||
APActivity.created_at < cutoff,
|
||||
]
|
||||
if self._app_name:
|
||||
filters.append(APActivity.origin_app == self._app_name)
|
||||
result = await session.execute(
|
||||
update(APActivity)
|
||||
.where(*filters)
|
||||
.values(process_state="pending")
|
||||
.returning(APActivity.id)
|
||||
)
|
||||
recovered = result.scalars().all()
|
||||
await session.commit()
|
||||
if recovered:
|
||||
log.warning(
|
||||
"Recovered %d stuck activities: %s",
|
||||
len(recovered), recovered,
|
||||
)
|
||||
except Exception:
|
||||
log.exception("Failed to recover stuck activities")
|
||||
|
||||
async def _process_batch(self) -> int:
|
||||
"""Fetch and process a batch of pending activities. Returns count processed."""
|
||||
processed = 0
|
||||
|
||||
Reference in New Issue
Block a user