Add at-least-once delivery + idempotent federation handler

- EventProcessor now recovers stuck "processing" activities back to
  "pending" after 5 minutes (handles process crashes)
- New ap_delivery_log table records successful inbox deliveries
- Federation delivery handler checks the log before sending, so
  retries skip already-delivered inboxes
- Together these give at-least-once + idempotent semantics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
giles
2026-02-23 16:16:55 +00:00
parent 8951a62b90
commit 9cb8cf9e1d
4 changed files with 140 additions and 12 deletions

View File

@@ -0,0 +1,30 @@
"""Add ap_delivery_log table for idempotent federation delivery
Revision ID: s9q7n3o5p6
Revises: r8p6m2n4o5
"""
from alembic import op
import sqlalchemy as sa
revision = "s9q7n3o5p6"
down_revision = "r8p6m2n4o5"
branch_labels = None
depends_on = None
def upgrade():
op.create_table(
"ap_delivery_log",
sa.Column("id", sa.Integer, primary_key=True, autoincrement=True),
sa.Column("activity_id", sa.Integer, sa.ForeignKey("ap_activities.id", ondelete="CASCADE"), nullable=False),
sa.Column("inbox_url", sa.String(512), nullable=False),
sa.Column("status_code", sa.Integer, nullable=True),
sa.Column("delivered_at", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()),
sa.UniqueConstraint("activity_id", "inbox_url", name="uq_delivery_activity_inbox"),
)
op.create_index("ix_ap_delivery_activity", "ap_delivery_log", ["activity_id"])
def downgrade():
op.drop_index("ix_ap_delivery_activity", table_name="ap_delivery_log")
op.drop_table("ap_delivery_log")

View File

@@ -2,6 +2,9 @@
Registered as a wildcard handler — fires for every activity. Skips Registered as a wildcard handler — fires for every activity. Skips
non-public activities and those without an actor profile. non-public activities and those without an actor profile.
Idempotent: successful deliveries are recorded in ap_delivery_log.
On retry (at-least-once reaper), already-delivered inboxes are skipped.
""" """
from __future__ import annotations from __future__ import annotations
@@ -12,7 +15,7 @@ from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from shared.events.bus import register_activity_handler from shared.events.bus import register_activity_handler
from shared.models.federation import ActorProfile, APActivity, APFollower from shared.models.federation import ActorProfile, APActivity, APFollower, APDeliveryLog
from shared.services.registry import services from shared.services.registry import services
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@@ -67,8 +70,8 @@ async def _deliver_to_inbox(
body: dict, body: dict,
actor: ActorProfile, actor: ActorProfile,
domain: str, domain: str,
) -> bool: ) -> int | None:
"""POST signed activity to a single inbox. Returns True on success.""" """POST signed activity to a single inbox. Returns status code or None on error."""
from shared.utils.http_signatures import sign_request from shared.utils.http_signatures import sign_request
from urllib.parse import urlparse from urllib.parse import urlparse
import json import json
@@ -96,13 +99,12 @@ async def _deliver_to_inbox(
) )
if resp.status_code < 300: if resp.status_code < 300:
log.info("Delivered to %s%d", inbox_url, resp.status_code) log.info("Delivered to %s%d", inbox_url, resp.status_code)
return True
else: else:
log.warning("Delivery to %s%d: %s", inbox_url, resp.status_code, resp.text[:200]) log.warning("Delivery to %s%d: %s", inbox_url, resp.status_code, resp.text[:200])
return False return resp.status_code
except Exception: except Exception:
log.exception("Delivery failed for %s", inbox_url) log.exception("Delivery failed for %s", inbox_url)
return False return None
async def on_any_activity(activity: APActivity, session: AsyncSession) -> None: async def on_any_activity(activity: APActivity, session: AsyncSession) -> None:
@@ -140,12 +142,34 @@ async def on_any_activity(activity: APActivity, session: AsyncSession) -> None:
log.debug("No followers to deliver to for %s", activity.activity_id) log.debug("No followers to deliver to for %s", activity.activity_id)
return return
# Deduplicate inboxes
all_inboxes = {f.follower_inbox for f in followers if f.follower_inbox}
# Check delivery log — skip inboxes we already delivered to (idempotency)
existing = (
await session.execute(
select(APDeliveryLog.inbox_url).where(
APDeliveryLog.activity_id == activity.id,
APDeliveryLog.status_code < 300,
)
)
).scalars().all()
already_delivered = set(existing)
inboxes = all_inboxes - already_delivered
if not inboxes:
log.info("All %d inbox(es) already delivered for %s", len(all_inboxes), activity.activity_id)
return
if already_delivered:
log.info(
"Skipping %d already-delivered inbox(es), delivering to %d remaining",
len(already_delivered), len(inboxes),
)
# Build activity JSON # Build activity JSON
activity_json = _build_activity_json(activity, actor, domain) activity_json = _build_activity_json(activity, actor, domain)
# Deduplicate inboxes
inboxes = {f.follower_inbox for f in followers if f.follower_inbox}
log.info( log.info(
"Delivering %s to %d inbox(es) for @%s", "Delivering %s to %d inbox(es) for @%s",
activity.activity_type, len(inboxes), actor.preferred_username, activity.activity_type, len(inboxes), actor.preferred_username,
@@ -153,7 +177,17 @@ async def on_any_activity(activity: APActivity, session: AsyncSession) -> None:
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
for inbox_url in inboxes: for inbox_url in inboxes:
await _deliver_to_inbox(client, inbox_url, activity_json, actor, domain) status_code = await _deliver_to_inbox(
client, inbox_url, activity_json, actor, domain
)
# Log successful deliveries for idempotency
if status_code is not None and status_code < 300:
session.add(APDeliveryLog(
activity_id=activity.id,
inbox_url=inbox_url,
status_code=status_code,
))
await session.flush()
# Wildcard: fires for every activity # Wildcard: fires for every activity

View File

@@ -15,10 +15,10 @@ from __future__ import annotations
import asyncio import asyncio
import logging import logging
import traceback import traceback
from datetime import datetime, timezone from datetime import datetime, timedelta, timezone
import asyncpg import asyncpg
from sqlalchemy import select from sqlalchemy import select, update
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from shared.db.session import get_session, DATABASE_URL from shared.db.session import get_session, DATABASE_URL
@@ -37,15 +37,18 @@ class EventProcessor:
app_name: str | None = None, app_name: str | None = None,
poll_interval: float = 2.0, poll_interval: float = 2.0,
batch_size: int = 10, batch_size: int = 10,
stuck_timeout: float = 300.0,
): ):
self._app_name = app_name self._app_name = app_name
self._poll_interval = poll_interval self._poll_interval = poll_interval
self._batch_size = batch_size self._batch_size = batch_size
self._stuck_timeout = stuck_timeout # seconds before "processing" → "pending"
self._task: asyncio.Task | None = None self._task: asyncio.Task | None = None
self._listen_task: asyncio.Task | None = None self._listen_task: asyncio.Task | None = None
self._listen_conn: asyncpg.Connection | None = None self._listen_conn: asyncpg.Connection | None = None
self._wake = asyncio.Event() self._wake = asyncio.Event()
self._running = False self._running = False
self._reap_counter = 0
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Lifecycle # Lifecycle
@@ -119,6 +122,12 @@ class EventProcessor:
async def _poll_loop(self) -> None: async def _poll_loop(self) -> None:
while self._running: while self._running:
try: try:
# Periodically recover stuck activities (~every 30 cycles)
self._reap_counter += 1
if self._reap_counter >= 30:
self._reap_counter = 0
await self._recover_stuck()
# Clear before processing so any NOTIFY that arrives during # Clear before processing so any NOTIFY that arrives during
# _process_batch sets the event and we loop immediately. # _process_batch sets the event and we loop immediately.
self._wake.clear() self._wake.clear()
@@ -137,6 +146,37 @@ class EventProcessor:
traceback.print_exc() traceback.print_exc()
await asyncio.sleep(self._poll_interval) await asyncio.sleep(self._poll_interval)
async def _recover_stuck(self) -> None:
"""Reset activities stuck in 'processing' back to 'pending'.
This handles the case where a process crashed mid-handler.
Combined with idempotent handlers, this gives at-least-once delivery.
"""
cutoff = datetime.now(timezone.utc) - timedelta(seconds=self._stuck_timeout)
try:
async with get_session() as session:
filters = [
APActivity.process_state == "processing",
APActivity.created_at < cutoff,
]
if self._app_name:
filters.append(APActivity.origin_app == self._app_name)
result = await session.execute(
update(APActivity)
.where(*filters)
.values(process_state="pending")
.returning(APActivity.id)
)
recovered = result.scalars().all()
await session.commit()
if recovered:
log.warning(
"Recovered %d stuck activities: %s",
len(recovered), recovered,
)
except Exception:
log.exception("Failed to recover stuck activities")
async def _process_batch(self) -> int: async def _process_batch(self) -> int:
"""Fetch and process a batch of pending activities. Returns count processed.""" """Fetch and process a batch of pending activities. Returns count processed."""
processed = 0 processed = 0

View File

@@ -427,3 +427,27 @@ class APNotification(Base):
Index("ix_ap_notification_read", "actor_profile_id", "read"), Index("ix_ap_notification_read", "actor_profile_id", "read"),
Index("ix_ap_notification_created", "created_at"), Index("ix_ap_notification_created", "created_at"),
) )
class APDeliveryLog(Base):
"""Tracks successful deliveries of activities to remote inboxes.
Used for idempotency: the delivery handler skips inboxes that already
have a success row, so retries after a crash never send duplicates.
"""
__tablename__ = "ap_delivery_log"
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
activity_id: Mapped[int] = mapped_column(
Integer, ForeignKey("ap_activities.id", ondelete="CASCADE"), nullable=False,
)
inbox_url: Mapped[str] = mapped_column(String(512), nullable=False)
status_code: Mapped[int | None] = mapped_column(Integer, nullable=True)
delivered_at: Mapped[datetime] = mapped_column(
DateTime(timezone=True), nullable=False, server_default=func.now(),
)
__table_args__ = (
UniqueConstraint("activity_id", "inbox_url", name="uq_delivery_activity_inbox"),
Index("ix_ap_delivery_activity", "activity_id"),
)