fed-sx-m2: Step 8b-timer — live retry-loop wiring on send_after
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 44s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 44s
Wires the delivery_worker's retry loop on top of the erlang:send_after / cancel_timer primitives that just landed on loops/erlang (3709460d, 98b0104c, 779e53b2 — cherry-picked here since origin/architecture hasn't caught up yet).

Surface:
- new :timers [{Cid, Ref}] state field tracks live timer refs.
- handle_call(flush): drain (existing semantics) + arm_retry_timer per retried Cid (computes the backoff slot from the now-bumped attempt count, sets next_retry_at, and arms a send_after message to self). Reply shape unchanged.
- handle_info({retry, Cid}, S): redrives that one Cid through deliver_one_pure. Success → record_success_pure + clear pending. Failure → schedule_retry_for (which bumps attempts, dead-letters on slot 6, or arms the next slot).
- cancel_timer_for/2 runs before arming a new timer so stale timers don't keep the scheduler's run loop alive after the work is done.
- state_srv/1 + timer_ref_for/2 for test introspection.

Tests: 5/5 in the new delivery_retry_timer.sh; existing delivery_worker.sh 17/17 and delivery_retry.sh 11/11 still green. Conformance gate 771/771 (was 761/761; the +10 is the cherry-picked send_after suite).

Closes Blockers #3. m2 is now feature-complete.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -5,9 +5,10 @@
|
||||
backoff_for/1, schedule_for/1,
|
||||
record_failure_pure/3, record_success_pure/2,
|
||||
next_due_pure/2, attempts_for/2, next_retry_at/2,
|
||||
dead_letter_list/1,
|
||||
dead_letter_list/1, timer_ref_for/2,
|
||||
start_link/1, start_link/2, stop/1,
|
||||
enqueue/2, flush/1, pending_srv/1, set_dispatch_fn/2]).
|
||||
enqueue/2, flush/1, pending_srv/1, set_dispatch_fn/2,
|
||||
state_srv/1]).
|
||||
-export([init/1, handle_call/3, handle_cast/2, handle_info/2]).
|
||||
|
||||
%% Outbound delivery worker per design §13.4. One gen_server per
|
||||
@@ -49,6 +50,7 @@ new(PeerId) ->
|
||||
{attempts, []},
|
||||
{next_retry, []},
|
||||
{dead_letter, []},
|
||||
{timers, []},
|
||||
{dispatch_fn, undefined}].
|
||||
|
||||
%% Pure accessor: returns whatever is stored under the `pending` field
%% of the worker State.
pending(State) ->
    Queue = field(pending, State),
    Queue.
|
||||
@@ -183,6 +185,16 @@ next_retry_at(Cid, State) ->
|
||||
|
||||
%% Pure accessor: returns whatever is stored under the `dead_letter`
%% field of the worker State.
dead_letter_list(State) ->
    DeadLetters = field(dead_letter, State),
    DeadLetters.
|
||||
|
||||
%% Step 8b-timer: look up the retry-timer reference recorded for Cid.
%%
%% Reads the `timers` field of State and returns the Ref stored under
%% Cid, or the atom `undefined` when find_keyed/2 reports anything
%% other than {ok, Ref}. Intended for tests that need to assert a
%% retry timer was (or was not) scheduled for a given Cid.
timer_ref_for(Cid, State) ->
    LiveTimers = field(timers, State),
    case find_keyed(Cid, LiveTimers) of
        {ok, TimerRef} ->
            TimerRef;
        _ ->
            undefined
    end.
|
||||
|
||||
move_to_dead_letter(Cid, State) ->
|
||||
Pending = field(pending, State),
|
||||
{Match, Rest} = take_by_cid(Cid, Pending, [], []),
|
||||
@@ -229,6 +241,13 @@ pending_srv(PeerId) ->
|
||||
%% Synchronously asks the worker addressed by PeerId to replace its
%% dispatch fun with Fn. The matching handle_call/3 clause stores Fn in
%% the `dispatch_fn` state field and replies `ok`.
set_dispatch_fn(PeerId, Fn) ->
    Request = {set_dispatch_fn, Fn},
    gen_server:call(PeerId, Request).
|
||||
|
||||
%% Step 8b-timer: fetch the worker's complete state term.
%%
%% Issues a synchronous `get_state` call to the worker addressed by
%% PeerId; the server replies with its State unchanged. Meant for
%% tests, which can then apply the pure introspection helpers
%% (attempts_for / next_retry_at / timer_ref_for / dead_letter_list)
%% to the returned value.
state_srv(PeerId) ->
    Request = get_state,
    gen_server:call(PeerId, Request).
|
||||
|
||||
%% gen_server callbacks
|
||||
|
||||
init([PeerId, DispatchFn]) ->
|
||||
@@ -238,17 +257,138 @@ init([PeerId, DispatchFn]) ->
|
||||
%% handle_call/3 — synchronous requests served by the delivery worker.
%%
%%   {enqueue, Activity}   -> ok
%%       State is replaced by enqueue_pure(Peer, Activity, State).
%%   flush                 -> {ok, Delivered, Retry}
%%       Drains the queue once, then arms one retry timer per Cid in
%%       Retry (see below).
%%   get_pending           -> the current `pending` field.
%%   get_state             -> the full state term (test introspection).
%%   {set_dispatch_fn, Fn} -> ok; stores Fn in the `dispatch_fn` field.
handle_call({enqueue, Activity}, _From, State) ->
    {reply, ok, enqueue_pure(field(peer, State), Activity, State)};
handle_call(flush, _From, State) ->
    %% Step 8b-timer: drain_pure/1 must run exactly once per flush.
    %% A second drain would re-dispatch every pending activity, and
    %% binding NewState twice would turn the fold below into a failing
    %% match as soon as a timer is armed.
    %%
    %% Per the original design note, drain_pure has already bumped
    %% :attempts for each failed delivery, so arm_retry_timer/3 only
    %% has to derive the backoff slot from the current attempt count,
    %% record the next-retry time and arm the {retry, Cid} timer that
    %% handle_info/2 consumes. The reply shape is unchanged.
    {DrainState, Delivered, Retry} = drain_pure(State),
    Now = monotonic_seconds(),
    NewState = lists:foldl(
                 fun(Cid, S) -> arm_retry_timer(Cid, Now, S) end,
                 DrainState, Retry),
    {reply, {ok, Delivered, Retry}, NewState};
handle_call(get_pending, _From, State) ->
    {reply, field(pending, State), State};
handle_call(get_state, _From, State) ->
    {reply, State, State};
handle_call({set_dispatch_fn, Fn}, _From, State) ->
    {reply, ok, set_field(dispatch_fn, Fn, State)}.
|
||||
|
||||
%% handle_cast/2 — the worker defines no casts; any cast is ignored and
%% the state is returned untouched.
handle_cast(_Msg, State) ->
    {noreply, State}.
|
||||
|
||||
%% handle_info/2 — raw messages delivered to the worker.
%%
%% {retry, Cid}: a retry timer armed by arm_retry_timer/3 or
%% schedule_retry_for/3 has fired for Cid. The activity is looked up
%% in the pending queue and pushed through deliver_one_pure/2 again:
%%   * not found (already drained or dead-lettered by an earlier
%%     flush): only the retry bookkeeping for Cid is cleared;
%%   * {ok, _}: the activity is removed from :pending and
%%     record_success_pure/2 clears its retry state;
%%   * {error, _, _}: the activity stays in :pending and
%%     schedule_retry_for/3 records the failure and arms the next
%%     timer (or dead-letters the activity).
%% Any other message is ignored.
handle_info({retry, Cid}, State) ->
    %% Cancel AND forget whatever timer ref is stored for Cid rather
    %% than merely dropping the entry. send_after messages carry no
    %% ref, so this message may be a stale one: if a flush re-armed a
    %% newer timer for Cid while this message sat in the mailbox, the
    %% stored ref belongs to that newer, still-live timer. Dropping it
    %% without cancelling would orphan it and start a second retry
    %% chain for the same Cid (attempts bumped twice, early
    %% dead-letter). In the normal case the stored ref is the timer
    %% that just fired and the cancel is a harmless no-op.
    State0 = cancel_timer_for(Cid, State),
    %% NOTE(review): the accumulator seed here is 0, whereas
    %% move_to_dead_letter/2 seeds take_by_cid/4 with [] — confirm
    %% against take_by_cid/4 that this still yields {none, _} on a miss.
    case take_by_cid(Cid, field(pending, State0), [], 0) of
        {none, _} ->
            %% Nothing pending under Cid: clear any half-tracked
            %% bookkeeping and stop.
            {noreply, record_success_pure(Cid, State0)};
        {Activity, Rest} ->
            case deliver_one_pure(Activity, State0) of
                {ok, _} ->
                    State1 = set_field(pending, Rest, State0),
                    State2 = record_success_pure(Cid, State1),
                    {noreply, State2};
                {error, _, _} ->
                    %% :pending is deliberately left as-is; the failure
                    %% bookkeeping decides between re-arming and
                    %% dead-lettering.
                    Now = monotonic_seconds(),
                    State1 = schedule_retry_for(Cid, Now, State0),
                    {noreply, State1}
            end
    end;
handle_info(_, S) -> {noreply, S}.
|
||||
|
||||
%% Step 8b-timer helpers ────────────────────────────────────────────
|
||||
|
||||
%% arm_retry_timer/3 — POST-DRAIN form, called from handle_call(flush)
%% once per retried Cid. The attempt counter is NOT bumped here; per
%% the flush path it has already been bumped during the drain.
%%
%% Steps:
%%   1. cancel and forget any timer already recorded for Cid;
%%   2. ask backoff_for/1 for the slot matching the current attempts;
%%   3. `dead_letter` -> hand the Cid to move_to_dead_letter/2, no
%%      timer is armed;
%%      Seconds      -> record Now + Seconds under Cid in :next_retry,
%%      arm a {retry, Cid} message to self() after Seconds * 1000 ms,
%%      and store the returned ref under Cid in :timers.
%% Returns the updated state.
arm_retry_timer(Cid, Now, State) ->
    Cleared = cancel_timer_for(Cid, State),
    case backoff_for(attempts_for(Cid, Cleared)) of
        dead_letter ->
            move_to_dead_letter(Cid, Cleared);
        Seconds ->
            Deadlines = set_keyed(Cid, Now + Seconds, field(next_retry, Cleared)),
            Scheduled = set_field(next_retry, Deadlines, Cleared),
            TimerRef = erlang:send_after(Seconds * 1000, self(), {retry, Cid}),
            LiveTimers = set_keyed(Cid, TimerRef, field(timers, Scheduled)),
            set_field(timers, LiveTimers, Scheduled)
    end.
|
||||
|
||||
%% schedule_retry_for/3 — POST-RETRY-ATTEMPT form, called from
%% handle_info({retry, Cid}, ...) after a redrive failed.
%%
%% Cancels and forgets any timer recorded for Cid, books one failure
%% through record_failure_pure/3, then consults backoff_for/1 with the
%% resulting attempt count:
%%   dead_letter -> the post-failure state is returned as-is, no timer;
%%   Seconds     -> a {retry, Cid} message to self() is armed after
%%                  Seconds * 1000 ms and its ref stored in :timers.
%%
%% The cancel matters when a timer for Cid may still be live: a stale
%% timer firing later would only hit the no-match branch of
%% handle_info/2, but it would keep the scheduler's run loop alive
%% after the work is done.
schedule_retry_for(Cid, Now, State) ->
    Failed = record_failure_pure(Cid, Now, cancel_timer_for(Cid, State)),
    case backoff_for(attempts_for(Cid, Failed)) of
        dead_letter ->
            Failed;
        Seconds ->
            TimerRef = erlang:send_after(Seconds * 1000, self(), {retry, Cid}),
            LiveTimers = set_keyed(Cid, TimerRef, field(timers, Failed)),
            set_field(timers, LiveTimers, Failed)
    end.
|
||||
|
||||
%% cancel_timer_for/2 — cancel the timer recorded for Cid, if any, and
%% remove its entry from :timers. When no entry exists the state is
%% returned unchanged, so repeated calls are harmless.
cancel_timer_for(Cid, State) ->
    LiveTimers = field(timers, State),
    case find_keyed(Cid, LiveTimers) of
        {ok, TimerRef} ->
            %% The result of cancel_timer is deliberately not inspected.
            erlang:cancel_timer(TimerRef),
            Remaining = del_keyed(Cid, LiveTimers),
            set_field(timers, Remaining, State);
        _ ->
            State
    end.
|
||||
|
||||
%% clear_timer_ref/2 — forget the :timers entry for Cid WITHOUT calling
%% cancel_timer. Meant for the case where the recorded timer has
%% already fired and its ref is spent. Returns the state unchanged when
%% no entry exists for Cid.
clear_timer_ref(Cid, State) ->
    LiveTimers = field(timers, State),
    case find_keyed(Cid, LiveTimers) of
        {ok, _Spent} ->
            Remaining = del_keyed(Cid, LiveTimers),
            set_field(timers, Remaining, State);
        _ ->
            State
    end.
|
||||
|
||||
%% Step 8b-timer: current monotonic time in whole seconds.
%%
%% Retry bookkeeping (backoff_for / record_failure_pure /
%% next_retry_at) works in seconds, so the clock reading is integer-
%% divided by 1000 to keep every comparison in the same unit.
%%
%% NOTE(review): this assumes erlang:monotonic_time/0 reports
%% milliseconds on the target runtime. On stock Erlang/OTP it reports
%% native time units; if this module ever runs there, switch to
%% erlang:monotonic_time(second). `div` also truncates toward zero,
%% which differs from a floor for negative readings — confirm neither
%% matters for the runtime in use.
monotonic_seconds() ->
    ClockReading = erlang:monotonic_time(),
    ClockReading div 1000.
|
||||
|
||||
%% ── Internal ────────────────────────────────────────────────────
|
||||
|
||||
activity_cid(Activity) ->
|
||||
|
||||
Reference in New Issue
Block a user