Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 44s
Wires the delivery_worker's retry loop on top of the erlang:send_after / cancel_timer primitives just landed on loops/erlang (3709460d, 98b0104c, 779e53b2 — cherry-picked here since origin/architecture hasn't caught up yet).

Surface:
- new :timers [{Cid, Ref}] state field tracks live timer refs
- handle_call(flush): drain (existing semantics) + arm_retry_timer per retried Cid (computes backoff slot from the now-bumped attempt count, sets next_retry_at, send_after self-cast). Reply shape unchanged.
- handle_info({retry, Cid}, S): redrives that one Cid through deliver_one_pure. Success → record_success_pure + clear pending. Failure → schedule_retry_for (which bumps attempts, dead-letters on slot 6, or arms next slot).
- cancel_timer_for/2 before arming a new timer so stale timers don't keep the scheduler's run loop alive after the work is done.
- state_srv/1 + timer_ref_for/2 for test introspection.

5/5 in new delivery_retry_timer.sh; existing delivery_worker.sh 17/17 and delivery_retry.sh 11/11 still green. Conformance gate 771/771 (was 761/761; the +10 is the cherry-picked send_after suite).

Closes Blockers #3. m2 is now feature-complete.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
132 lines
6.4 KiB
Bash
Executable File
132 lines
6.4 KiB
Bash
Executable File
#!/usr/bin/env bash
# next/tests/delivery_retry_timer.sh — m2 Step 8b-timer.
#
# Live timer wiring on the delivery_worker gen_server. The pure
# bookkeeping is covered by delivery_retry.sh — this suite proves the
# erlang:send_after / cancel_timer wiring fires retries from the
# scheduler's logical clock without anyone calling drain by hand.
#
# Substrate dependency: erlang:send_after/3 + cancel_timer/1 +
# monotonic_time/0,1 — landed via cherry-pick from loops/erlang
# (commits 3709460d / 98b0104c / 779e53b2 on this branch).
#
# Test discipline: every test cancels its leftover timer before
# returning. If we don't, the scheduler keeps the run loop alive
# advancing time through the full backoff chain (30s → 5m → 30m →
# 6h → 24h), and each tick costs ~10s of wall time inside the
# Erlang-on-SX VM. Canceling the trailing timer is the difference
# between a 25s test and a 60s+ test.
#
# Usage: delivery_retry_timer.sh [-v]
#   -v  print one line per passing check and, on failure, the tail of
#       the raw sx_server output.

# -e is not enabled: individual check failures are tallied and reported
# at the end of the script rather than aborting the run.
set -uo pipefail

# Run from the repository root so the relative lib/ and next/ paths
# used below resolve. Fail loudly if we are not inside a git work tree
# instead of continuing from whatever directory we happen to be in.
REPO_ROOT=$(git rev-parse --show-toplevel) || {
  echo "ERROR: not inside a git work tree." >&2
  exit 1
}
cd "$REPO_ROOT" || exit 1

# Locate the sx_server binary: honour $SX_SERVER if set, otherwise the
# in-tree build output, then fall back to a fixed absolute path.
SX_SERVER="${SX_SERVER:-hosts/ocaml/_build/default/bin/sx_server.exe}"
if [ ! -x "$SX_SERVER" ]; then
  SX_SERVER="/root/rose-ash/hosts/ocaml/_build/default/bin/sx_server.exe"
fi
if [ ! -x "$SX_SERVER" ]; then
  echo "ERROR: sx_server.exe not found." >&2
  exit 1
fi

VERBOSE="${1:-}"
PASS=0
FAIL=0
ERRORS=""

# Scratch file holding the epoch script fed to sx_server. The trap body
# is single-quoted so $TMPFILE is expanded (and quoted) when the trap
# runs, not word-split when the trap is installed.
TMPFILE=$(mktemp)
trap 'rm -f "$TMPFILE"' EXIT

# A canned activity with cid <<1,2,3>>, plus a dispatch fun that always
# fails. Erlang source fragment; ends with a comma so it can be spliced
# in front of a test body.
SETUP='Act = [{id, <<1,2,3>>}, {type, note}, {actor, alice}], FailFn = fun(_) -> {error, transient} end,'

# Convenience: cancel any leftover timer for cid <<1,2,3>> on Peer.
# Prevents the scheduler from grinding through 30s/5m/30m/6h/24h of
# retries between epochs.
CANCEL='CancelLeftover = fun(Peer) -> SS = delivery_worker:state_srv(Peer), case delivery_worker:timer_ref_for(<<1,2,3>>, SS) of undefined -> ok; LRef -> erlang:cancel_timer(LRef), ok end end,'

# Write the epoch script. The heredoc delimiter is intentionally
# unquoted so ${SETUP} and ${CANCEL} are expanded by the shell; the \"
# sequences are not special in a heredoc and reach sx_server verbatim.
# Lines starting with ;; are part of the script text, not shell comments.
cat > "$TMPFILE" <<EPOCHS
(epoch 1)
(load "lib/erlang/tokenizer.sx")
(load "lib/erlang/parser.sx")
(load "lib/erlang/parser-core.sx")
(load "lib/erlang/parser-expr.sx")
(load "lib/erlang/parser-module.sx")
(load "lib/erlang/transpile.sx")
(load "lib/erlang/runtime.sx")
(load "lib/erlang/vm/dispatcher.sx")
(epoch 2)
(eval "(er-load-gen-server!)")
(eval "(get (erlang-load-module (file-read \"next/kernel/envelope.erl\")) :name)")
(eval "(get (erlang-load-module (file-read \"next/kernel/delivery_worker.erl\")) :name)")

;; T1 — a failing flush schedules a retry timer. timer_ref_for
;; returns a live Ref (not undefined). Then cancel before
;; returning so the scheduler doesn't grind the full backoff
;; chain trying to retry.
(epoch 10)
(eval "(get (erlang-eval-ast \"${SETUP}${CANCEL} delivery_worker:start_link(bob, FailFn), delivery_worker:enqueue(bob, Act), {ok, [], [<<1,2,3>>]} = delivery_worker:flush(bob), S = delivery_worker:state_srv(bob), Ref = delivery_worker:timer_ref_for(<<1,2,3>>, S), Result = is_reference(Ref), CancelLeftover(bob), Result\") :name)")

;; T2 — initial flush bumps the attempt counter to 1; next_retry_at
;; gets set; cancel the timer before returning.
(epoch 11)
(eval "(get (erlang-eval-ast \"${SETUP}${CANCEL} delivery_worker:start_link(bob, FailFn), delivery_worker:enqueue(bob, Act), delivery_worker:flush(bob), S = delivery_worker:state_srv(bob), Result = delivery_worker:attempts_for(<<1,2,3>>, S) =:= 1, CancelLeftover(bob), Result\") :name)")

;; T3 — advancing the logical clock past the 30s backoff fires the
;; timer; handle_info({retry, Cid}) bumps attempts to 2 and arms
;; the next slot (backoff(2)=300s). Then cancel the new timer.
(epoch 12)
(eval "(get (erlang-eval-ast \"${SETUP}${CANCEL} delivery_worker:start_link(bob, FailFn), delivery_worker:enqueue(bob, Act), delivery_worker:flush(bob), receive after 31000 -> ok end, S = delivery_worker:state_srv(bob), Result = delivery_worker:attempts_for(<<1,2,3>>, S) =:= 2, CancelLeftover(bob), Result\") :name)")

;; T4 — after the retry fires the worker has armed a fresh timer
;; for the next backoff slot. Confirm it's a live ref, then
;; cancel it.
(epoch 13)
(eval "(get (erlang-eval-ast \"${SETUP}${CANCEL} delivery_worker:start_link(bob, FailFn), delivery_worker:enqueue(bob, Act), delivery_worker:flush(bob), receive after 31000 -> ok end, S = delivery_worker:state_srv(bob), Result = is_reference(delivery_worker:timer_ref_for(<<1,2,3>>, S)), CancelLeftover(bob), Result\") :name)")

;; T5 — successful retry path. Dispatch fails twice then succeeds
;; (ets-backed counter). After two backoff slots elapse
;; (30s, then 300s), the third attempt succeeds and
;; record_success_pure clears the per-cid bookkeeping. No new
;; timer is scheduled, so the scheduler terminates naturally.
(epoch 14)
(eval "(get (erlang-eval-ast \"${SETUP} ets:new(rt_ctr, [named_table, public]), ets:insert(rt_ctr, {n, 0}), Mixed = fun(_) -> [{n, N}] = ets:lookup(rt_ctr, n), ets:insert(rt_ctr, {n, N+1}), case N < 2 of true -> {error, transient}; false -> ok end end, delivery_worker:start_link(carol, Mixed), delivery_worker:enqueue(carol, Act), delivery_worker:flush(carol), receive after 31000 -> ok end, receive after 301000 -> ok end, S = delivery_worker:state_srv(carol), delivery_worker:pending(S) =:= [] andalso delivery_worker:attempts_for(<<1,2,3>>, S) =:= 0 andalso delivery_worker:timer_ref_for(<<1,2,3>>, S) =:= undefined\") :name)")
EPOCHS

# Feed the epoch script to sx_server and capture its stdout; stderr is
# discarded. The whole run is capped at 900 seconds.
OUTPUT=$(timeout 900 "$SX_SERVER" < "$TMPFILE" 2>/dev/null)
SX_RC=$?
# timeout(1) exits with 124 when it had to kill the command. Say so
# explicitly, otherwise the only symptom is "<no output for epoch N>"
# on whichever epochs never got to run.
if [ "$SX_RC" -eq 124 ]; then
  echo "WARNING: sx_server timed out after 900s; later epochs may have no output." >&2
fi

# check EPOCH DESC EXPECTED
#
# Look up the sx_server response for EPOCH in $OUTPUT and record a pass
# or a failure.
#
#   EPOCH     epoch number whose response line is examined
#   DESC      human-readable test name used in reports
#   EXPECTED  fixed string that must occur in the response
#
# Response selection (first match in $OUTPUT wins):
#   "(ok-len EPOCH ..." -> the line that follows it
#   "(ok EPOCH ..."     -> that line
#   "(error EPOCH ..."  -> that line
# If nothing matches, a "<no output for epoch N>" placeholder is used.
#
# Updates the globals PASS, FAIL and ERRORS; reads OUTPUT and VERBOSE.
check() {
  local epoch="$1" desc="$2" expected="$3"
  local actual matched

  # Here-strings instead of `echo ... |` so data that looks like an echo
  # option is never misread and no pipeline status is involved.
  actual=$(awk -v e="$epoch" '
    $0 ~ "^\\(ok-len " e " " { getline; print; exit }
    $0 ~ "^\\(ok " e " "     { print; exit }
    $0 ~ "^\\(error " e " "  { print; exit }
  ' <<< "$OUTPUT")
  [ -z "$actual" ] && actual="<no output for epoch $epoch>"

  case "$actual" in
    "(error "*)
      # An evaluation error is never a pass, even if its message happens
      # to contain the expected text (e.g. a badmatch on `true`).
      matched=1
      ;;
    *)
      grep -qF -- "$expected" <<< "$actual"
      matched=$?
      ;;
  esac

  if [ "$matched" -eq 0 ]; then
    PASS=$((PASS + 1))
    if [ "$VERBOSE" = "-v" ]; then
      echo " ok $desc"
    fi
  else
    FAIL=$((FAIL + 1))
    ERRORS+=" FAIL [$desc] (epoch $epoch) expected: $expected | actual: $actual"$'\n'
  fi
}

# One check per epoch written to the epoch script; each test evaluates
# to the atom `true` on success.
check 10 "T1 flush schedules a timer" "true"
check 11 "T2 initial flush bumps attempts to 1" "true"
check 12 "T3 timer fires; attempts=2" "true"
check 13 "T4 retry rearms next timer" "true"
check 14 "T5 success clears retry state" "true"

# Summary: a single line on success; on failure the collected error
# list, plus the tail of the raw sx_server output when run with -v.
TOTAL=$((PASS + FAIL))
if [ "$FAIL" -eq 0 ]; then
  echo "ok $PASS/$TOTAL next/tests/delivery_retry_timer.sh passed"
else
  echo "FAIL $PASS/$TOTAL passed, $FAIL failed:"
  echo "$ERRORS"
  if [ "$VERBOSE" = "-v" ]; then
    echo "--- sx_server output ---"
    printf '%s\n' "$OUTPUT" | tail -n 40
    echo "---"
  fi
fi

# The script's exit status is 0 only when no check failed.
[ "$FAIL" -eq 0 ]