artdag: fault-tolerant execution — confined failure, cache never poisoned + 14 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m4s

fault.sx run-safe: a node op may return (artdag/fail reason); failure is confined
to that node + downstream dependents while independent branches compute, and failed
results are never cached, so retry after a fix recomputes only the failed closure
and hits the good nodes. fault 14/14, total 158/158.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-07 12:32:14 +00:00
parent f29d8c047b
commit 28fed7c799
6 changed files with 216 additions and 6 deletions

View File

@@ -13,7 +13,7 @@ if [ ! -x "$SX_SERVER" ]; then
exit 1
fi
SUITES=(dag analyze plan execute optimize fed cost serialize stats)
SUITES=(dag analyze plan execute optimize fed cost serialize stats fault)
OUT_JSON="lib/artdag/scoreboard.json"
OUT_MD="lib/artdag/scoreboard.md"
@@ -50,6 +50,7 @@ run_suite() {
(load "lib/artdag/cost.sx")
(load "lib/artdag/serialize.sx")
(load "lib/artdag/stats.sx")
(load "lib/artdag/fault.sx")
(epoch 2)
(eval "(define artdag-test-pass 0)")
(eval "(define artdag-test-fail 0)")

56
lib/artdag/fault.sx Normal file
View File

@@ -0,0 +1,56 @@
; lib/artdag/fault.sx — fault-tolerant execution. A node op may fail by returning
; (artdag/fail reason); the failure is confined to that node and its transitive
; dependents (which cannot run without it), while independent branches still
; compute. Failed results are NEVER cached, so a later run with the fault fixed
; recomputes only the failed closure. Depends on execute.sx and plan.sx.
(define artdag/fail (fn (reason) {:artdag-fail true :reason reason}))
(define artdag/failed? (fn (v) (and (dict? v) (has-key? v :artdag-fail))))
(define
artdag/-exec-safe-node
(fn
(dag runner cache acc id)
(let
((node (artdag/dag-get dag id)))
(let
((ins (artdag/node-inputs node)))
(if
(some (fn (in) (artdag/member? in (get acc :failed))) ins)
(assoc acc :failed (concat (get acc :failed) (list id)))
(if
(persist/kv-has? cache id)
(assoc
acc
:results (assoc (get acc :results) id (persist/kv-get cache id))
:hits (concat (get acc :hits) (list id)))
(let
((inputs (map (fn (in) (artdag/-input-result (get acc :results) cache in)) ins)))
(let
((result (runner (artdag/node-op node) (artdag/node-params node) inputs)))
(if
(artdag/failed? result)
(assoc acc :failed (concat (get acc :failed) (list id)))
(begin
(persist/kv-put cache id result)
(assoc
acc
:results (assoc (get acc :results) id result)
:recomputed (concat (get acc :recomputed) (list id)))))))))))))
(define
artdag/run-safe
(fn
(dag runner cache)
(reduce
(fn (acc id) (artdag/-exec-safe-node dag runner cache acc id))
{:recomputed (list) :results {} :hits (list) :failed (list)}
(artdag/plan-flatten (artdag/plan dag 0)))))
(define
artdag/failed-nodes
(fn (exec) (artdag/sort-strings (get exec :failed))))
(define artdag/failure-count (fn (exec) (len (get exec :failed))))
(define
artdag/all-ok?
(fn (exec) (= (len (get exec :failed)) 0)))

View File

@@ -8,9 +8,10 @@
"fed": {"pass": 15, "fail": 0},
"cost": {"pass": 13, "fail": 0},
"serialize": {"pass": 13, "fail": 0},
"stats": {"pass": 12, "fail": 0}
"stats": {"pass": 12, "fail": 0},
"fault": {"pass": 14, "fail": 0}
},
"total_pass": 144,
"total_pass": 158,
"total_fail": 0,
"total": 144
"total": 158
}

View File

@@ -13,4 +13,5 @@ _Generated by `lib/artdag/conformance.sh`_
| cost | 13 | 0 | 13 |
| serialize | 13 | 0 | 13 |
| stats | 12 | 0 | 12 |
| **Total** | **144** | **0** | **144** |
| fault | 14 | 0 | 14 |
| **Total** | **158** | **0** | **158** |

144
lib/artdag/tests/fault.sx Normal file
View File

@@ -0,0 +1,144 @@
; fault-tolerant execution: failure confined to its closure, cache never poisoned.
(define ft-BAD (artdag/op-table-runner {:boom (fn (p i) (artdag/fail "kaboom")) :in (fn (p i) (get p :v)) :add (fn (p i) (+ (nth i 0) (nth i 1))) :inc (fn (p i) (+ 1 (first i)))}))
(define ft-GOOD (artdag/op-table-runner {:boom (fn (p i) 99) :in (fn (p i) (get p :v)) :add (fn (p i) (+ (nth i 0) (nth i 1))) :inc (fn (p i) (+ 1 (first i)))}))
; p,q leaves; b=inc(p) (independent); c=boom(q); d=add(b,c)
(define
ft-D
(artdag/build
(list
(list "p" "in" (list) {:v 10})
(list "q" "in" (list) {:v 20})
(list "b" "inc" (list "p") {})
(list "c" "boom" (list "q") {})
(list "d" "add" (list "b" "c") {} true))))
; ---- markers ----
(artdag-test
"fail constructor is detected"
(artdag/failed? (artdag/fail "x"))
true)
(artdag-test
"plain values are not failures"
(artdag/failed? 42)
false)
; ---- failure confinement ----
(artdag-test
"failure count covers node and its dependents"
(let
((cache (persist/open)))
(artdag/failure-count (artdag/run-safe ft-D ft-BAD cache)))
2)
(artdag-test
"failed set is exactly c and d"
(let
((cache (persist/open)))
(artdag/failed-nodes (artdag/run-safe ft-D ft-BAD cache)))
(artdag/sort-strings
(list (artdag/dag-id ft-D "c") (artdag/dag-id ft-D "d"))))
(artdag-test
"independent branch still computes"
(let
((cache (persist/open)))
(artdag/recompute-count (artdag/run-safe ft-D ft-BAD cache)))
3)
(artdag-test
"independent node result is available"
(let
((cache (persist/open)))
(artdag/result-of
(artdag/run-safe ft-D ft-BAD cache)
(artdag/dag-id ft-D "b")))
11)
(artdag-test
"all-ok? is false when something failed"
(let
((cache (persist/open)))
(artdag/all-ok? (artdag/run-safe ft-D ft-BAD cache)))
false)
(artdag-test
"all-ok? is true on a clean run"
(let
((cache (persist/open)))
(artdag/all-ok? (artdag/run-safe ft-D ft-GOOD cache)))
true)
; ---- cache integrity ----
(artdag-test
"good node is cached"
(let
((cache (persist/open)))
(begin
(artdag/run-safe ft-D ft-BAD cache)
(persist/kv-has? cache (artdag/dag-id ft-D "b"))))
true)
(artdag-test
"failed node is never cached"
(let
((cache (persist/open)))
(begin
(artdag/run-safe ft-D ft-BAD cache)
(persist/kv-has? cache (artdag/dag-id ft-D "c"))))
false)
; ---- retry after fix ----
(artdag-test
"retry recomputes only the failed closure"
(let
((cache (persist/open)))
(begin
(artdag/run-safe ft-D ft-BAD cache)
(artdag/recompute-count (artdag/run-safe ft-D ft-GOOD cache))))
2)
(artdag-test
"retry reuses the good nodes from cache"
(let
((cache (persist/open)))
(begin
(artdag/run-safe ft-D ft-BAD cache)
(artdag/hit-count (artdag/run-safe ft-D ft-GOOD cache))))
3)
(artdag-test
"retry produces the correct result"
(let
((cache (persist/open)))
(begin
(artdag/run-safe ft-D ft-BAD cache)
(artdag/result-of
(artdag/run-safe ft-D ft-GOOD cache)
(artdag/dag-id ft-D "d"))))
110)
; ---- transitive cascade ----
(artdag-test
"failure cascades through a deep chain"
(let
((cache (persist/open)))
(artdag/failure-count
(artdag/run-safe
(artdag/build
(list
(list "a" "in" (list) {:v 1})
(list "b" "boom" (list "a") {})
(list "c" "inc" (list "b") {})
(list "d" "inc" (list "c") {})))
ft-BAD
cache)))
3)

View File

@@ -30,7 +30,7 @@ edges.
## Status (rolling)
`bash lib/artdag/conformance.sh`**144/144** (9 suites: dag, analyze, plan, execute, optimize, fed, cost, serialize, stats)
`bash lib/artdag/conformance.sh`**158/158** (10 suites: dag, analyze, plan, execute, optimize, fed, cost, serialize, stats, fault)
Base roadmap (Phases 16) COMPLETE. Now extending.
@@ -138,6 +138,13 @@ lib/artdag/optimize.sx lib/artdag/federation.sx
## Progress log
- **Ext: fault-tolerant execution** (fault suite 14/14, total 158/158).
`lib/artdag/fault.sx`: a node op may fail via `(artdag/fail reason)`; `run-safe`
confines the failure to that node + its transitive dependents (independent branches
still compute) and NEVER caches a failed result, so a later run with the fault fixed
recomputes only the failed closure and cache-hits the good nodes. `failed?`/`fail`
markers, `failed-nodes`/`failure-count`/`all-ok?`.
- **Ext: execution stats / cache analytics** (stats suite 12/12, total 144/144).
`lib/artdag/stats.sx` over an exec record: `hit-ratio`, `work-recomputed`/`work-saved`
(cost-weighted via the cost model), `savings-ratio`, and `exec-summary`. Cold run =