artdag: fault-tolerant execution — confined failure, cache never poisoned + 14 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m4s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m4s
fault.sx `run-safe`: a node op may return `(artdag/fail reason)`. The failure is confined to that node and its downstream dependents while independent branches still compute. Failed results are never cached, so a retry after a fix recomputes only the failed closure and cache-hits the good nodes. fault 14/14, total 158/158.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,7 @@ if [ ! -x "$SX_SERVER" ]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
SUITES=(dag analyze plan execute optimize fed cost serialize stats)
|
SUITES=(dag analyze plan execute optimize fed cost serialize stats fault)
|
||||||
|
|
||||||
OUT_JSON="lib/artdag/scoreboard.json"
|
OUT_JSON="lib/artdag/scoreboard.json"
|
||||||
OUT_MD="lib/artdag/scoreboard.md"
|
OUT_MD="lib/artdag/scoreboard.md"
|
||||||
@@ -50,6 +50,7 @@ run_suite() {
|
|||||||
(load "lib/artdag/cost.sx")
|
(load "lib/artdag/cost.sx")
|
||||||
(load "lib/artdag/serialize.sx")
|
(load "lib/artdag/serialize.sx")
|
||||||
(load "lib/artdag/stats.sx")
|
(load "lib/artdag/stats.sx")
|
||||||
|
(load "lib/artdag/fault.sx")
|
||||||
(epoch 2)
|
(epoch 2)
|
||||||
(eval "(define artdag-test-pass 0)")
|
(eval "(define artdag-test-pass 0)")
|
||||||
(eval "(define artdag-test-fail 0)")
|
(eval "(define artdag-test-fail 0)")
|
||||||
|
|||||||
56
lib/artdag/fault.sx
Normal file
56
lib/artdag/fault.sx
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
; lib/artdag/fault.sx — fault-tolerant execution. A node op may fail by returning
|
||||||
|
; (artdag/fail reason); the failure is confined to that node and its transitive
|
||||||
|
; dependents (which cannot run without it), while independent branches still
|
||||||
|
; compute. Failed results are NEVER cached, so a later run with the fault fixed
|
||||||
|
; recomputes only the failed closure. Depends on execute.sx and plan.sx.
|
||||||
|
|
||||||
|
; Build a failure marker: a dict tagged with :artdag-fail true that carries the
; caller-supplied `reason` under :reason. A node op returns this value to signal
; that it failed.
(define artdag/fail (fn (reason) (assoc {:artdag-fail true} :reason reason)))
|
||||||
|
; Predicate for failure markers: v counts as failed when it is a dict for which
; has-key? reports the :artdag-fail key. Non-dict values are never failures.
; Only the key is tested; the value stored under it is not inspected.
(define artdag/failed? (fn (v) (if (dict? v) (has-key? v :artdag-fail) false)))
|
||||||
|
|
||||||
|
; Private helper: return `acc` with `id` appended to its :failed list.
(define
  artdag/-mark-failed
  (fn (acc id) (assoc acc :failed (concat (get acc :failed) (list id)))))

; Process one node `id` of `dag` and return the updated exec accumulator `acc`
; (a dict with :results, :hits, :recomputed and :failed). Exactly one of four
; outcomes applies, checked in this order:
;   1. some input of the node is already in :failed -> the node is appended to
;      :failed without consulting the cache or calling the runner;
;   2. the cache has an entry for `id`              -> the cached value goes into
;      :results and `id` is appended to :hits;
;   3. the runner returns a failure marker          -> `id` is appended to
;      :failed and nothing is written to the cache;
;   4. otherwise                                    -> the result is stored in
;      the cache and in :results, and `id` is appended to :recomputed.
; `runner` is called as (runner op params inputs), where inputs are resolved per
; input id through artdag/-input-result from :results and the cache.
; NOTE(review): a cache hit (case 2) is accepted without an artdag/failed? check.
; That is safe while every writer of this cache skips failure markers, as this
; function does; confirm that no other executor shares the cache and stores them.
(define
  artdag/-exec-safe-node
  (fn
    (dag runner cache acc id)
    (let
      ((node (artdag/dag-get dag id)))
      (let
        ((ins (artdag/node-inputs node)))
        (if
          (some (fn (in) (artdag/member? in (get acc :failed))) ins)
          (artdag/-mark-failed acc id)
          (if
            (persist/kv-has? cache id)
            (assoc
              acc
              :results (assoc (get acc :results) id (persist/kv-get cache id))
              :hits (concat (get acc :hits) (list id)))
            (let
              ((inputs (map (fn (in) (artdag/-input-result (get acc :results) cache in)) ins)))
              (let
                ((result (runner (artdag/node-op node) (artdag/node-params node) inputs)))
                (if
                  (artdag/failed? result)
                  (artdag/-mark-failed acc id)
                  (begin
                    (persist/kv-put cache id result)
                    (assoc
                      acc
                      :results (assoc (get acc :results) id result)
                      :recomputed (concat (get acc :recomputed) (list id)))))))))))))
|
||||||
|
|
||||||
|
; Fault-tolerant run of `dag`. Node ids are taken in the order produced by
; (artdag/plan-flatten (artdag/plan dag 0)) and folded through
; artdag/-exec-safe-node, starting from an empty exec record. Returns that record:
; a dict with :recomputed, :results, :hits and :failed.
; The confinement logic relies on a node's inputs being visited before the node
; itself -- presumably guaranteed by the plan order; confirm against plan.sx.
(define
  artdag/run-safe
  (fn
    (dag runner cache)
    (let
      ((order (artdag/plan-flatten (artdag/plan dag 0))))
      (let
        ((step (fn (acc id) (artdag/-exec-safe-node dag runner cache acc id))))
        (reduce
          step
          {:recomputed (list) :results {} :hits (list) :failed (list)}
          order)))))
|
||||||
|
|
||||||
|
; The ids recorded under :failed in the exec record `exec`, passed through
; artdag/sort-strings so the result does not depend on execution order.
(define
  artdag/failed-nodes
  (fn
    (exec)
    (let ((failed-ids (get exec :failed))) (artdag/sort-strings failed-ids))))
|
||||||
|
; Number of ids recorded under :failed in the exec record `exec`.
(define artdag/failure-count (fn (exec) (let ((failed-ids (get exec :failed))) (len failed-ids))))
|
||||||
|
; True when the exec record `exec` has an empty :failed list.
(define
  artdag/all-ok?
  (fn (exec) (let ((failed-ids (get exec :failed))) (= 0 (len failed-ids)))))
|
||||||
@@ -8,9 +8,10 @@
|
|||||||
"fed": {"pass": 15, "fail": 0},
|
"fed": {"pass": 15, "fail": 0},
|
||||||
"cost": {"pass": 13, "fail": 0},
|
"cost": {"pass": 13, "fail": 0},
|
||||||
"serialize": {"pass": 13, "fail": 0},
|
"serialize": {"pass": 13, "fail": 0},
|
||||||
"stats": {"pass": 12, "fail": 0}
|
"stats": {"pass": 12, "fail": 0},
|
||||||
|
"fault": {"pass": 14, "fail": 0}
|
||||||
},
|
},
|
||||||
"total_pass": 144,
|
"total_pass": 158,
|
||||||
"total_fail": 0,
|
"total_fail": 0,
|
||||||
"total": 144
|
"total": 158
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -13,4 +13,5 @@ _Generated by `lib/artdag/conformance.sh`_
|
|||||||
| cost | 13 | 0 | 13 |
|
| cost | 13 | 0 | 13 |
|
||||||
| serialize | 13 | 0 | 13 |
|
| serialize | 13 | 0 | 13 |
|
||||||
| stats | 12 | 0 | 12 |
|
| stats | 12 | 0 | 12 |
|
||||||
| **Total** | **144** | **0** | **144** |
|
| fault | 14 | 0 | 14 |
|
||||||
|
| **Total** | **158** | **0** | **158** |
|
||||||
|
|||||||
144
lib/artdag/tests/fault.sx
Normal file
144
lib/artdag/tests/fault.sx
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
; fault-tolerant execution: failure confined to its closure, cache never poisoned.
|
||||||
|
|
||||||
|
; Faulty runner: built by artdag/op-table-runner from an op table whose :boom op
; returns (artdag/fail "kaboom"). The other ops: :in returns the :v entry of its
; first argument, :add sums elements 0 and 1 of its second argument, :inc adds 1
; to the first element of its second argument. (The two arguments are presumably
; the node's params and its input values -- confirm against op-table-runner.)
(define ft-BAD (artdag/op-table-runner {:boom (fn (p i) (artdag/fail "kaboom")) :in (fn (p i) (get p :v)) :add (fn (p i) (+ (nth i 0) (nth i 1))) :inc (fn (p i) (+ 1 (first i)))}))
|
||||||
|
|
||||||
|
; Fixed runner: same op table as ft-BAD except that :boom returns the plain value
; 99 instead of a failure marker. Used to model "the fault has been fixed".
(define ft-GOOD (artdag/op-table-runner {:boom (fn (p i) 99) :in (fn (p i) (get p :v)) :add (fn (p i) (+ (nth i 0) (nth i 1))) :inc (fn (p i) (+ 1 (first i)))}))
|
||||||
|
|
||||||
|
; Fixture DAG shared by the tests below.
;   p, q : leaves ("in" with :v 10 and :v 20)
;   b    : inc(p)    -- does not depend on the faulty node
;   c    : boom(q)   -- the node that fails under ft-BAD
;   d    : add(b, c) -- depends on c, so it fails whenever c does
; The trailing `true` on d is passed through to artdag/build unchanged
; (presumably an output flag -- confirm against artdag/build).
(define
  ft-D
  (let
    ((nodes
       (list
         (list "p" "in" (list) {:v 10})
         (list "q" "in" (list) {:v 20})
         (list "b" "inc" (list "p") {})
         (list "c" "boom" (list "q") {})
         (list "d" "add" (list "b" "c") {} true))))
    (artdag/build nodes)))
|
||||||
|
|
||||||
|
; ---- markers ----
|
||||||
|
|
||||||
|
; A value produced by artdag/fail is recognised by artdag/failed?.
(artdag-test
  "fail constructor is detected"
  (let ((marker (artdag/fail "x"))) (artdag/failed? marker))
  true)

; A non-dict value is not a failure.
(artdag-test
  "plain values are not failures"
  (let ((plain 42)) (artdag/failed? plain))
  false)
|
||||||
|
|
||||||
|
; ---- failure confinement ----
|
||||||
|
|
||||||
|
; With ft-BAD, c fails and d (which consumes c) is marked failed with it: 2 ids.
(artdag-test
  "failure count covers node and its dependents"
  (let
    ((cache (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD cache)))
      (artdag/failure-count exec)))
  2)

; The failed ids are exactly those of c and d, compared in sorted order.
(artdag-test
  "failed set is exactly c and d"
  (let
    ((cache (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD cache)))
      (artdag/failed-nodes exec)))
  (let
    ((c-id (artdag/dag-id ft-D "c")))
    (let
      ((d-id (artdag/dag-id ft-D "d")))
      (artdag/sort-strings (list c-id d-id)))))

; p, q and b do not depend on the fault and are computed on a cold cache.
(artdag-test
  "independent branch still computes"
  (let
    ((cache (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD cache)))
      (artdag/recompute-count exec)))
  3)

; b = inc(p) = 10 + 1 is present in the results despite the failure elsewhere.
(artdag-test
  "independent node result is available"
  (let
    ((cache (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD cache)))
      (artdag/result-of exec (artdag/dag-id ft-D "b"))))
  11)

(artdag-test
  "all-ok? is false when something failed"
  (let
    ((cache (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-BAD cache)))
      (artdag/all-ok? exec)))
  false)

(artdag-test
  "all-ok? is true on a clean run"
  (let
    ((cache (persist/open)))
    (let
      ((exec (artdag/run-safe ft-D ft-GOOD cache)))
      (artdag/all-ok? exec)))
  true)
|
||||||
|
|
||||||
|
; ---- cache integrity ----
|
||||||
|
|
||||||
|
; After a faulty run the successfully computed node b is in the cache.
(artdag-test
  "good node is cached"
  (let
    ((cache (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD cache)
      (let ((b-id (artdag/dag-id ft-D "b"))) (persist/kv-has? cache b-id))))
  true)

; The failing node c never gets a cache entry.
(artdag-test
  "failed node is never cached"
  (let
    ((cache (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD cache)
      (let ((c-id (artdag/dag-id ft-D "c"))) (persist/kv-has? cache c-id))))
  false)
|
||||||
|
|
||||||
|
; ---- retry after fix ----
|
||||||
|
|
||||||
|
; A faulty run followed by a fixed run on the same cache: only c and d, which
; were never cached, are computed the second time.
(artdag-test
  "retry recomputes only the failed closure"
  (let
    ((cache (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD cache)
      (let
        ((retry (artdag/run-safe ft-D ft-GOOD cache)))
        (artdag/recompute-count retry))))
  2)

; p, q and b were cached by the faulty run and are served from the cache.
(artdag-test
  "retry reuses the good nodes from cache"
  (let
    ((cache (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD cache)
      (let
        ((retry (artdag/run-safe ft-D ft-GOOD cache)))
        (artdag/hit-count retry))))
  3)

; d = add(b, c) = 11 + 99 once boom returns 99.
(artdag-test
  "retry produces the correct result"
  (let
    ((cache (persist/open)))
    (begin
      (artdag/run-safe ft-D ft-BAD cache)
      (let
        ((retry (artdag/run-safe ft-D ft-GOOD cache)))
        (artdag/result-of retry (artdag/dag-id ft-D "d")))))
  110)
|
||||||
|
|
||||||
|
; ---- transitive cascade ----
|
||||||
|
|
||||||
|
; Linear chain a -> b(boom) -> c(inc) -> d(inc): b fails, and c and d are marked
; failed transitively, so three ids end up in :failed.
(artdag-test
  "failure cascades through a deep chain"
  (let
    ((cache (persist/open)))
    (let
      ((chain
         (artdag/build
           (list
             (list "a" "in" (list) {:v 1})
             (list "b" "boom" (list "a") {})
             (list "c" "inc" (list "b") {})
             (list "d" "inc" (list "c") {})))))
      (artdag/failure-count (artdag/run-safe chain ft-BAD cache))))
  3)
|
||||||
@@ -30,7 +30,7 @@ edges.
|
|||||||
|
|
||||||
## Status (rolling)
|
## Status (rolling)
|
||||||
|
|
||||||
`bash lib/artdag/conformance.sh` → **144/144** (9 suites: dag, analyze, plan, execute, optimize, fed, cost, serialize, stats)
|
`bash lib/artdag/conformance.sh` → **158/158** (10 suites: dag, analyze, plan, execute, optimize, fed, cost, serialize, stats, fault)
|
||||||
|
|
||||||
Base roadmap (Phases 1–6) COMPLETE. Now extending.
|
Base roadmap (Phases 1–6) COMPLETE. Now extending.
|
||||||
|
|
||||||
@@ -138,6 +138,13 @@ lib/artdag/optimize.sx lib/artdag/federation.sx
|
|||||||
|
|
||||||
## Progress log
|
## Progress log
|
||||||
|
|
||||||
|
- **Ext: fault-tolerant execution** (fault suite 14/14, total 158/158).
|
||||||
|
`lib/artdag/fault.sx`: a node op may fail via `(artdag/fail reason)`; `run-safe`
|
||||||
|
confines the failure to that node + its transitive dependents (independent branches
|
||||||
|
still compute) and NEVER caches a failed result, so a later run with the fault fixed
|
||||||
|
recomputes only the failed closure and cache-hits the good nodes. `failed?`/`fail`
|
||||||
|
markers, `failed-nodes`/`failure-count`/`all-ok?`.
|
||||||
|
|
||||||
- **Ext: execution stats / cache analytics** (stats suite 12/12, total 144/144).
|
- **Ext: execution stats / cache analytics** (stats suite 12/12, total 144/144).
|
||||||
`lib/artdag/stats.sx` over an exec record: `hit-ratio`, `work-recomputed`/`work-saved`
|
`lib/artdag/stats.sx` over an exec record: `hit-ratio`, `work-recomputed`/`work-saved`
|
||||||
(cost-weighted via the cost model), `savings-ratio`, and `exec-summary`. Cold run =
|
(cost-weighted via the cost model), `savings-ratio`, and `exec-summary`. Cold run =
|
||||||
|
|||||||
Reference in New Issue
Block a user