artdag: fault-tolerant execution — confined failure, cache never poisoned + 14 tests
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m4s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m4s
fault.sx run-safe: a node op may return (artdag/fail reason); failure is confined to that node + downstream dependents while independent branches compute, and failed results are never cached, so retry after a fix recomputes only the failed closure and hits the good nodes. fault 14/14, total 158/158. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
56
lib/artdag/fault.sx
Normal file
56
lib/artdag/fault.sx
Normal file
@@ -0,0 +1,56 @@
|
||||
; lib/artdag/fault.sx — fault-tolerant execution. A node op may fail by returning
; (artdag/fail reason); the failure is confined to that node and its transitive
; dependents (which cannot run without it), while independent branches still
; compute. Failed results are NEVER cached, so a later run with the fault fixed
; recomputes only the failed closure. Depends on execute.sx and plan.sx.
|
||||
|
||||
; Build the failure marker a node op returns to signal that it failed:
; a dict carrying :artdag-fail true and the caller-supplied reason under :reason.
(define
  artdag/fail
  (fn
    (reason)
    {:artdag-fail true :reason reason}))
|
||||
; Predicate: is v a failure marker? Truthy only when v is a dict that has the
; :artdag-fail key. Only the key's presence is tested; its value is not inspected.
(define
  artdag/failed?
  (fn
    (v)
    (and
      (dict? v)
      (has-key? v :artdag-fail))))
|
||||
|
||||
; Process one node `id` of `dag` and fold the outcome into the accumulator `acc`
; (a dict with :results, :hits, :recomputed and :failed). Returns the updated acc.
;   1. If any input of the node is already listed in :failed, the node is
;      appended to :failed and is not run.
;   2. Otherwise, if `cache` already holds `id`, the cached value is recorded in
;      :results and `id` is appended to :hits.
;   3. Otherwise `runner` is called with the node's op, params and input results.
;      A result recognised by artdag/failed? appends `id` to :failed and is NOT
;      written to the cache; any other result is stored in the cache, recorded in
;      :results, and `id` is appended to :recomputed.
(define
  artdag/-exec-safe-node
  (fn
    (dag runner cache acc id)
    (let
      ((node (artdag/dag-get dag id))
       (failed (get acc :failed))
       (results (get acc :results)))
      (let
        ((deps (artdag/node-inputs node)))
        (if
          ; An upstream failure propagates: skip this node and mark it failed.
          (some (fn (dep) (artdag/member? dep failed)) deps)
          (assoc acc :failed (concat failed (list id)))
          (if
            (persist/kv-has? cache id)
            ; Cache hit: reuse the stored value without calling the runner.
            (assoc
              acc
              :results (assoc results id (persist/kv-get cache id))
              :hits (concat (get acc :hits) (list id)))
            ; Cache miss: resolve the input values, then run the op.
            (let
              ((args (map (fn (dep) (artdag/-input-result results cache dep)) deps)))
              (let
                ((out (runner (artdag/node-op node) (artdag/node-params node) args)))
                (if
                  (artdag/failed? out)
                  ; Failure: record it, and deliberately leave the cache untouched.
                  (assoc acc :failed (concat failed (list id)))
                  (begin
                    ; Success: persist first, then record the result in acc.
                    (persist/kv-put cache id out)
                    (assoc
                      acc
                      :results (assoc results id out)
                      :recomputed (concat (get acc :recomputed) (list id)))))))))))))
|
||||
|
||||
; Run every node of `dag` in the order given by flattening (artdag/plan dag 0),
; threading one accumulator through artdag/-exec-safe-node for each node id.
; Returns the final accumulator: a dict with :recomputed, :results, :hits and
; :failed, all starting out empty.
; NOTE(review): the meaning of the literal 0 passed to artdag/plan is defined in
; plan.sx and is not visible here — confirm before changing it.
(define
  artdag/run-safe
  (fn
    (dag runner cache)
    (let
      ((order (artdag/plan-flatten (artdag/plan dag 0)))
       (start {:recomputed (list) :results {} :hits (list) :failed (list)}))
      (reduce
        (fn (acc id) (artdag/-exec-safe-node dag runner cache acc id))
        start
        order))))
|
||||
|
||||
; Return the :failed entry of an execution record `exec`, passed through
; artdag/sort-strings.
(define
  artdag/failed-nodes
  (fn
    (exec)
    (let
      ((failed (get exec :failed)))
      (artdag/sort-strings failed))))
|
||||
; Number of entries in the :failed list of an execution record `exec`.
(define
  artdag/failure-count
  (fn
    (exec)
    (let
      ((failed (get exec :failed)))
      (len failed))))
|
||||
; True when the execution record `exec` has an empty :failed list.
(define
  artdag/all-ok?
  (fn
    (exec)
    (let
      ((failed (get exec :failed)))
      (= 0 (len failed)))))
|
||||
Reference in New Issue
Block a user