diff --git a/lib/datalog/api.sx b/lib/datalog/api.sx index 01abec95..a76d3624 100644 --- a/lib/datalog/api.sx +++ b/lib/datalog/api.sx @@ -112,7 +112,8 @@ (let ((existing (get (get db :facts) rel-key)) (kept (list)) - (kept-keys {})) + (kept-keys {}) + (kept-index {})) (do (for-each (fn @@ -121,10 +122,19 @@ (not (dl-tuple-equal? t lit)) (do (append! kept t) - (dict-set! kept-keys (dl-tuple-key t) true)))) + (dict-set! kept-keys (dl-tuple-key t) true) + (when + (>= (len t) 2) + (let ((k (dl-arg-key (nth t 1)))) + (do + (when + (not (has-key? kept-index k)) + (dict-set! kept-index k (list))) + (append! (get kept-index k) t))))))) existing) (dict-set! (get db :facts) rel-key kept) - (dict-set! (get db :facts-keys) rel-key kept-keys)))) + (dict-set! (get db :facts-keys) rel-key kept-keys) + (dict-set! (get db :facts-index) rel-key kept-index)))) ;; Wipe all relations that have a rule (these are IDB) so the ;; saturator regenerates them from the surviving EDB. (let ((rule-heads (dl-rule-head-rels db))) @@ -133,7 +143,8 @@ (k) (do (dict-set! (get db :facts) k (list)) - (dict-set! (get db :facts-keys) k {}))) + (dict-set! (get db :facts-keys) k {}) + (dict-set! (get db :facts-index) k {}))) rule-heads)) (dl-saturate! db) db)))) diff --git a/lib/datalog/db.sx b/lib/datalog/db.sx index b5aea9b8..09f92315 100644 --- a/lib/datalog/db.sx +++ b/lib/datalog/db.sx @@ -12,7 +12,7 @@ ;; lib/datalog/builtins.sx) swaps in the real `dl-rule-check-safety`, ;; which is order-aware and understands built-in predicates. -(define dl-make-db (fn () {:facts {} :facts-keys {} :rules (list)})) +(define dl-make-db (fn () {:facts {} :facts-keys {} :facts-index {} :rules (list)})) (define dl-rel-name @@ -98,7 +98,8 @@ (db rel-key) (let ((facts (get db :facts)) - (fk (get db :facts-keys))) + (fk (get db :facts-keys)) + (fi (get db :facts-index))) (do (when (not (has-key? facts rel-key)) @@ -106,8 +107,51 @@ (when (not (has-key? fk rel-key)) (dict-set! fk rel-key {})) + (when + (not (has-key? 
fi rel-key)) + (dict-set! fi rel-key {})) (get facts rel-key))))) +;; First-arg index helpers. Tuples are keyed by their first-after-rel +;; arg's `(str ...)`; when that arg is a constant, dl-match-positive +;; uses the index instead of scanning the full relation. +(define + dl-arg-key + (fn + (v) + (str v))) + +(define + dl-index-add! + (fn + (db rel-key lit) + (let + ((idx (get db :facts-index)) + (n (len lit))) + (when + (and (>= n 2) (has-key? idx rel-key)) + (let + ((rel-idx (get idx rel-key)) + (k (dl-arg-key (nth lit 1)))) + (do + (when + (not (has-key? rel-idx k)) + (dict-set! rel-idx k (list))) + (append! (get rel-idx k) lit))))))) + +(define + dl-index-lookup + (fn + (db rel-key arg-val) + (let + ((idx (get db :facts-index))) + (cond + ((not (has-key? idx rel-key)) (list)) + (else + (let ((rel-idx (get idx rel-key)) + (k (dl-arg-key arg-val))) + (if (has-key? rel-idx k) (get rel-idx k) (list)))))))) + (define dl-tuple-key (fn (lit) (str lit))) (define @@ -140,6 +184,7 @@ (do (dict-set! key-dict tk true) (append! tuples lit) + (dl-index-add! db rel-key lit) true))))))))) ;; The full safety check lives in builtins.sx (it has to know which diff --git a/lib/datalog/eval.sx b/lib/datalog/eval.sx index 476fd0d0..b8ccd83f 100644 --- a/lib/datalog/eval.sx +++ b/lib/datalog/eval.sx @@ -24,7 +24,17 @@ ((nil? rel) (error (str "dl-match-positive: bad literal " lit))) (else (let - ((tuples (dl-rel-tuples db rel))) + ;; If the first argument walks to a non-variable (constant + ;; or already-bound var), use the first-arg index for + ;; this relation. Otherwise scan the full tuple list. + ((tuples + (cond + ((>= (len lit) 2) + (let ((walked (dl-walk (nth lit 1) subst))) + (cond + ((dl-var? 
walked) (dl-rel-tuples db rel)) + (else (dl-index-lookup db rel walked))))) + (else (dl-rel-tuples db rel))))) (do (for-each (fn diff --git a/lib/datalog/scoreboard.json b/lib/datalog/scoreboard.json index 2545002b..3bc3e1e5 100644 --- a/lib/datalog/scoreboard.json +++ b/lib/datalog/scoreboard.json @@ -15,5 +15,5 @@ {"name":"api","passed":11,"failed":0,"total":11}, {"name":"demo","passed":15,"failed":0,"total":15} ], - "generated": "2026-05-08T09:20:09+00:00" + "generated": "2026-05-08T09:27:29+00:00" } diff --git a/lib/datalog/tests/semi_naive.sx b/lib/datalog/tests/semi_naive.sx index 914dcb2b..6df2a9e5 100644 --- a/lib/datalog/tests/semi_naive.sx +++ b/lib/datalog/tests/semi_naive.sx @@ -120,18 +120,20 @@ (dl-sn-counts-agree? (dl-sn-counts "p(a). p(b). q(X) :- p(X), =(X, a).")) true) - ;; Chain length 10 — exercises multiple semi-naive iterations - ;; against the recursive ancestor rule. + ;; Chain length 12 — multiple semi-naive iterations against + ;; the recursive ancestor rule (differential vs naive). (dl-sn-test! - "chain-10 ancestor counts match" - (dl-sn-counts-agree? (dl-sn-counts (dl-sn-chain-source 10))) + "chain-12 ancestor counts match" + (dl-sn-counts-agree? (dl-sn-counts (dl-sn-chain-source 12))) true) + ;; Chain length 25 — semi-naive only — first-arg index makes + ;; this tractable in conformance budget. (dl-sn-test! - "chain-15 ancestor count value (semi only)" + "chain-25 ancestor count value (semi only)" (let - ((db (dl-program (dl-sn-chain-source 15)))) + ((db (dl-program (dl-sn-chain-source 25)))) (do (dl-saturate! db) (len (dl-relation db "ancestor")))) - 120) + 325) (dl-sn-test! "query through semi saturate" (let diff --git a/plans/datalog-on-sx.md b/plans/datalog-on-sx.md index f0233eec..c92f6e78 100644 --- a/plans/datalog-on-sx.md +++ b/plans/datalog-on-sx.md @@ -264,19 +264,33 @@ large graphs. ## Blockers -- **Saturation perf** improving but not free. 
Resolved hash-set - membership in `dl-add-fact!` and replaced recursive `(rest lits)` in - `dl-find-bindings` with indexed iteration. chain-15 drops from ~25s - to ~16s and chain-25 saturates in ~33s real / 11s user — most CPU - now in unification (assoc-based subst dict copies) and dict - lookups during walks. Future: a per-rule "compiled" body that - pre-resolves arg positions and intern variable indices, then - unification can use array slots instead of dict assoc. +- **Saturation perf**: three rounds done. + - hash-set membership in `dl-add-fact!` (Phase 5b) + - indexed iteration in `dl-find-bindings` (Phase 5c) + - first-arg index per relation (Phase 5e) — when a body literal's + first arg walks to a non-variable, dl-match-positive looks up + by `(str arg)` instead of scanning the full relation. + chain-25 saturation drops from ~33s to ~18s real (10s user). + chain-50 still long (~120s+) due to dict-copy overhead in + unification subst threading. Future: per-rule "compiled" body + with pre-resolved var positions, slot-based subst representation + to avoid `assoc` per binding. ## Progress log _Newest first._ +- 2026-05-08 — Phase 5e perf: first-arg index per relation. db gains + `:facts-index {rel-key: {first-arg-key: tuples}}` mirroring the + existing `:facts-keys` membership index. `dl-add-fact!` populates + it; `dl-match-positive` walks the body literal's first arg under + the current subst — if it's bound to a non-var, look up by + `(str arg)` and iterate only the matching subset. chain-25 + saturation 33s → 18s real (~2x). chain-50 still slow (~120s+) + but tractable; next bottleneck is subst dict copies during + unification. Differential test bumped to chain-12, semi-only + count to chain-25. + - 2026-05-08 — Demo: tag co-occurrence. `(cotagged P T1 T2)` — post has both T1 and T2 with T1 != T2 — and `(tag-pair-count T1 T2 N)` counting posts per distinct tag pair. Demonstrates count