From 6d04cf7bf24c4822c0747043359d102d606d3656 Mon Sep 17 00:00:00 2001 From: giles Date: Fri, 8 May 2026 08:28:45 +0000 Subject: [PATCH] datalog: aggregation count/sum/min/max (Phase 8, 134/134) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New lib/datalog/aggregates.sx: (count R V Goal), (sum R V Goal), (min R V Goal), (max R V Goal). dl-eval-aggregate runs dl-find-bindings on the goal under the outer subst, collects distinct values of V, applies the operator, binds R. Empty input: count/sum return 0; min/max produce no binding (rule fails). Group-by emerges naturally from outer-subst substitution into the goal — `popular(P) :- post(P), count(N, U, liked(U, P)), >=(N, 3).` counts per-post. Stratifier extended: dl-aggregate-dep-edge contributes a negation-like edge so the aggregate's goal relation is fully derived before the aggregate fires (non-monotonicity respected). Safety relaxed for aggregates: goal-internal vars are existentials, only the result var becomes bound. --- lib/datalog/aggregates.sx | 139 ++++++++++++++++++++ lib/datalog/builtins.sx | 14 ++ lib/datalog/conformance.conf | 2 + lib/datalog/eval.sx | 1 + lib/datalog/scoreboard.json | 9 +- lib/datalog/scoreboard.md | 3 +- lib/datalog/strata.sx | 43 +++--- lib/datalog/tests/aggregates.sx | 223 ++++++++++++++++++++++++++++++++ plans/datalog-on-sx.md | 40 +++++- 9 files changed, 447 insertions(+), 27 deletions(-) create mode 100644 lib/datalog/aggregates.sx create mode 100644 lib/datalog/tests/aggregates.sx diff --git a/lib/datalog/aggregates.sx b/lib/datalog/aggregates.sx new file mode 100644 index 00000000..153879f7 --- /dev/null +++ b/lib/datalog/aggregates.sx @@ -0,0 +1,139 @@ +;; lib/datalog/aggregates.sx — count / sum / min / max aggregation. +;; +;; Surface form (always 3-arg in the parsed AST): +;; +;; (count Result Var GoalLit) +;; (sum Result Var GoalLit) +;; (min Result Var GoalLit) +;; (max Result Var GoalLit) +;; +;; Parsed naturally because arg-position compounds are already allowed +;; (Phase 4 needs them for arithmetic). At evaluation time the aggregator +;; runs `dl-find-bindings` on `GoalLit` under the current subst, collects +;; the distinct values of `Var`, and binds `Result`. +;; +;; Aggregation is non-monotonic — `count(C, X, p(X))` shrinks as p loses +;; tuples. The stratifier (lib/datalog/strata.sx) treats every aggregate's +;; goal relation as a negation-like edge so the inner relation is fully +;; derived before the aggregate fires. +;; +;; Empty input: count → 0, sum → 0, min/max → no binding (rule fails). + +(define dl-aggregate-rels (list "count" "sum" "min" "max")) + +(define + dl-aggregate? + (fn + (lit) + (and + (list? lit) + (>= (len lit) 4) + (let ((rel (dl-rel-name lit))) + (cond + ((nil? rel) false) + (else (dl-member-string? rel dl-aggregate-rels))))))) + +;; Apply aggregation operator to a list of (already-distinct) numeric or +;; symbolic values. Returns the aggregated value, or :empty if min/max +;; has no input. +(define + dl-do-aggregate + (fn + (op vals) + (cond + ((= op "count") (len vals)) + ((= op "sum") (dl-sum-vals vals 0)) + ((= op "min") + (cond + ((= (len vals) 0) :empty) + (else (dl-min-vals vals 1 (first vals))))) + ((= op "max") + (cond + ((= (len vals) 0) :empty) + (else (dl-max-vals vals 1 (first vals))))) + (else (error (str "datalog: unknown aggregate " op)))))) + +(define + dl-sum-vals + (fn + (vals acc) + (cond + ((= (len vals) 0) acc) + (else (dl-sum-vals (rest vals) (+ acc (first vals))))))) + +(define + dl-min-vals + (fn + (vals i cur) + (cond + ((>= i (len vals)) cur) + (else + (let ((v (nth vals i))) + (dl-min-vals vals (+ i 1) (if (< v cur) v cur))))))) + +(define + dl-max-vals + (fn + (vals i cur) + (cond + ((>= i (len vals)) cur) + (else + (let ((v (nth vals i))) + (dl-max-vals vals (+ i 1) (if (> v cur) v cur))))))) + +;; Membership check by deep equality (so 30 == 30.0 etc). +(define + dl-val-member? + (fn + (v xs) + (cond + ((= (len xs) 0) false) + ((dl-tuple-equal? v (first xs)) true) + (else (dl-val-member? v (rest xs)))))) + +;; Evaluate an aggregate body lit under `subst`. Returns the list of +;; extended substitutions (0 or 1 element). +(define + dl-eval-aggregate + (fn + (lit db subst) + (let + ((op (dl-rel-name lit)) + (result-var (nth lit 1)) + (agg-var (nth lit 2)) + (goal (nth lit 3))) + (let + ((substs (dl-find-bindings (list goal) db subst)) + (vals (list))) + (do + (for-each + (fn + (s) + (let ((v (dl-apply-subst agg-var s))) + (when + (not (dl-val-member? v vals)) + (append! vals v)))) + substs) + (let ((agg-val (dl-do-aggregate op vals))) + (cond + ((= agg-val :empty) (list)) + (else + (let ((s2 (dl-unify result-var agg-val subst))) + (if (nil? s2) (list) (list s2))))))))))) + +;; Stratification edges from aggregates: like negation, the goal's +;; relation must be in a strictly lower stratum so that the aggregate +;; fires only after the underlying tuples are settled. +(define + dl-aggregate-dep-edge + (fn + (lit) + (cond + ((dl-aggregate? lit) + (let ((goal (nth lit 3))) + (cond + ((and (list? goal) (> (len goal) 0)) + (let ((rel (dl-rel-name goal))) + (if (nil? rel) nil {:rel rel :neg true}))) + (else nil)))) + (else nil)))) diff --git a/lib/datalog/builtins.sx b/lib/datalog/builtins.sx index dfd2307a..11261c85 100644 --- a/lib/datalog/builtins.sx +++ b/lib/datalog/builtins.sx @@ -262,6 +262,19 @@ missing " — they must be bound by an earlier " "positive body literal"))))))) + (define + dl-process-agg! + (fn + (lit) + (let + ((result-var (nth lit 1))) + ;; Aggregate goal vars are existentially quantified within + ;; the aggregate; nothing required from outer context. The + ;; result var becomes bound after the aggregate fires. + (when + (dl-var? result-var) + (dl-add-bound! (list (symbol->string result-var))))))) + (define dl-process-lit! (fn @@ -271,6 +284,7 @@ (cond ((and (dict? lit) (has-key? lit :neg)) (dl-process-neg! lit)) + ((dl-aggregate? lit) (dl-process-agg! lit)) ((dl-eq? lit) (dl-process-eq! lit)) ((dl-is? lit) (dl-process-is! lit)) ((dl-comparison? lit) (dl-process-cmp! lit)) diff --git a/lib/datalog/conformance.conf b/lib/datalog/conformance.conf index bd9b0612..15dea37e 100644 --- a/lib/datalog/conformance.conf +++ b/lib/datalog/conformance.conf @@ -9,6 +9,7 @@ PRELOADS=( lib/datalog/unify.sx lib/datalog/db.sx lib/datalog/builtins.sx + lib/datalog/aggregates.sx lib/datalog/strata.sx lib/datalog/eval.sx ) @@ -21,4 +22,5 @@ SUITES=( "builtins:lib/datalog/tests/builtins.sx:(dl-builtins-tests-run!)" "semi_naive:lib/datalog/tests/semi_naive.sx:(dl-semi-naive-tests-run!)" "negation:lib/datalog/tests/negation.sx:(dl-negation-tests-run!)" + "aggregates:lib/datalog/tests/aggregates.sx:(dl-aggregates-tests-run!)" ) diff --git a/lib/datalog/eval.sx b/lib/datalog/eval.sx index 45c0fdb9..93902349 100644 --- a/lib/datalog/eval.sx +++ b/lib/datalog/eval.sx @@ -206,6 +206,7 @@ (cond ((and (dict? lit) (has-key? lit :neg)) (dl-match-negation (get lit :neg) db subst)) + ((dl-aggregate? lit) (dl-eval-aggregate lit db subst)) ((dl-builtin? lit) (let ((s (dl-eval-builtin lit subst))) diff --git a/lib/datalog/scoreboard.json b/lib/datalog/scoreboard.json index 54be5726..54b1560e 100644 --- a/lib/datalog/scoreboard.json +++ b/lib/datalog/scoreboard.json @@ -1,8 +1,8 @@ { "lang": "datalog", - "total_passed": 124, + "total_passed": 134, "total_failed": 0, - "total": 124, + "total": 134, "suites": [ {"name":"tokenize","passed":26,"failed":0,"total":26}, {"name":"parse","passed":18,"failed":0,"total":18}, @@ -10,7 +10,8 @@ {"name":"eval","passed":15,"failed":0,"total":15}, {"name":"builtins","passed":19,"failed":0,"total":19}, {"name":"semi_naive","passed":8,"failed":0,"total":8}, - {"name":"negation","passed":10,"failed":0,"total":10} + {"name":"negation","passed":10,"failed":0,"total":10}, + {"name":"aggregates","passed":10,"failed":0,"total":10} ], - "generated": "2026-05-08T08:20:41+00:00" + "generated": "2026-05-08T08:28:29+00:00" } diff --git a/lib/datalog/scoreboard.md b/lib/datalog/scoreboard.md index 802305ab..22cd5104 100644 --- a/lib/datalog/scoreboard.md +++ b/lib/datalog/scoreboard.md @@ -1,6 +1,6 @@ # datalog scoreboard -**124 / 124 passing** (0 failure(s)). +**134 / 134 passing** (0 failure(s)). | Suite | Passed | Total | Status | |-------|--------|-------|--------| @@ -11,3 +11,4 @@ | builtins | 19 | 19 | ok | | semi_naive | 8 | 8 | ok | | negation | 10 | 10 | ok | +| aggregates | 10 | 10 | ok | diff --git a/lib/datalog/strata.sx b/lib/datalog/strata.sx index b5d08347..0928c989 100644 --- a/lib/datalog/strata.sx +++ b/lib/datalog/strata.sx @@ -33,22 +33,30 @@ (for-each (fn (lit) - (let - ((target - (cond - ((and (dict? lit) (has-key? lit :neg)) - (dl-rel-name (get lit :neg))) - ((dl-builtin? lit) nil) - ((and (list? lit) (> (len lit) 0)) - (dl-rel-name lit)) - (else nil))) - (neg? - (and (dict? lit) (has-key? lit :neg)))) - (when - (not (nil? target)) - (append! - existing - {:rel target :neg neg?})))) + (cond + ((dl-aggregate? lit) + (let + ((edge (dl-aggregate-dep-edge lit))) + (when + (not (nil? edge)) + (append! existing edge)))) + (else + (let + ((target + (cond + ((and (dict? lit) (has-key? lit :neg)) + (dl-rel-name (get lit :neg))) + ((dl-builtin? lit) nil) + ((and (list? lit) (> (len lit) 0)) + (dl-rel-name lit)) + (else nil))) + (neg? + (and (dict? lit) (has-key? lit :neg)))) + (when + (not (nil? target)) + (append! + existing + {:rel target :neg neg?})))))) (get rule :body))))))) (dl-rules db)) g)))) @@ -79,6 +87,9 @@ (let ((t (cond + ((dl-aggregate? lit) + (let ((edge (dl-aggregate-dep-edge lit))) + (if (nil? edge) nil (get edge :rel)))) ((and (dict? lit) (has-key? lit :neg)) (dl-rel-name (get lit :neg))) ((dl-builtin? lit) nil) diff --git a/lib/datalog/tests/aggregates.sx b/lib/datalog/tests/aggregates.sx new file mode 100644 index 00000000..d80276d8 --- /dev/null +++ b/lib/datalog/tests/aggregates.sx @@ -0,0 +1,223 @@ +;; lib/datalog/tests/aggregates.sx — count / sum / min / max. + +(define dl-at-pass 0) +(define dl-at-fail 0) +(define dl-at-failures (list)) + +(define + dl-at-deep=? + (fn + (a b) + (cond + ((and (list? a) (list? b)) + (and (= (len a) (len b)) (dl-at-deq-l? a b 0))) + ((and (dict? a) (dict? b)) + (let ((ka (keys a)) (kb (keys b))) + (and (= (len ka) (len kb)) (dl-at-deq-d? a b ka 0)))) + ((and (number? a) (number? b)) (= a b)) + (else (equal? a b))))) + +(define + dl-at-deq-l? + (fn + (a b i) + (cond + ((>= i (len a)) true) + ((not (dl-at-deep=? (nth a i) (nth b i))) false) + (else (dl-at-deq-l? a b (+ i 1)))))) + +(define + dl-at-deq-d? + (fn + (a b ka i) + (cond + ((>= i (len ka)) true) + ((let ((k (nth ka i))) + (not (dl-at-deep=? (get a k) (get b k)))) + false) + (else (dl-at-deq-d? a b ka (+ i 1)))))) + +(define + dl-at-set=? + (fn + (a b) + (and + (= (len a) (len b)) + (dl-at-subset? a b) + (dl-at-subset? b a)))) + +(define + dl-at-subset? + (fn + (xs ys) + (cond + ((= (len xs) 0) true) + ((not (dl-at-contains? ys (first xs))) false) + (else (dl-at-subset? (rest xs) ys))))) + +(define + dl-at-contains? + (fn + (xs target) + (cond + ((= (len xs) 0) false) + ((dl-at-deep=? (first xs) target) true) + (else (dl-at-contains? (rest xs) target))))) + +(define + dl-at-test! + (fn + (name got expected) + (if + (dl-at-deep=? got expected) + (set! dl-at-pass (+ dl-at-pass 1)) + (do + (set! dl-at-fail (+ dl-at-fail 1)) + (append! + dl-at-failures + (str + name + "\n expected: " expected + "\n got: " got)))))) + +(define + dl-at-test-set! + (fn + (name got expected) + (if + (dl-at-set=? got expected) + (set! dl-at-pass (+ dl-at-pass 1)) + (do + (set! dl-at-fail (+ dl-at-fail 1)) + (append! + dl-at-failures + (str + name + "\n expected (set): " expected + "\n got: " got)))))) + +(define + dl-at-throws? + (fn + (thunk) + (let + ((threw false)) + (do + (guard + (e (#t (set! threw true))) + (thunk)) + threw)))) + +(define + dl-at-run-all! + (fn + () + (do + ;; count + (dl-at-test-set! "count siblings" + (dl-query + (dl-program + "parent(p, bob). parent(p, alice). parent(p, charlie). + sibling(X, Y) :- parent(P, X), parent(P, Y), !=(X, Y). + sib_count(N) :- count(N, S, sibling(bob, S)).") + (list (quote sib_count) (quote N))) + (list {:N 2})) + + ;; sum + (dl-at-test-set! "sum prices" + (dl-query + (dl-program + "price(apple, 5). price(pear, 7). price(plum, 3). + total(T) :- sum(T, X, price(F, X)).") + (list (quote total) (quote T))) + (list {:T 15})) + + ;; min + (dl-at-test-set! "min score" + (dl-query + (dl-program + "score(alice, 80). score(bob, 65). score(carol, 92). + lo(M) :- min(M, S, score(P, S)).") + (list (quote lo) (quote M))) + (list {:M 65})) + + ;; max + (dl-at-test-set! "max score" + (dl-query + (dl-program + "score(alice, 80). score(bob, 65). score(carol, 92). + hi(M) :- max(M, S, score(P, S)).") + (list (quote hi) (quote M))) + (list {:M 92})) + + ;; count over derived relation (stratification needed). + (dl-at-test-set! "count over derived" + (dl-query + (dl-program + "parent(a, b). parent(a, c). parent(b, d). parent(c, e). + ancestor(X, Y) :- parent(X, Y). + ancestor(X, Z) :- parent(X, Y), ancestor(Y, Z). + num_ancestors(N) :- count(N, X, ancestor(a, X)).") + (list (quote num_ancestors) (quote N))) + (list {:N 4})) + + ;; count with no matches → 0. + (dl-at-test-set! "count empty" + (dl-query + (dl-program + "p(1). p(2). + zero(N) :- count(N, X, q(X)).") + (list (quote zero) (quote N))) + (list {:N 0})) + + ;; sum with no matches → 0. + (dl-at-test-set! "sum empty" + (dl-query + (dl-program + "p(1). p(2). + total(T) :- sum(T, X, q(X)).") + (list (quote total) (quote T))) + (list {:T 0})) + + ;; min with no matches → rule does not fire. + (dl-at-test-set! "min empty" + (dl-query + (dl-program + "p(1). p(2). + lo(M) :- min(M, X, q(X)).") + (list (quote lo) (quote M))) + (list)) + + ;; Aggregate with comparison filter on result. + (dl-at-test-set! "popularity threshold" + (dl-query + (dl-program + "post(p1). post(p2). + liked(u1, p1). liked(u2, p1). liked(u3, p1). + liked(u1, p2). liked(u2, p2). + popular(P) :- post(P), count(N, U, liked(U, P)), >=(N, 3).") + (list (quote popular) (quote P))) + (list {:P (quote p1)})) + + ;; Aggregate vs single distinct. + (dl-at-test-set! "distinct counted once" + (dl-query + (dl-program + "rated(alice, x). rated(alice, y). rated(bob, x). + rater_count(N) :- count(N, U, rated(U, F)).") + (list (quote rater_count) (quote N))) + (list {:N 2}))))) + +(define + dl-aggregates-tests-run! + (fn + () + (do + (set! dl-at-pass 0) + (set! dl-at-fail 0) + (set! dl-at-failures (list)) + (dl-at-run-all!) + {:passed dl-at-pass + :failed dl-at-fail + :total (+ dl-at-pass dl-at-fail) + :failures dl-at-failures}))) diff --git a/plans/datalog-on-sx.md b/plans/datalog-on-sx.md index cc92ddbf..291cd16d 100644 --- a/plans/datalog-on-sx.md +++ b/plans/datalog-on-sx.md @@ -185,12 +185,27 @@ large graphs. rejection, and a negation safety violation. ### Phase 8 — aggregation (Datalog+) -- [ ] `count(X, Goal)` → number of distinct X satisfying Goal -- [ ] `sum(X, Goal)` → sum of X values satisfying Goal -- [ ] `min(X, Goal)` / `max(X, Goal)` → min/max of X satisfying Goal -- [ ] `group-by` semantics: `count(X, sibling(bob, X))` → count of bob's siblings -- [ ] Aggregation breaks stratification — evaluate in a separate post-fixpoint pass -- [ ] Tests: social network statistics, grade aggregation, inventory sums +- [x] `(count R V Goal)`, `(sum R V Goal)`, `(min R V Goal)`, + `(max R V Goal)` — first arg is the result variable, second is the + aggregated variable, third is the goal literal. Live in + `lib/datalog/aggregates.sx`. +- [x] `dl-eval-aggregate`: runs `dl-find-bindings` on the goal under the + current subst (which provides outer-context bindings), collects + distinct values of the aggregated var, applies the aggregate. + `count`/`sum` produce 0 when no matches; `min`/`max` produce no + binding (rule fails) when empty. +- [x] Group-by emerges naturally: outer-context vars in the goal are + substituted from the current subst, so `popular(P) :- post(P), + count(N, U, liked(U, P)), >=(N, 3).` correctly counts per-post. +- [x] Stratification: `dl-aggregate-dep-edge` returns a negation-like + edge so the aggregate's goal relation is fully derived before the + aggregate fires. Non-monotonicity respected. +- [x] Safety: aggregate body lit binds the result var; goal-internal + vars are existentially quantified and don't need outer binding. +- [x] Tests in `lib/datalog/tests/aggregates.sx` (10): count siblings, + sum prices, min/max scores, count over derived relation, + empty-input cases for each operator, popularity threshold with + group-by, distinct-counted-once. ### Phase 9 — SX embedding API - [ ] `(dl-program facts rules)` → database from SX data directly (no parsing required) @@ -230,6 +245,19 @@ large graphs. _Newest first._ +- 2026-05-08 — Phase 8 done. New `lib/datalog/aggregates.sx` (~110 + LOC): count / sum / min / max. Each is a body literal of shape + `(op R V Goal)` — `dl-eval-aggregate` runs `dl-find-bindings` on + the goal under the outer subst (so outer vars in the goal get + substituted, giving group-by-style aggregation), collects the + distinct values of `V`, and binds `R`. Empty input: count/sum + return 0; min/max produce no binding (rule fails). Stratifier + extended via `dl-aggregate-dep-edge` so the aggregate's goal + relation is fully derived before the aggregate fires. Safety check + treats goal-internal vars as existentials (no outer binding + required); only the result var becomes bound. Conformance now + 134 / 134. + - 2026-05-08 — Phase 7 done (Phase 6 magic sets deferred — opt-in, semi-naive default suffices for current test suite). New `lib/datalog/strata.sx` (~290 LOC): dep graph build, Floyd-Warshall