From a979297959117da42c6b2499cbf58dcddd82615b Mon Sep 17 00:00:00 2001 From: giles Date: Sat, 6 Jun 2026 16:50:36 +0000 Subject: [PATCH] =?UTF-8?q?feed:=20TF-IDF=20content=20ranking=20over=20:ta?= =?UTF-8?q?gs=20=E2=80=94=20tag-df/idf,=20tfidf-score,=20by-relevance=20+?= =?UTF-8?q?=2015=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- lib/feed/conformance.sh | 3 +- lib/feed/content.sx | 68 +++++++++++++++++++++++++++++++ lib/feed/scoreboard.json | 7 ++-- lib/feed/scoreboard.md | 3 +- lib/feed/tests/content.sx | 85 +++++++++++++++++++++++++++++++++++++++ plans/feed-on-sx.md | 6 ++- 6 files changed, 165 insertions(+), 7 deletions(-) create mode 100644 lib/feed/content.sx create mode 100644 lib/feed/tests/content.sx diff --git a/lib/feed/conformance.sh b/lib/feed/conformance.sh index d4a37cd5..95676dd2 100755 --- a/lib/feed/conformance.sh +++ b/lib/feed/conformance.sh @@ -13,7 +13,7 @@ if [ ! -x "$SX_SERVER" ]; then exit 1 fi -SUITES=(basic fanout rank integration) +SUITES=(basic fanout rank integration content) OUT_JSON="lib/feed/scoreboard.json" OUT_MD="lib/feed/scoreboard.md" @@ -37,6 +37,7 @@ run_suite() { (load "lib/feed/rank.sx") (load "lib/feed/acl.sx") (load "lib/feed/fed.sx") +(load "lib/feed/content.sx") (epoch 2) (eval "(define feed-test-pass 0)") (eval "(define feed-test-fail 0)") diff --git a/lib/feed/content.sx b/lib/feed/content.sx new file mode 100644 index 00000000..f15e8434 --- /dev/null +++ b/lib/feed/content.sx @@ -0,0 +1,68 @@ +; feed/content — TF-IDF relevance over activity :tags. Rare tags carry more +; signal, so an activity matching an uncommon tag ranks above one matching a +; common tag. Composes with rank.sx: feed/tfidf-score is just another scorer. +; +; Requires: lib/feed/normalize.sx, lib/feed/stream.sx, lib/feed/fanout.sx +; (feed/-distinct), lib/feed/rank.sx (feed/rank). 
+ +; document frequency: tag -> number of activities in (feed/items stream) whose :tags contain it +; (tags are de-duplicated per activity via feed/-distinct, so a repeated tag counts once toward df) +(define + feed/tag-df + (fn + (stream) + (reduce + (fn + (df a) + (reduce + (fn (d t) (assoc d t (+ (get d t 0) 1))) + df + (feed/-distinct (get a :tags)))) + {} + (feed/items stream)))) + +; inverse document frequency: tag -> log(N / df), with N = (feed/count stream) and df from feed/tag-df; one entry per key of df +(define + feed/tag-idf + (fn + (stream) + (let + ((n (feed/count stream)) (df (feed/tag-df stream))) + (reduce + (fn (idf t) (assoc idf t (log (/ n (get df t))))) + {} + (keys df))))) + +; term frequency within one activity: tag -> occurrence count in its :tags (repeats count here, unlike df) +(define + feed/-tf + (fn + (a) + (reduce + (fn (tf t) (assoc tf t (+ (get tf t 0) 1))) + {} + (get a :tags)))) + +; relevance of an activity to a query (list of tags) given precomputed idf; returns a scorer (fn (a) ...): +; sum over query tags of tf(tag in activity) * idf(tag in corpus); a tag absent from tf or idf contributes 0 +(define + feed/tfidf-score + (fn + (idf query) + (fn + (a) + (let + ((tf (feed/-tf a))) + (reduce + (fn + (acc t) + (+ acc (* (get tf t 0) (get idf t 0)))) + 0 + query))))) + +; rank a stream by relevance to query tags via feed/rank (idf computed over the stream itself) +(define + feed/by-relevance + (fn + (stream query) + (feed/rank stream (feed/tfidf-score (feed/tag-idf stream) query)))) diff --git a/lib/feed/scoreboard.json b/lib/feed/scoreboard.json index ac5682db..b8d206dc 100644 --- a/lib/feed/scoreboard.json +++ b/lib/feed/scoreboard.json @@ -3,9 +3,10 @@ "basic": {"pass": 30, "fail": 0}, "fanout": {"pass": 29, "fail": 0}, "rank": {"pass": 24, "fail": 0}, - "integration": {"pass": 22, "fail": 0} + "integration": {"pass": 22, "fail": 0}, + "content": {"pass": 15, "fail": 0} }, - "total_pass": 105, + "total_pass": 120, "total_fail": 0, - "total": 105 + "total": 120 } diff --git a/lib/feed/scoreboard.md b/lib/feed/scoreboard.md index dba890ec..beafe516 100644 --- a/lib/feed/scoreboard.md +++ b/lib/feed/scoreboard.md @@ -8,4 +8,5 @@ _Generated by `lib/feed/conformance.sh`_ | fanout | 29 | 0 | 29 | | rank | 
24 | 0 | 24 | | integration | 22 | 0 | 22 | -| **Total** | **105** | **0** | **105** | +| content | 15 | 0 | 15 | +| **Total** | **120** | **0** | **120** | diff --git a/lib/feed/tests/content.sx b/lib/feed/tests/content.sx new file mode 100644 index 00000000..dd742adf --- /dev/null +++ b/lib/feed/tests/content.sx @@ -0,0 +1,85 @@ +; Follow-up — TF-IDF content ranking over :tags. Each case is (feed-test name got expected); fixture is 4 activities o1..o4 with :at 10..40. + +(define + corpus + (feed/stream + (list + (feed/normalize {:actor "u" :object "o1" :at 10 :tags (list "cats" "funny")}) + (feed/normalize {:actor "u" :object "o2" :at 20 :tags (list "cats" "news")}) + (feed/normalize {:actor "u" :object "o3" :at 30 :tags (list "politics" "news")}) + (feed/normalize {:actor "u" :object "o4" :at 40 :tags (list "cats")})))) + +; ---------- document frequency (cats in 3 activities, news in 2, funny and politics in 1) ---------- + +(feed-test "df cats" (get (feed/tag-df corpus) "cats") 3) +(feed-test "df news" (get (feed/tag-df corpus) "news") 2) +(feed-test "df funny" (get (feed/tag-df corpus) "funny") 1) +(feed-test "df politics" (get (feed/tag-df corpus) "politics") 1) +(feed-test "df full" (feed/tag-df corpus) {:news 2 :funny 1 :politics 1 :cats 3}) + +; ---------- inverse document frequency (log(N / df) with N = 4) ---------- + +(feed-test + "idf news = log(4/2)" + (get (feed/tag-idf corpus) "news") + (log 2)) +(feed-test + "idf funny = log(4/1)" + (get (feed/tag-idf corpus) "funny") + (log 4)) +(feed-test + "rarer tag has higher idf" + (> + (get (feed/tag-idf corpus) "funny") + (get (feed/tag-idf corpus) "cats")) + true) + +; ---------- tf-idf scoring (inline activities with :object "x", scored against the corpus idf) ---------- + +(define idf (feed/tag-idf corpus)) + +(feed-test + "score query funny on o1" + ((feed/tfidf-score idf (list "funny")) (feed/normalize {:actor "u" :object "x" :tags (list "cats" "funny")})) + (log 4)) +(feed-test + "score query funny on non-match" + ((feed/tfidf-score idf (list "funny")) (feed/normalize {:actor "u" :object "x" :tags (list "cats")})) + 0) +(feed-test + "unknown query tag scores 0" + ((feed/tfidf-score idf (list "zzz")) 
(feed/normalize {:actor "u" :object "x" :tags (list "cats")})) + 0) + +; ---------- ranking by relevance (idf recomputed from corpus by feed/by-relevance) ---------- + +; query news: o2,o3 match (score log2), o1,o4 don't (0); ties break by :at desc, hence o3 before o2 and o4 before o1 +(feed-test + "by-relevance news order" + (map + (fn (a) (get a :object)) + (feed/items (feed/by-relevance corpus (list "news")))) + (list "o3" "o2" "o4" "o1")) + +; query funny: only o1 matches (score log 4, all others 0) -> ranks first +(feed-test + "by-relevance funny first" + (get + (nth (feed/items (feed/by-relevance corpus (list "funny"))) 0) + :object) + "o1") + +; query (cats news): o2 carries both tags -> highest combined tf-idf (log(4/3) + log(4/2)) +(feed-test + "by-relevance cats+news top" + (get + (nth + (feed/items (feed/by-relevance corpus (list "cats" "news"))) + 0) + :object) + "o2") + +(feed-test + "by-relevance preserves count" + (feed/count (feed/by-relevance corpus (list "cats"))) + 4) diff --git a/plans/feed-on-sx.md b/plans/feed-on-sx.md index 860c2427..70b21e51 100644 --- a/plans/feed-on-sx.md +++ b/plans/feed-on-sx.md @@ -14,7 +14,7 @@ APL, ACL visibility filtering via `lib/acl/`, federation via fed-sx. ## Status (rolling) -`bash lib/feed/conformance.sh` → **105/105** (Phases 1–4 complete) +`bash lib/feed/conformance.sh` → **120/120** (Phases 1–4 + TF-IDF complete) ## Ground rules @@ -138,7 +138,9 @@ are function parameters. Real acl-sx / fed-sx wire in at the call site unchanged - Wire real acl-sx once `lib/acl/` exists (swap injected `permit?`). - Wire real fed-sx transport (swap `send-fn`/`fetch-fn`). -- TF-IDF over `:tags` for content ranking (sketch mentions it; not yet built). +- [x] TF-IDF over `:tags` for content ranking — `content.sx`: `feed/tag-df`, + `feed/tag-idf` (log N/df), `feed/tfidf-score`, `feed/by-relevance`; 15 tests. + Composes as a scorer with rank.sx. (120/120 total.) - Notification feed (verb-filtered, per-recipient) as a thin layer over fanout. (none)