Files
rose-ash/lib/feed/tests/content.sx
giles a979297959
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 43s
feed: TF-IDF content ranking over :tags — tag-df/idf, tfidf-score, by-relevance + 15 tests
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 16:50:36 +00:00

86 lines
2.5 KiB
Plaintext

; Follow-up — TF-IDF content ranking over :tags. (feed-test name got expected)
(define
corpus
(feed/stream
(list
(feed/normalize {:actor "u" :object "o1" :at 10 :tags (list "cats" "funny")})
(feed/normalize {:actor "u" :object "o2" :at 20 :tags (list "cats" "news")})
(feed/normalize {:actor "u" :object "o3" :at 30 :tags (list "politics" "news")})
(feed/normalize {:actor "u" :object "o4" :at 40 :tags (list "cats")}))))
; ---------- document frequency ----------
(feed-test "df cats" (get (feed/tag-df corpus) "cats") 3)
(feed-test "df news" (get (feed/tag-df corpus) "news") 2)
(feed-test "df funny" (get (feed/tag-df corpus) "funny") 1)
(feed-test "df politics" (get (feed/tag-df corpus) "politics") 1)
(feed-test "df full" (feed/tag-df corpus) {:news 2 :funny 1 :politics 1 :cats 3})
; ---------- inverse document frequency ----------
(feed-test
"idf news = log(4/2)"
(get (feed/tag-idf corpus) "news")
(log 2))
(feed-test
"idf funny = log(4/1)"
(get (feed/tag-idf corpus) "funny")
(log 4))
(feed-test
"rarer tag has higher idf"
(>
(get (feed/tag-idf corpus) "funny")
(get (feed/tag-idf corpus) "cats"))
true)
; ---------- tf-idf scoring ----------
(define idf (feed/tag-idf corpus))
(feed-test
"score query funny on o1"
((feed/tfidf-score idf (list "funny")) (feed/normalize {:actor "u" :object "x" :tags (list "cats" "funny")}))
(log 4))
(feed-test
"score query funny on non-match"
((feed/tfidf-score idf (list "funny")) (feed/normalize {:actor "u" :object "x" :tags (list "cats")}))
0)
(feed-test
"unknown query tag scores 0"
((feed/tfidf-score idf (list "zzz")) (feed/normalize {:actor "u" :object "x" :tags (list "cats")}))
0)
; ---------- ranking by relevance ----------
; query news: o2,o3 match (score log2), o1,o4 don't (0); ties break by :at desc
(feed-test
"by-relevance news order"
(map
(fn (a) (get a :object))
(feed/items (feed/by-relevance corpus (list "news"))))
(list "o3" "o2" "o4" "o1"))
; query funny: only o1 matches -> ranks first
(feed-test
"by-relevance funny first"
(get
(nth (feed/items (feed/by-relevance corpus (list "funny"))) 0)
:object)
"o1")
; query (cats news): o2 carries both tags -> highest combined tf-idf
(feed-test
"by-relevance cats+news top"
(get
(nth
(feed/items (feed/by-relevance corpus (list "cats" "news")))
0)
:object)
"o2")
(feed-test
"by-relevance preserves count"
(feed/count (feed/by-relevance corpus (list "cats")))
4)