lib/gitea/search.sx: the forge builds document corpora SX-side — code files from the default branch head (path + blob text), issues (title + body + comments), PRs (title + body + reviews) — embeds them as one haskell-on-sx program and asks searchRankTfIdf for ranked doc ids (terms, AND/OR/NOT, phrases). Cost model honored: one evaluation parses the Haskell layers (~20s CPU), extra queries are nearly free — so the core primitive is gitea/search-multi (any number of corpora and queries in a single evaluation; each corpus an idxN binding) and only the six layers searchRankTfIdf needs are compiled, not the full search/src. The test suite runs its thirteen SX-level queries over five corpora as ONE evaluation. Global search spans exactly the repos the caller can read. Web: /:owner/:name/search page (kind filter), repo + global JSON search. Suite timeout raised to 900s for the haskell-backed suites. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
267 lines
7.9 KiB
Plaintext
267 lines
7.9 KiB
Plaintext
; lib/gitea/tests/search.sx — Phase 7: code + issue/PR search over
|
|
; search-on-sx. All SX-level queries run as ONE haskell evaluation
|
|
; (gitea/search-multi over five corpora); the web tests add one
|
|
; evaluation per request.
|
|
|
|
(st-bootstrap-classes!)
|
|
(content/bootstrap!)
|
|
(content-bootstrap-markdown!)
|
|
(content-bootstrap-table!)
|
|
|
|
(define gitea-search-pass 0)
|
|
(define gitea-search-fail 0)
|
|
(define gitea-search-fails (list))
|
|
|
|
(define
|
|
gitea-search-test
|
|
(fn
|
|
(name actual expected)
|
|
(if
|
|
(= actual expected)
|
|
(set! gitea-search-pass (+ gitea-search-pass 1))
|
|
(begin
|
|
(set! gitea-search-fail (+ gitea-search-fail 1))
|
|
(set!
|
|
gitea-search-fails
|
|
(append gitea-search-fails (list {:name name :expected (inspect expected) :actual (inspect actual)})))))))
|
|
|
|
; ── escaping ─────────────────────────────────────────────────────────
|
|
|
|
(gitea-search-test "hk-escape quotes" (gitea/hk-escape "a\"b") "a\\\"b")
|
|
(gitea-search-test "hk-escape newline" (gitea/hk-escape "a\nb") "a\\nb")
|
|
(gitea-search-test "hk-escape backslash" (gitea/hk-escape "a\\b") "a\\\\b")
|
|
(gitea-search-test "hk-escape nil" (gitea/hk-escape nil) "")
|
|
|
|
; ── setup ────────────────────────────────────────────────────────────
|
|
|
|
(define gs-db (persist/mem-backend))
|
|
(define gs-forge (gitea/forge gs-db))
|
|
(gitea/user-create! gs-forge "alice")
|
|
(gitea/user-create! gs-forge "bob")
|
|
(gitea/token-create! gs-forge "alice" "tok-a")
|
|
(gitea/repo-create! gs-forge "alice" "proj" {})
|
|
(gitea/repo-create! gs-forge "alice" "sec" {:visibility "private"})
|
|
(gitea/repo-create! gs-forge "alice" "empty" {})
|
|
|
|
(define gs-g (gitea/repo-git gs-forge "alice" "proj"))
|
|
(git/add! gs-g "README.md" "SX forge documentation overview")
|
|
(git/add! gs-g "src/main.sx" "(define forge-main entry) banana runtime")
|
|
(git/add! gs-g "docs/guide.md" "how to search the forge")
|
|
(git/commit! gs-g {:message "c1" :time 1 :author "alice"})
|
|
(git/branch! gs-g "feat")
|
|
(git/checkout! gs-g "feat")
|
|
(git/add! gs-g "src/extra.sx" "cherry helpers")
|
|
(git/commit! gs-g {:message "c2" :time 2 :author "alice"})
|
|
(git/checkout! gs-g "main")
|
|
|
|
(define gs-gs (gitea/repo-git gs-forge "alice" "sec"))
|
|
(git/add! gs-gs "secret.txt" "banana secret stash")
|
|
(git/commit! gs-gs {:message "s1" :time 3 :author "alice"})
|
|
|
|
(gitea/issue-create!
|
|
gs-forge
|
|
"alice"
|
|
"proj"
|
|
"alice"
|
|
"Crash in search"
|
|
"the search crashes on banana input"
|
|
{:created-at 10})
|
|
(gitea/issue-comment!
|
|
gs-forge
|
|
"alice"
|
|
"proj"
|
|
1
|
|
"bob"
|
|
"reproduced with cherry too"
|
|
{:at 11})
|
|
(gitea/issue-create!
|
|
gs-forge
|
|
"alice"
|
|
"proj"
|
|
"alice"
|
|
"Docs update"
|
|
"documentation needs love"
|
|
{:created-at 12})
|
|
(gitea/pr-create!
|
|
gs-forge
|
|
"alice"
|
|
"proj"
|
|
"bob"
|
|
"Improve search ranking"
|
|
"feat"
|
|
"main"
|
|
"tfidf ranking for results"
|
|
{})
|
|
|
|
; ── corpus construction ──────────────────────────────────────────────
|
|
|
|
(define gs-corp-code (gitea/repo-docs gs-forge "alice" "proj" (list "code")))
|
|
(define
|
|
gs-corp-ip
|
|
(gitea/repo-docs gs-forge "alice" "proj" (list "issue" "pr")))
|
|
(define
|
|
gs-corp-all
|
|
(gitea/repo-docs gs-forge "alice" "proj" gitea/search-kinds-all))
|
|
(define gs-corp-anon (gitea/visible-docs gs-forge nil gitea/search-kinds-all))
|
|
(define
|
|
gs-corp-own
|
|
(gitea/visible-docs gs-forge "alice" gitea/search-kinds-all))
|
|
|
|
(gitea-search-test "code docs" (len gs-corp-code) 3)
|
|
(gitea-search-test "issue+pr docs" (len gs-corp-ip) 3)
|
|
(gitea-search-test "all docs" (len gs-corp-all) 6)
|
|
(gitea-search-test
|
|
"empty repo corpus"
|
|
(gitea/code-docs gs-forge "alice" "empty")
|
|
(list))
|
|
(gitea-search-test
|
|
"anon corpus excludes private"
|
|
(len (filter (fn (d) (= (get d :repo) "alice/sec")) gs-corp-anon))
|
|
0)
|
|
(gitea-search-test
|
|
"owner corpus includes private"
|
|
(len (filter (fn (d) (= (get d :repo) "alice/sec")) gs-corp-own))
|
|
1)
|
|
|
|
; ── ONE evaluation, thirteen queries over five corpora ───────────────
|
|
|
|
(define
|
|
gs-R
|
|
(gitea/search-multi
|
|
(list gs-corp-code gs-corp-ip gs-corp-all gs-corp-anon gs-corp-own)
|
|
(list
|
|
{:n 20 :query "banana" :corpus 0}
|
|
{:n 20 :query "forge" :corpus 0}
|
|
{:n 20 :query "banana OR cherry" :corpus 0}
|
|
{:n 20 :query "nosuchterm" :corpus 0}
|
|
{:n 20 :query "banana" :corpus 1}
|
|
{:n 20 :query "cherry" :corpus 1}
|
|
{:n 20 :query "ranking" :corpus 1}
|
|
{:n 20 :query "search AND banana" :corpus 1}
|
|
{:n 20 :query "documentation" :corpus 1}
|
|
{:n 20 :query "search" :corpus 2}
|
|
{:n 2 :query "search" :corpus 2}
|
|
{:n 20 :query "banana" :corpus 3}
|
|
{:n 20 :query "banana" :corpus 4})))
|
|
|
|
(gitea-search-test
|
|
"code: unique term"
|
|
(nth gs-R 0)
|
|
(list {:kind "code" :ref "src/main.sx"}))
|
|
(gitea-search-test
|
|
"code: common term hits all three"
|
|
(len (nth gs-R 1))
|
|
3)
|
|
(gitea-search-test "code: OR query" (nth gs-R 2) (list {:kind "code" :ref "src/main.sx"}))
|
|
(gitea-search-test "code: no hits" (nth gs-R 3) (list))
|
|
|
|
(gitea-search-test
|
|
"issues: body term"
|
|
(nth gs-R 4)
|
|
(list {:kind "issue" :ref "1"}))
|
|
(gitea-search-test
|
|
"issues: comment text indexed"
|
|
(nth gs-R 5)
|
|
(list {:kind "issue" :ref "1"}))
|
|
(gitea-search-test "prs: body term" (nth gs-R 6) (list {:kind "pr" :ref "3"}))
|
|
(gitea-search-test
|
|
"issues: AND query"
|
|
(nth gs-R 7)
|
|
(list {:kind "issue" :ref "1"}))
|
|
(gitea-search-test
|
|
"issues: title+body of second"
|
|
(len (filter (fn (r) (= r {:kind "issue" :ref "2"})) (nth gs-R 8)))
|
|
1)
|
|
|
|
(gitea-search-test
|
|
"mixed kinds found"
|
|
(len (nth gs-R 9))
|
|
3)
|
|
(gitea-search-test
|
|
"take-n limits results"
|
|
(len (nth gs-R 10))
|
|
2)
|
|
|
|
(gitea-search-test
|
|
"global anon excludes private"
|
|
(len
|
|
(filter (fn (r) (= (get r :repo) "alice/sec")) (nth gs-R 11)))
|
|
0)
|
|
(gitea-search-test
|
|
"global anon finds public code+issue"
|
|
(len (nth gs-R 11))
|
|
2)
|
|
(gitea-search-test
|
|
"global owner sees private"
|
|
(len
|
|
(filter (fn (r) (= (get r :repo) "alice/sec")) (nth gs-R 12)))
|
|
1)
|
|
(gitea-search-test
|
|
"global owner total"
|
|
(len (nth gs-R 12))
|
|
3)
|
|
|
|
; empty corpora short-circuit without an evaluation
|
|
(gitea-search-test
|
|
"empty corpus searches empty"
|
|
(gitea/search-repo
|
|
gs-forge
|
|
"alice"
|
|
"empty"
|
|
"banana"
|
|
gitea/search-kinds-all
|
|
10)
|
|
(list))
|
|
|
|
; ── web (one haskell evaluation per request) ─────────────────────────
|
|
|
|
(define gs-app (gitea/app gs-forge))
|
|
(define gs-hdr (fn (tok) (if (nil? tok) {} {:authorization (str "Bearer " tok)})))
|
|
(define
|
|
gs-get
|
|
(fn (target tok) (gs-app (dream-request "GET" target (gs-hdr tok) ""))))
|
|
|
|
(define gs-page (gs-get "/alice/proj/search?q=banana" nil))
|
|
(gitea-search-test "search page 200" (dream-status gs-page) 200)
|
|
(gitea-search-test
|
|
"search page links the hit"
|
|
(contains? (dream-resp-body gs-page) "src/main.sx")
|
|
true)
|
|
|
|
(gitea-search-test
|
|
"search page kind filter"
|
|
(contains?
|
|
(dream-resp-body (gs-get "/alice/proj/search?q=banana&kind=issue" nil))
|
|
"issues/1")
|
|
true)
|
|
(gitea-search-test
|
|
"search page missing q 400"
|
|
(dream-status (gs-get "/alice/proj/search" nil))
|
|
400)
|
|
(gitea-search-test
|
|
"private search anon 404"
|
|
(dream-status (gs-get "/alice/sec/search?q=banana" nil))
|
|
404)
|
|
|
|
(gitea-search-test
|
|
"api repo search"
|
|
(dream-json-parse
|
|
(dream-resp-body
|
|
(gs-get "/api/repos/alice/proj/search?q=cherry&kind=issue" nil)))
|
|
(list {:kind "issue" :ref "1"}))
|
|
(gitea-search-test
|
|
"api repo search missing q 400"
|
|
(dream-status (gs-get "/api/repos/alice/proj/search" nil))
|
|
400)
|
|
|
|
(define gs-glob-anon (gs-get "/api/search?q=banana" nil))
|
|
(gitea-search-test
|
|
"api global anon count"
|
|
(len (dream-json-parse (dream-resp-body gs-glob-anon)))
|
|
2)
|
|
(define gs-glob-own (gs-get "/api/search?q=banana" "tok-a"))
|
|
(gitea-search-test
|
|
"api global authed count"
|
|
(len (dream-json-parse (dream-resp-body gs-glob-own)))
|
|
3)
|