lib/gitea/search.sx: the forge builds document corpora SX-side — code files from the default branch head (path + blob text), issues (title + body + comments), PRs (title + body + reviews) — embeds them as one haskell-on-sx program and asks searchRankTfIdf for ranked doc ids (terms, AND/OR/NOT, phrases). Cost model honored: one evaluation parses the Haskell layers (~20s CPU), extra queries are nearly free — so the core primitive is gitea/search-multi (any number of corpora and queries in a single evaluation; each corpus an idxN binding) and only the six layers searchRankTfIdf needs are compiled, not the full search/src. The test suite runs its thirteen SX-level queries over five corpora as ONE evaluation. Global search spans exactly the repos the caller can read. Web: /:owner/:name/search page (kind filter), repo + global JSON search. Suite timeout raised to 900s for the haskell-backed suites. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
384 lines
10 KiB
Plaintext
384 lines
10 KiB
Plaintext
; lib/gitea/search.sx — sx-gitea Phase 7: code + issue/PR search.
|
|
;
|
|
; search-on-sx (haskell-on-sx) does the heavy lifting: the forge builds a
|
|
; document corpus SX-side — code files from the default branch head
|
|
; (path + blob text), issues (title + body + comment bodies), PRs (title
|
|
; + body + review bodies) — embeds it as one Haskell program, and asks
|
|
; searchRankTfIdf for ranked doc ids. Queries speak the search query
|
|
; language (terms, AND/OR/NOT, "phrases").
|
|
;
|
|
; Cost model: ONE evaluation parses the Haskell search layers (~seconds),
|
|
; extra queries are nearly free. So the core primitive is
|
|
; gitea/search-multi — ANY number of corpora and queries in a single
|
|
; evaluation (each corpus becomes an idxN binding) — and everything else
|
|
; sugars over it. Only the six layers searchRankTfIdf needs are compiled
|
|
; (tokenize/index/query/parse/rank/rankq), not the full search/src.
|
|
;
|
|
; Requires: lib/gitea/{repo,access,web,issues,pr}.sx and their stacks,
|
|
; lib/haskell/* + lib/search/{tokenize,index,query,parse,rank,rankq,
|
|
; testlib}.sx (search-hk->sx, hk-core, hk-eval-program).
|
|
|
|
; ── corpus ───────────────────────────────────────────────────────────
|
|
|
|
(define
|
|
gitea/code-docs
|
|
(fn
|
|
(forge owner name)
|
|
(let
|
|
((rec (gitea/repo-get forge owner name)))
|
|
(if
|
|
(nil? rec)
|
|
(list)
|
|
(let
|
|
((grepo (gitea/repo-git forge owner name)))
|
|
(let
|
|
((head-cid (git/branch-get grepo (get rec :default-branch))))
|
|
(if
|
|
(nil? head-cid)
|
|
(list)
|
|
(let
|
|
((files (git/commit-files grepo head-cid)))
|
|
(map
|
|
(fn (p) {:text (str p " " (get files p)) :kind "code" :ref p})
|
|
(artdag/sort-strings (keys files)))))))))))
|
|
|
|
(define
|
|
gitea/issue-docs
|
|
(fn
|
|
(forge owner name)
|
|
(map (fn (r) {:text (str (get r :title) " " (get r :body) " " (join " " (map (fn (c) (get c :body)) (get r :comments)))) :kind "issue" :ref (str (get r :number))}) (gitea/issue-records forge owner name))))
|
|
|
|
(define
|
|
gitea/pr-search-docs
|
|
(fn
|
|
(forge owner name)
|
|
(map (fn (r) {:text (str (get r :title) " " (get r :body) " " (join " " (map (fn (v) (get v :body)) (get r :reviews)))) :kind "pr" :ref (str (get r :number))}) (gitea/pr-records forge owner name))))
|
|
|
|
(define gitea/search-kinds-all (list "code" "issue" "pr"))
|
|
|
|
(define
|
|
gitea/repo-docs
|
|
(fn
|
|
(forge owner name kinds)
|
|
(filter
|
|
(fn (d) (contains? kinds (get d :kind)))
|
|
(concat
|
|
(gitea/code-docs forge owner name)
|
|
(concat
|
|
(gitea/issue-docs forge owner name)
|
|
(gitea/pr-search-docs forge owner name))))))
|
|
|
|
(define
|
|
gitea/visible-docs
|
|
(fn
|
|
(forge user kinds)
|
|
(reduce
|
|
(fn
|
|
(acc full)
|
|
(let
|
|
((p (gitea/split-full full)))
|
|
(concat
|
|
acc
|
|
(map
|
|
(fn (d) (assoc d :repo full))
|
|
(gitea/repo-docs forge (get p :owner) (get p :name) kinds)))))
|
|
(list)
|
|
(gitea/visible-repos forge user))))
|
|
|
|
; ── haskell program assembly ─────────────────────────────────────────
|
|
|
|
; only the layers searchRankTfIdf needs — parsing is the dominant cost
|
|
(define
|
|
gitea/search-src
|
|
(str
|
|
search/tokenize-src
|
|
"\n"
|
|
search/index-src
|
|
"\n"
|
|
search/query-src
|
|
"\n"
|
|
search/parse-src
|
|
"\n"
|
|
search/rank-src
|
|
"\n"
|
|
search/rankq-src
|
|
"\n"))
|
|
|
|
(define
|
|
gitea/hk-escape
|
|
(fn
|
|
(s)
|
|
(replace
|
|
(replace (replace (replace (or s "") "\\" "\\\\") "\"" "\\\"") "\n" "\\n")
|
|
"\t"
|
|
"\\t")))
|
|
|
|
; <name> = indexDoc k "text_k" ( ... (indexDoc 1 "text_1" emptyIndex))
|
|
(define
|
|
gitea/index-binding
|
|
(fn
|
|
(idxname docs)
|
|
(str
|
|
idxname
|
|
" = "
|
|
(reduce
|
|
(fn
|
|
(acc pair)
|
|
(str
|
|
"indexDoc "
|
|
(first pair)
|
|
" \""
|
|
(gitea/hk-escape (get (nth pair 1) :text))
|
|
"\" ("
|
|
acc
|
|
")"))
|
|
"emptyIndex"
|
|
(map-indexed (fn (i d) (list (+ i 1) d)) docs))
|
|
"\n")))
|
|
|
|
(define
|
|
gitea/hk-search-eval
|
|
(fn
|
|
(extra)
|
|
(search-hk->sx
|
|
(hk-deep-force
|
|
(get
|
|
(hk-eval-program (hk-core (str gitea/search-src extra)))
|
|
"result")))))
|
|
|
|
; THE core: corpora = (docs ...), specs = ({:corpus i :query q :n n} ...).
|
|
; One haskell evaluation; => one ranked {:kind :ref (:repo)} list per spec.
|
|
(define
|
|
gitea/search-multi
|
|
(fn
|
|
(corpora specs)
|
|
(let
|
|
((bindings (join "" (map-indexed (fn (i docs) (gitea/index-binding (str "idx" i) docs)) corpora)))
|
|
(exprs
|
|
(map
|
|
(fn
|
|
(s)
|
|
(if
|
|
(empty? (nth corpora (get s :corpus)))
|
|
"[]"
|
|
(str
|
|
"take "
|
|
(get s :n)
|
|
" (searchRankTfIdf \""
|
|
(gitea/hk-escape (get s :query))
|
|
"\" idx"
|
|
(get s :corpus)
|
|
")")))
|
|
specs)))
|
|
(let
|
|
((id-lists (gitea/hk-search-eval (str bindings "result = [" (join ", " exprs) "]\n"))))
|
|
(map-indexed
|
|
(fn
|
|
(si ids)
|
|
(let
|
|
((docs (nth corpora (get (nth specs si) :corpus))))
|
|
(map
|
|
(fn
|
|
(i)
|
|
(let
|
|
((d (nth docs (- i 1))))
|
|
(if (get d :repo) {:repo (get d :repo) :kind (get d :kind) :ref (get d :ref)} {:kind (get d :kind) :ref (get d :ref)})))
|
|
ids)))
|
|
id-lists)))))
|
|
|
|
; many queries against ONE corpus, still one evaluation
|
|
(define
|
|
gitea/search-many
|
|
(fn
|
|
(docs queries n)
|
|
(if
|
|
(empty? docs)
|
|
(map (fn (q) (list)) queries)
|
|
(gitea/search-multi (list docs) (map (fn (q) {:n n :query q :corpus 0}) queries)))))
|
|
|
|
; ── repo-scoped search ───────────────────────────────────────────────
|
|
|
|
(define
|
|
gitea/search-repo
|
|
(fn
|
|
(forge owner name query kinds n)
|
|
(first
|
|
(gitea/search-many
|
|
(gitea/repo-docs forge owner name kinds)
|
|
(list query)
|
|
n))))
|
|
|
|
(define
|
|
gitea/search-code
|
|
(fn
|
|
(forge owner name query n)
|
|
(gitea/search-repo forge owner name query (list "code") n)))
|
|
|
|
(define
|
|
gitea/search-issues
|
|
(fn
|
|
(forge owner name query n)
|
|
(gitea/search-repo forge owner name query (list "issue") n)))
|
|
|
|
(define
|
|
gitea/search-prs
|
|
(fn
|
|
(forge owner name query n)
|
|
(gitea/search-repo forge owner name query (list "pr") n)))
|
|
|
|
; ── global search over visible repos ─────────────────────────────────
|
|
|
|
(define
|
|
gitea/search-visible
|
|
(fn
|
|
(forge user query kinds n)
|
|
(first
|
|
(gitea/search-many
|
|
(gitea/visible-docs forge user kinds)
|
|
(list query)
|
|
n))))
|
|
|
|
; ── web ──────────────────────────────────────────────────────────────
|
|
|
|
(define gitea/w-query-param (fn (req name) (get (get req :query) name)))
|
|
|
|
(define
|
|
gitea/w-search-kinds
|
|
(fn
|
|
(req)
|
|
(let
|
|
((k (gitea/w-query-param req "kind")))
|
|
(if
|
|
(contains? gitea/search-kinds-all k)
|
|
(list k)
|
|
gitea/search-kinds-all))))
|
|
|
|
(define
|
|
gitea/w-search-item
|
|
(fn
|
|
(owner name r)
|
|
(let
|
|
((kind (get r :kind)) (ref (get r :ref)))
|
|
(cond
|
|
((= kind "code")
|
|
(str
|
|
"<li class=\"code\"><a href=\"/"
|
|
owner
|
|
"/"
|
|
name
|
|
"/blob/main/"
|
|
ref
|
|
"\">"
|
|
(dream-escape ref)
|
|
"</a></li>"))
|
|
((= kind "issue")
|
|
(str
|
|
"<li class=\"issue\"><a href=\"/"
|
|
owner
|
|
"/"
|
|
name
|
|
"/issues/"
|
|
ref
|
|
"\">#"
|
|
ref
|
|
"</a></li>"))
|
|
(else
|
|
(str
|
|
"<li class=\"pr\"><a href=\"/"
|
|
owner
|
|
"/"
|
|
name
|
|
"/pulls/"
|
|
ref
|
|
"\">#"
|
|
ref
|
|
"</a></li>"))))))
|
|
|
|
(define
|
|
gitea/w-search-page
|
|
(fn
|
|
(forge req)
|
|
(let
|
|
((owner (dream-param req "owner"))
|
|
(name (dream-param req "name"))
|
|
(q (gitea/w-query-param req "q")))
|
|
(cond
|
|
((not (gitea/w-readable? forge req owner name)) (dream-not-found))
|
|
((or (nil? q) (= q ""))
|
|
(gitea/w-json-status 400 {:error "missing-query"}))
|
|
(else
|
|
(gitea/w-page
|
|
(str "search: " q)
|
|
(str
|
|
"<h1>Search</h1><p>"
|
|
(dream-escape q)
|
|
"</p><ul>"
|
|
(join
|
|
""
|
|
(map
|
|
(fn (r) (gitea/w-search-item owner name r))
|
|
(gitea/search-repo
|
|
forge
|
|
owner
|
|
name
|
|
q
|
|
(gitea/w-search-kinds req)
|
|
20)))
|
|
"</ul>")))))))
|
|
|
|
(define
|
|
gitea/w-api-search-repo
|
|
(fn
|
|
(forge req)
|
|
(let
|
|
((owner (dream-param req "owner"))
|
|
(name (dream-param req "name"))
|
|
(q (gitea/w-query-param req "q")))
|
|
(cond
|
|
((not (gitea/w-readable? forge req owner name)) (dream-not-found))
|
|
((or (nil? q) (= q ""))
|
|
(gitea/w-json-status 400 {:error "missing-query"}))
|
|
(else
|
|
(dream-json-value
|
|
(gitea/search-repo
|
|
forge
|
|
owner
|
|
name
|
|
q
|
|
(gitea/w-search-kinds req)
|
|
20)))))))
|
|
|
|
(define
|
|
gitea/w-api-search
|
|
(fn
|
|
(forge req)
|
|
(let
|
|
((q (gitea/w-query-param req "q")))
|
|
(if
|
|
(or (nil? q) (= q ""))
|
|
(gitea/w-json-status 400 {:error "missing-query"})
|
|
(dream-json-value
|
|
(gitea/search-visible
|
|
forge
|
|
(gitea/w-user forge req)
|
|
q
|
|
(gitea/w-search-kinds req)
|
|
20))))))
|
|
|
|
(define
|
|
gitea/search-routes
|
|
(fn
|
|
(forge)
|
|
(list
|
|
(dream-get
|
|
"/:owner/:name/search"
|
|
(fn (req) (gitea/w-search-page forge req)))
|
|
(dream-get
|
|
"/api/repos/:owner/:name/search"
|
|
(fn (req) (gitea/w-api-search-repo forge req)))
|
|
(dream-get "/api/search" (fn (req) (gitea/w-api-search forge req))))))
|
|
|
|
(set!
|
|
gitea/route-packs
|
|
(append gitea/route-packs (list gitea/search-routes)))
|