Files
rose-ash/lib/haskell/tokenizer.sx
giles 99753580b4 Recover agent-loop progress: lua/prolog/forth/erlang/haskell phases 1-2
Salvaged from worktree-agent-* branches killed during sx-tree MCP outage:
- lua: tokenizer + parser + phase-2 transpile (~157 tests)
- prolog: tokenizer + parser + unification (72 tests, plan update lost to WIP)
- forth: phase-1 reader/interpreter + phase-2 colon/VARIABLE (134 tests)
- erlang: tokenizer + parser (114 tests)
- haskell: tokenizer + parse tests (43 tests)

Cherry-picked file contents only, not branch history, to avoid pulling in
unrelated ocaml-vm merge commits that were in those branches' bases.
2026-04-24 16:03:00 +00:00

629 lines
20 KiB
Plaintext

;; Haskell tokenizer — produces a token stream from Haskell 98 source.
;;
;; Tokens: {:type T :value V :line L :col C}
;;
;; Types:
;; "varid" lowercase ident, e.g. fmap, x, myFunc
;; "conid" uppercase ident, e.g. Nothing, Just, Map
;; "qvarid" qualified varid, value holds raw "A.B.foo"
;; "qconid" qualified conid, e.g. "Data.Map"
;; "reserved" reserved word — value is the word
;; "varsym" operator symbol, e.g. +, ++, >>=
;; "consym" constructor operator (starts with :), e.g. :, :+
;; "reservedop" reserved operator ("::", "=", "->", "<-", "=>", "|", "\\", "@", "~", "..")
;; "integer" integer literal (number)
;; "float" float literal (number)
;; "char" char literal (string of length 1)
;; "string" string literal
;; "lparen" "rparen" "lbracket" "rbracket" "lbrace" "rbrace"
;; "vlbrace" "vrbrace" "vsemi" virtual layout tokens (inserted later)
;; "comma" "semi" "backtick"
;; "newline" a logical line break (used by layout pass; stripped afterwards)
;; "eof"
;;
;; Note: SX `cond`/`when` clauses evaluate ONLY their last expression.
;; Multi-expression bodies must be wrapped in (do ...). All helpers use
;; the hk- prefix to avoid clashing with SX evaluator special forms.
;; ── Char-code table ───────────────────────────────────────────────
(define
hk-ord-table
(let
((t (dict)) (i 0))
(define
hk-build-table
(fn
()
(when
(< i 128)
(do
(dict-set! t (char-from-code i) i)
(set! i (+ i 1))
(hk-build-table)))))
(hk-build-table)
t))
(define hk-ord (fn (c) (or (get hk-ord-table c) 0)))
;; ── Character predicates ──────────────────────────────────────────
(define
hk-digit?
(fn (c) (and (string? c) (>= (hk-ord c) 48) (<= (hk-ord c) 57))))
(define
hk-hex-digit?
(fn
(c)
(and
(string? c)
(or
(and (>= (hk-ord c) 48) (<= (hk-ord c) 57))
(and (>= (hk-ord c) 97) (<= (hk-ord c) 102))
(and (>= (hk-ord c) 65) (<= (hk-ord c) 70))))))
(define
hk-octal-digit?
(fn (c) (and (string? c) (>= (hk-ord c) 48) (<= (hk-ord c) 55))))
(define
hk-lower?
(fn
(c)
(and
(string? c)
(or (and (>= (hk-ord c) 97) (<= (hk-ord c) 122)) (= c "_")))))
(define
hk-upper?
(fn (c) (and (string? c) (>= (hk-ord c) 65) (<= (hk-ord c) 90))))
(define hk-alpha? (fn (c) (or (hk-lower? c) (hk-upper? c))))
(define
hk-ident-char?
(fn (c) (or (hk-alpha? c) (hk-digit? c) (= c "'"))))
(define
hk-symbol-char?
(fn
(c)
(or
(= c "!")
(= c "#")
(= c "$")
(= c "%")
(= c "&")
(= c "*")
(= c "+")
(= c ".")
(= c "/")
(= c "<")
(= c "=")
(= c ">")
(= c "?")
(= c "@")
(= c "\\")
(= c "^")
(= c "|")
(= c "-")
(= c "~")
(= c ":"))))
(define hk-space? (fn (c) (or (= c " ") (= c "\t"))))
;; ── Hex / oct parser (parse-int is decimal only) ──────────────────
(define
hk-parse-radix
(fn
(s radix)
(let
((n-len (len s)) (idx 0) (acc 0))
(define
hk-rad-loop
(fn
()
(when
(< idx n-len)
(do
(let
((c (substring s idx (+ idx 1))))
(cond
((and (>= (hk-ord c) 48) (<= (hk-ord c) 57))
(set! acc (+ (* acc radix) (- (hk-ord c) 48))))
((and (>= (hk-ord c) 97) (<= (hk-ord c) 102))
(set! acc (+ (* acc radix) (+ 10 (- (hk-ord c) 97)))))
((and (>= (hk-ord c) 65) (<= (hk-ord c) 70))
(set! acc (+ (* acc radix) (+ 10 (- (hk-ord c) 65)))))))
(set! idx (+ idx 1))
(hk-rad-loop)))))
(hk-rad-loop)
acc)))
(define
hk-parse-float
(fn
(s)
(let
((n-len (len s))
(idx 0)
(sign 1)
(int-part 0)
(frac-part 0)
(frac-div 1)
(exp-sign 1)
(exp-val 0)
(has-exp false))
(when
(and (< idx n-len) (= (substring s idx (+ idx 1)) "-"))
(do (set! sign -1) (set! idx (+ idx 1))))
(when
(and (< idx n-len) (= (substring s idx (+ idx 1)) "+"))
(set! idx (+ idx 1)))
(define
hk-int-loop
(fn
()
(when
(and (< idx n-len) (hk-digit? (substring s idx (+ idx 1))))
(do
(set!
int-part
(+ (* int-part 10) (parse-int (substring s idx (+ idx 1)))))
(set! idx (+ idx 1))
(hk-int-loop)))))
(hk-int-loop)
(when
(and (< idx n-len) (= (substring s idx (+ idx 1)) "."))
(do
(set! idx (+ idx 1))
(define
hk-frac-loop
(fn
()
(when
(and (< idx n-len) (hk-digit? (substring s idx (+ idx 1))))
(do
(set! frac-div (* frac-div 10))
(set!
frac-part
(+
frac-part
(/ (parse-int (substring s idx (+ idx 1))) frac-div)))
(set! idx (+ idx 1))
(hk-frac-loop)))))
(hk-frac-loop)))
(when
(and
(< idx n-len)
(let
((c (substring s idx (+ idx 1))))
(or (= c "e") (= c "E"))))
(do
(set! has-exp true)
(set! idx (+ idx 1))
(cond
((and (< idx n-len) (= (substring s idx (+ idx 1)) "-"))
(do (set! exp-sign -1) (set! idx (+ idx 1))))
((and (< idx n-len) (= (substring s idx (+ idx 1)) "+"))
(set! idx (+ idx 1))))
(define
hk-exp-loop
(fn
()
(when
(and (< idx n-len) (hk-digit? (substring s idx (+ idx 1))))
(do
(set!
exp-val
(+
(* exp-val 10)
(parse-int (substring s idx (+ idx 1)))))
(set! idx (+ idx 1))
(hk-exp-loop)))))
(hk-exp-loop)))
(let
((base (* sign (+ int-part frac-part))))
(if has-exp (* base (pow 10 (* exp-sign exp-val))) base)))))
;; ── Reserved words / ops ──────────────────────────────────────────
(define
hk-reserved-words
(list
"case"
"class"
"data"
"default"
"deriving"
"do"
"else"
"foreign"
"if"
"import"
"in"
"infix"
"infixl"
"infixr"
"instance"
"let"
"module"
"newtype"
"of"
"then"
"type"
"where"
"_"))
(define hk-reserved? (fn (w) (contains? hk-reserved-words w)))
(define
hk-reserved-ops
(list ".." ":" "::" "=" "\\" "|" "<-" "->" "@" "~" "=>"))
(define hk-reserved-op? (fn (w) (contains? hk-reserved-ops w)))
;; ── Token constructor ─────────────────────────────────────────────
(define hk-make-token (fn (type value line col) {:line line :value value :col col :type type}))
;; ── Main tokenizer ────────────────────────────────────────────────
(define
hk-tokenize
(fn
(src)
(let
((tokens (list)) (pos 0) (line 1) (col 1) (src-len (len src)))
(define
hk-peek
(fn
(offset)
(if
(< (+ pos offset) src-len)
(substring src (+ pos offset) (+ pos offset 1))
nil)))
(define hk-cur (fn () (hk-peek 0)))
(define
hk-advance!
(fn
()
(let
((c (hk-cur)))
(set! pos (+ pos 1))
(if
(= c "\n")
(do (set! line (+ line 1)) (set! col 1))
(set! col (+ col 1))))))
(define
hk-advance-n!
(fn
(n)
(when (> n 0) (do (hk-advance!) (hk-advance-n! (- n 1))))))
(define
hk-push!
(fn
(type value tok-line tok-col)
(append! tokens (hk-make-token type value tok-line tok-col))))
(define
hk-read-while
(fn
(pred)
(let
((start pos))
(define
hk-rw-loop
(fn
()
(when
(and (< pos src-len) (pred (hk-cur)))
(do (hk-advance!) (hk-rw-loop)))))
(hk-rw-loop)
(substring src start pos))))
(define
hk-skip-line-comment!
(fn
()
(define
hk-slc-loop
(fn
()
(when
(and (< pos src-len) (not (= (hk-cur) "\n")))
(do (hk-advance!) (hk-slc-loop)))))
(hk-slc-loop)))
(define
hk-skip-block-comment!
(fn
()
(hk-advance-n! 2)
(let
((depth 1))
(define
hk-sbc-loop
(fn
()
(cond
((>= pos src-len) nil)
((and (= (hk-cur) "{") (= (hk-peek 1) "-"))
(do
(hk-advance-n! 2)
(set! depth (+ depth 1))
(hk-sbc-loop)))
((and (= (hk-cur) "-") (= (hk-peek 1) "}"))
(do
(hk-advance-n! 2)
(set! depth (- depth 1))
(when (> depth 0) (hk-sbc-loop))))
(:else (do (hk-advance!) (hk-sbc-loop))))))
(hk-sbc-loop))))
(define
hk-read-escape
(fn
()
(hk-advance!)
(let
((c (hk-cur)))
(cond
((= c "n") (do (hk-advance!) "\n"))
((= c "t") (do (hk-advance!) "\t"))
((= c "r") (do (hk-advance!) "\r"))
((= c "\\") (do (hk-advance!) "\\"))
((= c "'") (do (hk-advance!) "'"))
((= c "\"") (do (hk-advance!) "\""))
((= c "0") (do (hk-advance!) (char-from-code 0)))
((= c "a") (do (hk-advance!) (char-from-code 7)))
((= c "b") (do (hk-advance!) (char-from-code 8)))
((= c "f") (do (hk-advance!) (char-from-code 12)))
((= c "v") (do (hk-advance!) (char-from-code 11)))
((hk-digit? c)
(let
((digits (hk-read-while hk-digit?)))
(char-from-code (parse-int digits))))
(:else (do (hk-advance!) (str "\\" c)))))))
(define
hk-read-string
(fn
()
(let
((parts (list)))
(hk-advance!)
(define
hk-rs-loop
(fn
()
(cond
((>= pos src-len) nil)
((= (hk-cur) "\"") (hk-advance!))
((= (hk-cur) "\\")
(do (append! parts (hk-read-escape)) (hk-rs-loop)))
(:else
(do
(append! parts (hk-cur))
(hk-advance!)
(hk-rs-loop))))))
(hk-rs-loop)
(join "" parts))))
(define
hk-read-char-lit
(fn
()
(hk-advance!)
(let
((c (if (= (hk-cur) "\\") (hk-read-escape) (let ((ch (hk-cur))) (hk-advance!) ch))))
(when (= (hk-cur) "'") (hk-advance!))
c)))
(define
hk-read-number
(fn
(tok-line tok-col)
(let
((start pos))
(cond
((and (= (hk-cur) "0") (or (= (hk-peek 1) "x") (= (hk-peek 1) "X")))
(do
(hk-advance-n! 2)
(let
((hex-start pos))
(hk-read-while hk-hex-digit?)
(hk-push!
"integer"
(hk-parse-radix (substring src hex-start pos) 16)
tok-line
tok-col))))
((and (= (hk-cur) "0") (or (= (hk-peek 1) "o") (= (hk-peek 1) "O")))
(do
(hk-advance-n! 2)
(let
((oct-start pos))
(hk-read-while hk-octal-digit?)
(hk-push!
"integer"
(hk-parse-radix (substring src oct-start pos) 8)
tok-line
tok-col))))
(:else
(do
(hk-read-while hk-digit?)
(let
((is-float false))
(when
(and (= (hk-cur) ".") (hk-digit? (hk-peek 1)))
(do
(set! is-float true)
(hk-advance!)
(hk-read-while hk-digit?)))
(when
(or (= (hk-cur) "e") (= (hk-cur) "E"))
(do
(set! is-float true)
(hk-advance!)
(when
(or (= (hk-cur) "+") (= (hk-cur) "-"))
(hk-advance!))
(hk-read-while hk-digit?)))
(let
((num-str (substring src start pos)))
(if
is-float
(hk-push!
"float"
(hk-parse-float num-str)
tok-line
tok-col)
(hk-push!
"integer"
(parse-int num-str)
tok-line
tok-col))))))))))
(define
hk-read-qualified!
(fn
(tok-line tok-col)
(let
((parts (list)) (w (hk-read-while hk-ident-char?)))
(append! parts w)
(let
((emitted false))
(define
hk-rq-loop
(fn
()
(when
(and
(not emitted)
(= (hk-cur) ".")
(or
(hk-upper? (hk-peek 1))
(hk-lower? (hk-peek 1))
(hk-symbol-char? (hk-peek 1))))
(let
((next (hk-peek 1)))
(cond
((hk-upper? next)
(do
(hk-advance!)
(append! parts ".")
(append! parts (hk-read-while hk-ident-char?))
(hk-rq-loop)))
((hk-lower? next)
(do
(hk-advance!)
(set! emitted true)
(hk-push!
"qvarid"
(str
(join "" parts)
"."
(hk-read-while hk-ident-char?))
tok-line
tok-col)))
((hk-symbol-char? next)
(do
(hk-advance!)
(set! emitted true)
(hk-push!
"varsym"
(str
(join "" parts)
"."
(hk-read-while hk-symbol-char?))
tok-line
tok-col))))))))
(hk-rq-loop)
(when
(not emitted)
(let
((full (join "" parts)))
(if
(string-contains? full ".")
(hk-push! "qconid" full tok-line tok-col)
(hk-push! "conid" full tok-line tok-col))))))))
(define
hk-scan!
(fn
()
(cond
((>= pos src-len) nil)
((hk-space? (hk-cur)) (do (hk-advance!) (hk-scan!)))
((= (hk-cur) "\n")
(do
(let
((l line) (c col))
(hk-advance!)
(hk-push! "newline" nil l c))
(hk-scan!)))
((and (= (hk-cur) "{") (= (hk-peek 1) "-"))
(do (hk-skip-block-comment!) (hk-scan!)))
((and (= (hk-cur) "-") (= (hk-peek 1) "-") (let ((p2 (hk-peek 2))) (or (nil? p2) (= p2 "\n") (not (hk-symbol-char? p2)))))
(do (hk-skip-line-comment!) (hk-scan!)))
((= (hk-cur) "\"")
(do
(let
((l line) (c col))
(hk-push! "string" (hk-read-string) l c))
(hk-scan!)))
((= (hk-cur) "'")
(do
(let
((l line) (c col))
(hk-push! "char" (hk-read-char-lit) l c))
(hk-scan!)))
((hk-digit? (hk-cur))
(do (hk-read-number line col) (hk-scan!)))
((hk-lower? (hk-cur))
(do
(let
((l line) (c col))
(let
((w (hk-read-while hk-ident-char?)))
(if
(hk-reserved? w)
(hk-push! "reserved" w l c)
(hk-push! "varid" w l c))))
(hk-scan!)))
((hk-upper? (hk-cur))
(do
(let ((l line) (c col)) (hk-read-qualified! l c))
(hk-scan!)))
((= (hk-cur) "(")
(do (hk-push! "lparen" "(" line col) (hk-advance!) (hk-scan!)))
((= (hk-cur) ")")
(do (hk-push! "rparen" ")" line col) (hk-advance!) (hk-scan!)))
((= (hk-cur) "[")
(do
(hk-push! "lbracket" "[" line col)
(hk-advance!)
(hk-scan!)))
((= (hk-cur) "]")
(do
(hk-push! "rbracket" "]" line col)
(hk-advance!)
(hk-scan!)))
((= (hk-cur) "{")
(do (hk-push! "lbrace" "{" line col) (hk-advance!) (hk-scan!)))
((= (hk-cur) "}")
(do (hk-push! "rbrace" "}" line col) (hk-advance!) (hk-scan!)))
((= (hk-cur) ",")
(do (hk-push! "comma" "," line col) (hk-advance!) (hk-scan!)))
((= (hk-cur) ";")
(do (hk-push! "semi" ";" line col) (hk-advance!) (hk-scan!)))
((= (hk-cur) "`")
(do
(hk-push! "backtick" "`" line col)
(hk-advance!)
(hk-scan!)))
((hk-symbol-char? (hk-cur))
(do
(let
((l line) (c col))
(let
((first (hk-cur)))
(let
((w (hk-read-while hk-symbol-char?)))
(cond
((hk-reserved-op? w) (hk-push! "reservedop" w l c))
((= first ":") (hk-push! "consym" w l c))
(:else (hk-push! "varsym" w l c))))))
(hk-scan!)))
(:else (do (hk-advance!) (hk-scan!))))))
(hk-scan!)
(hk-push! "eof" nil line col)
tokens)))