Salvaged from worktree-agent-* branches killed during sx-tree MCP outage: - lua: tokenizer + parser + phase-2 transpile (~157 tests) - prolog: tokenizer + parser + unification (72 tests, plan update lost to WIP) - forth: phase-1 reader/interpreter + phase-2 colon/VARIABLE (134 tests) - erlang: tokenizer + parser (114 tests) - haskell: tokenizer + parse tests (43 tests) Cherry-picked file contents only, not branch history, to avoid pulling in unrelated ocaml-vm merge commits that were in those branches' bases.
629 lines
20 KiB
Plaintext
629 lines
20 KiB
Plaintext
;; Haskell tokenizer — produces a token stream from Haskell 98 source.
|
|
;;
|
|
;; Tokens: {:type T :value V :line L :col C}
|
|
;;
|
|
;; Types:
|
|
;; "varid" lowercase ident, e.g. fmap, x, myFunc
|
|
;; "conid" uppercase ident, e.g. Nothing, Just, Map
|
|
;; "qvarid" qualified varid, value holds raw "A.B.foo"
|
|
;; "qconid" qualified conid, e.g. "Data.Map"
|
|
;; "reserved" reserved word — value is the word
|
|
;; "varsym" operator symbol, e.g. +, ++, >>=
|
|
;; "consym" constructor operator (starts with :), e.g. :, :+
|
|
;; "reservedop" reserved operator ("::", "=", "->", "<-", "=>", "|", "\\", "@", "~", "..")
|
|
;; "integer" integer literal (number)
|
|
;; "float" float literal (number)
|
|
;; "char" char literal (string of length 1)
|
|
;; "string" string literal
|
|
;; "lparen" "rparen" "lbracket" "rbracket" "lbrace" "rbrace"
|
|
;; "vlbrace" "vrbrace" "vsemi" virtual layout tokens (inserted later)
|
|
;; "comma" "semi" "backtick"
|
|
;; "newline" a logical line break (used by layout pass; stripped afterwards)
|
|
;; "eof"
|
|
;;
|
|
;; Note: SX `cond`/`when` clauses evaluate ONLY their last expression.
|
|
;; Multi-expression bodies must be wrapped in (do ...). All helpers use
|
|
;; the hk- prefix to avoid clashing with SX evaluator special forms.
|
|
|
|
;; ── Char-code table ───────────────────────────────────────────────
|
|
(define
|
|
hk-ord-table
|
|
(let
|
|
((t (dict)) (i 0))
|
|
(define
|
|
hk-build-table
|
|
(fn
|
|
()
|
|
(when
|
|
(< i 128)
|
|
(do
|
|
(dict-set! t (char-from-code i) i)
|
|
(set! i (+ i 1))
|
|
(hk-build-table)))))
|
|
(hk-build-table)
|
|
t))
|
|
|
|
(define hk-ord (fn (c) (or (get hk-ord-table c) 0)))
|
|
|
|
;; ── Character predicates ──────────────────────────────────────────
|
|
(define
|
|
hk-digit?
|
|
(fn (c) (and (string? c) (>= (hk-ord c) 48) (<= (hk-ord c) 57))))
|
|
|
|
(define
|
|
hk-hex-digit?
|
|
(fn
|
|
(c)
|
|
(and
|
|
(string? c)
|
|
(or
|
|
(and (>= (hk-ord c) 48) (<= (hk-ord c) 57))
|
|
(and (>= (hk-ord c) 97) (<= (hk-ord c) 102))
|
|
(and (>= (hk-ord c) 65) (<= (hk-ord c) 70))))))
|
|
|
|
(define
|
|
hk-octal-digit?
|
|
(fn (c) (and (string? c) (>= (hk-ord c) 48) (<= (hk-ord c) 55))))
|
|
|
|
(define
|
|
hk-lower?
|
|
(fn
|
|
(c)
|
|
(and
|
|
(string? c)
|
|
(or (and (>= (hk-ord c) 97) (<= (hk-ord c) 122)) (= c "_")))))
|
|
|
|
(define
|
|
hk-upper?
|
|
(fn (c) (and (string? c) (>= (hk-ord c) 65) (<= (hk-ord c) 90))))
|
|
|
|
(define hk-alpha? (fn (c) (or (hk-lower? c) (hk-upper? c))))
|
|
|
|
(define
|
|
hk-ident-char?
|
|
(fn (c) (or (hk-alpha? c) (hk-digit? c) (= c "'"))))
|
|
|
|
(define
|
|
hk-symbol-char?
|
|
(fn
|
|
(c)
|
|
(or
|
|
(= c "!")
|
|
(= c "#")
|
|
(= c "$")
|
|
(= c "%")
|
|
(= c "&")
|
|
(= c "*")
|
|
(= c "+")
|
|
(= c ".")
|
|
(= c "/")
|
|
(= c "<")
|
|
(= c "=")
|
|
(= c ">")
|
|
(= c "?")
|
|
(= c "@")
|
|
(= c "\\")
|
|
(= c "^")
|
|
(= c "|")
|
|
(= c "-")
|
|
(= c "~")
|
|
(= c ":"))))
|
|
|
|
(define hk-space? (fn (c) (or (= c " ") (= c "\t"))))
|
|
|
|
;; ── Hex / oct parser (parse-int is decimal only) ──────────────────
|
|
(define
|
|
hk-parse-radix
|
|
(fn
|
|
(s radix)
|
|
(let
|
|
((n-len (len s)) (idx 0) (acc 0))
|
|
(define
|
|
hk-rad-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(< idx n-len)
|
|
(do
|
|
(let
|
|
((c (substring s idx (+ idx 1))))
|
|
(cond
|
|
((and (>= (hk-ord c) 48) (<= (hk-ord c) 57))
|
|
(set! acc (+ (* acc radix) (- (hk-ord c) 48))))
|
|
((and (>= (hk-ord c) 97) (<= (hk-ord c) 102))
|
|
(set! acc (+ (* acc radix) (+ 10 (- (hk-ord c) 97)))))
|
|
((and (>= (hk-ord c) 65) (<= (hk-ord c) 70))
|
|
(set! acc (+ (* acc radix) (+ 10 (- (hk-ord c) 65)))))))
|
|
(set! idx (+ idx 1))
|
|
(hk-rad-loop)))))
|
|
(hk-rad-loop)
|
|
acc)))
|
|
|
|
(define
|
|
hk-parse-float
|
|
(fn
|
|
(s)
|
|
(let
|
|
((n-len (len s))
|
|
(idx 0)
|
|
(sign 1)
|
|
(int-part 0)
|
|
(frac-part 0)
|
|
(frac-div 1)
|
|
(exp-sign 1)
|
|
(exp-val 0)
|
|
(has-exp false))
|
|
(when
|
|
(and (< idx n-len) (= (substring s idx (+ idx 1)) "-"))
|
|
(do (set! sign -1) (set! idx (+ idx 1))))
|
|
(when
|
|
(and (< idx n-len) (= (substring s idx (+ idx 1)) "+"))
|
|
(set! idx (+ idx 1)))
|
|
(define
|
|
hk-int-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< idx n-len) (hk-digit? (substring s idx (+ idx 1))))
|
|
(do
|
|
(set!
|
|
int-part
|
|
(+ (* int-part 10) (parse-int (substring s idx (+ idx 1)))))
|
|
(set! idx (+ idx 1))
|
|
(hk-int-loop)))))
|
|
(hk-int-loop)
|
|
(when
|
|
(and (< idx n-len) (= (substring s idx (+ idx 1)) "."))
|
|
(do
|
|
(set! idx (+ idx 1))
|
|
(define
|
|
hk-frac-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< idx n-len) (hk-digit? (substring s idx (+ idx 1))))
|
|
(do
|
|
(set! frac-div (* frac-div 10))
|
|
(set!
|
|
frac-part
|
|
(+
|
|
frac-part
|
|
(/ (parse-int (substring s idx (+ idx 1))) frac-div)))
|
|
(set! idx (+ idx 1))
|
|
(hk-frac-loop)))))
|
|
(hk-frac-loop)))
|
|
(when
|
|
(and
|
|
(< idx n-len)
|
|
(let
|
|
((c (substring s idx (+ idx 1))))
|
|
(or (= c "e") (= c "E"))))
|
|
(do
|
|
(set! has-exp true)
|
|
(set! idx (+ idx 1))
|
|
(cond
|
|
((and (< idx n-len) (= (substring s idx (+ idx 1)) "-"))
|
|
(do (set! exp-sign -1) (set! idx (+ idx 1))))
|
|
((and (< idx n-len) (= (substring s idx (+ idx 1)) "+"))
|
|
(set! idx (+ idx 1))))
|
|
(define
|
|
hk-exp-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< idx n-len) (hk-digit? (substring s idx (+ idx 1))))
|
|
(do
|
|
(set!
|
|
exp-val
|
|
(+
|
|
(* exp-val 10)
|
|
(parse-int (substring s idx (+ idx 1)))))
|
|
(set! idx (+ idx 1))
|
|
(hk-exp-loop)))))
|
|
(hk-exp-loop)))
|
|
(let
|
|
((base (* sign (+ int-part frac-part))))
|
|
(if has-exp (* base (pow 10 (* exp-sign exp-val))) base)))))
|
|
|
|
;; ── Reserved words / ops ──────────────────────────────────────────
|
|
(define
|
|
hk-reserved-words
|
|
(list
|
|
"case"
|
|
"class"
|
|
"data"
|
|
"default"
|
|
"deriving"
|
|
"do"
|
|
"else"
|
|
"foreign"
|
|
"if"
|
|
"import"
|
|
"in"
|
|
"infix"
|
|
"infixl"
|
|
"infixr"
|
|
"instance"
|
|
"let"
|
|
"module"
|
|
"newtype"
|
|
"of"
|
|
"then"
|
|
"type"
|
|
"where"
|
|
"_"))
|
|
|
|
(define hk-reserved? (fn (w) (contains? hk-reserved-words w)))
|
|
|
|
(define
|
|
hk-reserved-ops
|
|
(list ".." ":" "::" "=" "\\" "|" "<-" "->" "@" "~" "=>"))
|
|
|
|
(define hk-reserved-op? (fn (w) (contains? hk-reserved-ops w)))
|
|
|
|
;; ── Token constructor ─────────────────────────────────────────────
|
|
(define hk-make-token (fn (type value line col) {:line line :value value :col col :type type}))
|
|
|
|
;; ── Main tokenizer ────────────────────────────────────────────────
|
|
(define
|
|
hk-tokenize
|
|
(fn
|
|
(src)
|
|
(let
|
|
((tokens (list)) (pos 0) (line 1) (col 1) (src-len (len src)))
|
|
(define
|
|
hk-peek
|
|
(fn
|
|
(offset)
|
|
(if
|
|
(< (+ pos offset) src-len)
|
|
(substring src (+ pos offset) (+ pos offset 1))
|
|
nil)))
|
|
(define hk-cur (fn () (hk-peek 0)))
|
|
(define
|
|
hk-advance!
|
|
(fn
|
|
()
|
|
(let
|
|
((c (hk-cur)))
|
|
(set! pos (+ pos 1))
|
|
(if
|
|
(= c "\n")
|
|
(do (set! line (+ line 1)) (set! col 1))
|
|
(set! col (+ col 1))))))
|
|
(define
|
|
hk-advance-n!
|
|
(fn
|
|
(n)
|
|
(when (> n 0) (do (hk-advance!) (hk-advance-n! (- n 1))))))
|
|
(define
|
|
hk-push!
|
|
(fn
|
|
(type value tok-line tok-col)
|
|
(append! tokens (hk-make-token type value tok-line tok-col))))
|
|
(define
|
|
hk-read-while
|
|
(fn
|
|
(pred)
|
|
(let
|
|
((start pos))
|
|
(define
|
|
hk-rw-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (pred (hk-cur)))
|
|
(do (hk-advance!) (hk-rw-loop)))))
|
|
(hk-rw-loop)
|
|
(substring src start pos))))
|
|
(define
|
|
hk-skip-line-comment!
|
|
(fn
|
|
()
|
|
(define
|
|
hk-slc-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (not (= (hk-cur) "\n")))
|
|
(do (hk-advance!) (hk-slc-loop)))))
|
|
(hk-slc-loop)))
|
|
(define
|
|
hk-skip-block-comment!
|
|
(fn
|
|
()
|
|
(hk-advance-n! 2)
|
|
(let
|
|
((depth 1))
|
|
(define
|
|
hk-sbc-loop
|
|
(fn
|
|
()
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((and (= (hk-cur) "{") (= (hk-peek 1) "-"))
|
|
(do
|
|
(hk-advance-n! 2)
|
|
(set! depth (+ depth 1))
|
|
(hk-sbc-loop)))
|
|
((and (= (hk-cur) "-") (= (hk-peek 1) "}"))
|
|
(do
|
|
(hk-advance-n! 2)
|
|
(set! depth (- depth 1))
|
|
(when (> depth 0) (hk-sbc-loop))))
|
|
(:else (do (hk-advance!) (hk-sbc-loop))))))
|
|
(hk-sbc-loop))))
|
|
(define
|
|
hk-read-escape
|
|
(fn
|
|
()
|
|
(hk-advance!)
|
|
(let
|
|
((c (hk-cur)))
|
|
(cond
|
|
((= c "n") (do (hk-advance!) "\n"))
|
|
((= c "t") (do (hk-advance!) "\t"))
|
|
((= c "r") (do (hk-advance!) "\r"))
|
|
((= c "\\") (do (hk-advance!) "\\"))
|
|
((= c "'") (do (hk-advance!) "'"))
|
|
((= c "\"") (do (hk-advance!) "\""))
|
|
((= c "0") (do (hk-advance!) (char-from-code 0)))
|
|
((= c "a") (do (hk-advance!) (char-from-code 7)))
|
|
((= c "b") (do (hk-advance!) (char-from-code 8)))
|
|
((= c "f") (do (hk-advance!) (char-from-code 12)))
|
|
((= c "v") (do (hk-advance!) (char-from-code 11)))
|
|
((hk-digit? c)
|
|
(let
|
|
((digits (hk-read-while hk-digit?)))
|
|
(char-from-code (parse-int digits))))
|
|
(:else (do (hk-advance!) (str "\\" c)))))))
|
|
(define
|
|
hk-read-string
|
|
(fn
|
|
()
|
|
(let
|
|
((parts (list)))
|
|
(hk-advance!)
|
|
(define
|
|
hk-rs-loop
|
|
(fn
|
|
()
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((= (hk-cur) "\"") (hk-advance!))
|
|
((= (hk-cur) "\\")
|
|
(do (append! parts (hk-read-escape)) (hk-rs-loop)))
|
|
(:else
|
|
(do
|
|
(append! parts (hk-cur))
|
|
(hk-advance!)
|
|
(hk-rs-loop))))))
|
|
(hk-rs-loop)
|
|
(join "" parts))))
|
|
(define
|
|
hk-read-char-lit
|
|
(fn
|
|
()
|
|
(hk-advance!)
|
|
(let
|
|
((c (if (= (hk-cur) "\\") (hk-read-escape) (let ((ch (hk-cur))) (hk-advance!) ch))))
|
|
(when (= (hk-cur) "'") (hk-advance!))
|
|
c)))
|
|
(define
|
|
hk-read-number
|
|
(fn
|
|
(tok-line tok-col)
|
|
(let
|
|
((start pos))
|
|
(cond
|
|
((and (= (hk-cur) "0") (or (= (hk-peek 1) "x") (= (hk-peek 1) "X")))
|
|
(do
|
|
(hk-advance-n! 2)
|
|
(let
|
|
((hex-start pos))
|
|
(hk-read-while hk-hex-digit?)
|
|
(hk-push!
|
|
"integer"
|
|
(hk-parse-radix (substring src hex-start pos) 16)
|
|
tok-line
|
|
tok-col))))
|
|
((and (= (hk-cur) "0") (or (= (hk-peek 1) "o") (= (hk-peek 1) "O")))
|
|
(do
|
|
(hk-advance-n! 2)
|
|
(let
|
|
((oct-start pos))
|
|
(hk-read-while hk-octal-digit?)
|
|
(hk-push!
|
|
"integer"
|
|
(hk-parse-radix (substring src oct-start pos) 8)
|
|
tok-line
|
|
tok-col))))
|
|
(:else
|
|
(do
|
|
(hk-read-while hk-digit?)
|
|
(let
|
|
((is-float false))
|
|
(when
|
|
(and (= (hk-cur) ".") (hk-digit? (hk-peek 1)))
|
|
(do
|
|
(set! is-float true)
|
|
(hk-advance!)
|
|
(hk-read-while hk-digit?)))
|
|
(when
|
|
(or (= (hk-cur) "e") (= (hk-cur) "E"))
|
|
(do
|
|
(set! is-float true)
|
|
(hk-advance!)
|
|
(when
|
|
(or (= (hk-cur) "+") (= (hk-cur) "-"))
|
|
(hk-advance!))
|
|
(hk-read-while hk-digit?)))
|
|
(let
|
|
((num-str (substring src start pos)))
|
|
(if
|
|
is-float
|
|
(hk-push!
|
|
"float"
|
|
(hk-parse-float num-str)
|
|
tok-line
|
|
tok-col)
|
|
(hk-push!
|
|
"integer"
|
|
(parse-int num-str)
|
|
tok-line
|
|
tok-col))))))))))
|
|
(define
|
|
hk-read-qualified!
|
|
(fn
|
|
(tok-line tok-col)
|
|
(let
|
|
((parts (list)) (w (hk-read-while hk-ident-char?)))
|
|
(append! parts w)
|
|
(let
|
|
((emitted false))
|
|
(define
|
|
hk-rq-loop
|
|
(fn
|
|
()
|
|
(when
|
|
(and
|
|
(not emitted)
|
|
(= (hk-cur) ".")
|
|
(or
|
|
(hk-upper? (hk-peek 1))
|
|
(hk-lower? (hk-peek 1))
|
|
(hk-symbol-char? (hk-peek 1))))
|
|
(let
|
|
((next (hk-peek 1)))
|
|
(cond
|
|
((hk-upper? next)
|
|
(do
|
|
(hk-advance!)
|
|
(append! parts ".")
|
|
(append! parts (hk-read-while hk-ident-char?))
|
|
(hk-rq-loop)))
|
|
((hk-lower? next)
|
|
(do
|
|
(hk-advance!)
|
|
(set! emitted true)
|
|
(hk-push!
|
|
"qvarid"
|
|
(str
|
|
(join "" parts)
|
|
"."
|
|
(hk-read-while hk-ident-char?))
|
|
tok-line
|
|
tok-col)))
|
|
((hk-symbol-char? next)
|
|
(do
|
|
(hk-advance!)
|
|
(set! emitted true)
|
|
(hk-push!
|
|
"varsym"
|
|
(str
|
|
(join "" parts)
|
|
"."
|
|
(hk-read-while hk-symbol-char?))
|
|
tok-line
|
|
tok-col))))))))
|
|
(hk-rq-loop)
|
|
(when
|
|
(not emitted)
|
|
(let
|
|
((full (join "" parts)))
|
|
(if
|
|
(string-contains? full ".")
|
|
(hk-push! "qconid" full tok-line tok-col)
|
|
(hk-push! "conid" full tok-line tok-col))))))))
|
|
(define
|
|
hk-scan!
|
|
(fn
|
|
()
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((hk-space? (hk-cur)) (do (hk-advance!) (hk-scan!)))
|
|
((= (hk-cur) "\n")
|
|
(do
|
|
(let
|
|
((l line) (c col))
|
|
(hk-advance!)
|
|
(hk-push! "newline" nil l c))
|
|
(hk-scan!)))
|
|
((and (= (hk-cur) "{") (= (hk-peek 1) "-"))
|
|
(do (hk-skip-block-comment!) (hk-scan!)))
|
|
((and (= (hk-cur) "-") (= (hk-peek 1) "-") (let ((p2 (hk-peek 2))) (or (nil? p2) (= p2 "\n") (not (hk-symbol-char? p2)))))
|
|
(do (hk-skip-line-comment!) (hk-scan!)))
|
|
((= (hk-cur) "\"")
|
|
(do
|
|
(let
|
|
((l line) (c col))
|
|
(hk-push! "string" (hk-read-string) l c))
|
|
(hk-scan!)))
|
|
((= (hk-cur) "'")
|
|
(do
|
|
(let
|
|
((l line) (c col))
|
|
(hk-push! "char" (hk-read-char-lit) l c))
|
|
(hk-scan!)))
|
|
((hk-digit? (hk-cur))
|
|
(do (hk-read-number line col) (hk-scan!)))
|
|
((hk-lower? (hk-cur))
|
|
(do
|
|
(let
|
|
((l line) (c col))
|
|
(let
|
|
((w (hk-read-while hk-ident-char?)))
|
|
(if
|
|
(hk-reserved? w)
|
|
(hk-push! "reserved" w l c)
|
|
(hk-push! "varid" w l c))))
|
|
(hk-scan!)))
|
|
((hk-upper? (hk-cur))
|
|
(do
|
|
(let ((l line) (c col)) (hk-read-qualified! l c))
|
|
(hk-scan!)))
|
|
((= (hk-cur) "(")
|
|
(do (hk-push! "lparen" "(" line col) (hk-advance!) (hk-scan!)))
|
|
((= (hk-cur) ")")
|
|
(do (hk-push! "rparen" ")" line col) (hk-advance!) (hk-scan!)))
|
|
((= (hk-cur) "[")
|
|
(do
|
|
(hk-push! "lbracket" "[" line col)
|
|
(hk-advance!)
|
|
(hk-scan!)))
|
|
((= (hk-cur) "]")
|
|
(do
|
|
(hk-push! "rbracket" "]" line col)
|
|
(hk-advance!)
|
|
(hk-scan!)))
|
|
((= (hk-cur) "{")
|
|
(do (hk-push! "lbrace" "{" line col) (hk-advance!) (hk-scan!)))
|
|
((= (hk-cur) "}")
|
|
(do (hk-push! "rbrace" "}" line col) (hk-advance!) (hk-scan!)))
|
|
((= (hk-cur) ",")
|
|
(do (hk-push! "comma" "," line col) (hk-advance!) (hk-scan!)))
|
|
((= (hk-cur) ";")
|
|
(do (hk-push! "semi" ";" line col) (hk-advance!) (hk-scan!)))
|
|
((= (hk-cur) "`")
|
|
(do
|
|
(hk-push! "backtick" "`" line col)
|
|
(hk-advance!)
|
|
(hk-scan!)))
|
|
((hk-symbol-char? (hk-cur))
|
|
(do
|
|
(let
|
|
((l line) (c col))
|
|
(let
|
|
((first (hk-cur)))
|
|
(let
|
|
((w (hk-read-while hk-symbol-char?)))
|
|
(cond
|
|
((hk-reserved-op? w) (hk-push! "reservedop" w l c))
|
|
((= first ":") (hk-push! "consym" w l c))
|
|
(:else (hk-push! "varsym" w l c))))))
|
|
(hk-scan!)))
|
|
(:else (do (hk-advance!) (hk-scan!))))))
|
|
(hk-scan!)
|
|
(hk-push! "eof" nil line col)
|
|
tokens)))
|