;; Haskell tokenizer — produces a token stream from Haskell 98 source. ;; ;; Tokens: {:type T :value V :line L :col C} ;; ;; Types: ;; "varid" lowercase ident, e.g. fmap, x, myFunc ;; "conid" uppercase ident, e.g. Nothing, Just, Map ;; "qvarid" qualified varid, value holds raw "A.B.foo" ;; "qconid" qualified conid, e.g. "Data.Map" ;; "reserved" reserved word — value is the word ;; "varsym" operator symbol, e.g. +, ++, >>= ;; "consym" constructor operator (starts with :), e.g. :, :+ ;; "reservedop" reserved operator ("::", "=", "->", "<-", "=>", "|", "\\", "@", "~", "..") ;; "integer" integer literal (number) ;; "float" float literal (number) ;; "char" char literal (string of length 1) ;; "string" string literal ;; "lparen" "rparen" "lbracket" "rbracket" "lbrace" "rbrace" ;; "vlbrace" "vrbrace" "vsemi" virtual layout tokens (inserted later) ;; "comma" "semi" "backtick" ;; "newline" a logical line break (used by layout pass; stripped afterwards) ;; "eof" ;; ;; Note: SX `cond`/`when` clauses evaluate ONLY their last expression. ;; Multi-expression bodies must be wrapped in (do ...). All helpers use ;; the hk- prefix to avoid clashing with SX evaluator special forms. ;; ── Char-code table ─────────────────────────────────────────────── (define hk-ord-table (let ((t (dict)) (i 0)) (define hk-build-table (fn () (when (< i 128) (do (dict-set! t (char-from-code i) i) (set! i (+ i 1)) (hk-build-table))))) (hk-build-table) t)) (define hk-ord (fn (c) (or (get hk-ord-table c) 0))) ;; ── Character predicates ────────────────────────────────────────── (define hk-digit? (fn (c) (and (string? c) (>= (hk-ord c) 48) (<= (hk-ord c) 57)))) (define hk-hex-digit? (fn (c) (and (string? c) (or (and (>= (hk-ord c) 48) (<= (hk-ord c) 57)) (and (>= (hk-ord c) 97) (<= (hk-ord c) 102)) (and (>= (hk-ord c) 65) (<= (hk-ord c) 70)))))) (define hk-octal-digit? (fn (c) (and (string? c) (>= (hk-ord c) 48) (<= (hk-ord c) 55)))) (define hk-lower? (fn (c) (and (string? c) (or (and (>= (hk-ord c) 97) (<= (hk-ord c) 122)) (= c "_"))))) (define hk-upper? (fn (c) (and (string? c) (>= (hk-ord c) 65) (<= (hk-ord c) 90)))) (define hk-alpha? (fn (c) (or (hk-lower? c) (hk-upper? c)))) (define hk-ident-char? (fn (c) (or (hk-alpha? c) (hk-digit? c) (= c "'")))) (define hk-symbol-char? (fn (c) (or (= c "!") (= c "#") (= c "$") (= c "%") (= c "&") (= c "*") (= c "+") (= c ".") (= c "/") (= c "<") (= c "=") (= c ">") (= c "?") (= c "@") (= c "\\") (= c "^") (= c "|") (= c "-") (= c "~") (= c ":")))) (define hk-space? (fn (c) (or (= c " ") (= c "\t")))) ;; ── Hex / oct parser (parse-int is decimal only) ────────────────── (define hk-parse-radix (fn (s radix) (let ((n-len (len s)) (idx 0) (acc 0)) (define hk-rad-loop (fn () (when (< idx n-len) (do (let ((c (substring s idx (+ idx 1)))) (cond ((and (>= (hk-ord c) 48) (<= (hk-ord c) 57)) (set! acc (+ (* acc radix) (- (hk-ord c) 48)))) ((and (>= (hk-ord c) 97) (<= (hk-ord c) 102)) (set! acc (+ (* acc radix) (+ 10 (- (hk-ord c) 97))))) ((and (>= (hk-ord c) 65) (<= (hk-ord c) 70)) (set! acc (+ (* acc radix) (+ 10 (- (hk-ord c) 65))))))) (set! idx (+ idx 1)) (hk-rad-loop))))) (hk-rad-loop) acc))) (define hk-parse-float (fn (s) (let ((n-len (len s)) (idx 0) (sign 1) (int-part 0) (frac-part 0) (frac-div 1) (exp-sign 1) (exp-val 0) (has-exp false)) (when (and (< idx n-len) (= (substring s idx (+ idx 1)) "-")) (do (set! sign -1) (set! idx (+ idx 1)))) (when (and (< idx n-len) (= (substring s idx (+ idx 1)) "+")) (set! idx (+ idx 1))) (define hk-int-loop (fn () (when (and (< idx n-len) (hk-digit? (substring s idx (+ idx 1)))) (do (set! int-part (+ (* int-part 10) (parse-int (substring s idx (+ idx 1))))) (set! idx (+ idx 1)) (hk-int-loop))))) (hk-int-loop) (when (and (< idx n-len) (= (substring s idx (+ idx 1)) ".")) (do (set! idx (+ idx 1)) (define hk-frac-loop (fn () (when (and (< idx n-len) (hk-digit? (substring s idx (+ idx 1)))) (do (set! frac-div (* frac-div 10)) (set! frac-part (+ frac-part (/ (parse-int (substring s idx (+ idx 1))) frac-div))) (set! idx (+ idx 1)) (hk-frac-loop))))) (hk-frac-loop))) (when (and (< idx n-len) (let ((c (substring s idx (+ idx 1)))) (or (= c "e") (= c "E")))) (do (set! has-exp true) (set! idx (+ idx 1)) (cond ((and (< idx n-len) (= (substring s idx (+ idx 1)) "-")) (do (set! exp-sign -1) (set! idx (+ idx 1)))) ((and (< idx n-len) (= (substring s idx (+ idx 1)) "+")) (set! idx (+ idx 1)))) (define hk-exp-loop (fn () (when (and (< idx n-len) (hk-digit? (substring s idx (+ idx 1)))) (do (set! exp-val (+ (* exp-val 10) (parse-int (substring s idx (+ idx 1))))) (set! idx (+ idx 1)) (hk-exp-loop))))) (hk-exp-loop))) (let ((base (* sign (+ int-part frac-part)))) (if has-exp (* base (pow 10 (* exp-sign exp-val))) base))))) ;; ── Reserved words / ops ────────────────────────────────────────── (define hk-reserved-words (list "case" "class" "data" "default" "deriving" "do" "else" "foreign" "if" "import" "in" "infix" "infixl" "infixr" "instance" "let" "module" "newtype" "of" "then" "type" "where" "_")) (define hk-reserved? (fn (w) (contains? hk-reserved-words w))) (define hk-reserved-ops (list ".." ":" "::" "=" "\\" "|" "<-" "->" "@" "~" "=>")) (define hk-reserved-op? (fn (w) (contains? hk-reserved-ops w))) ;; ── Token constructor ───────────────────────────────────────────── (define hk-make-token (fn (type value line col) {:line line :value value :col col :type type})) ;; ── Main tokenizer ──────────────────────────────────────────────── (define hk-tokenize (fn (src) (let ((tokens (list)) (pos 0) (line 1) (col 1) (src-len (len src))) (define hk-peek (fn (offset) (if (< (+ pos offset) src-len) (substring src (+ pos offset) (+ pos offset 1)) nil))) (define hk-cur (fn () (hk-peek 0))) (define hk-advance! (fn () (let ((c (hk-cur))) (set! pos (+ pos 1)) (if (= c "\n") (do (set! line (+ line 1)) (set! col 1)) (set! col (+ col 1)))))) (define hk-advance-n! (fn (n) (when (> n 0) (do (hk-advance!) (hk-advance-n! (- n 1)))))) (define hk-push! (fn (type value tok-line tok-col) (append! tokens (hk-make-token type value tok-line tok-col)))) (define hk-read-while (fn (pred) (let ((start pos)) (define hk-rw-loop (fn () (when (and (< pos src-len) (pred (hk-cur))) (do (hk-advance!) (hk-rw-loop))))) (hk-rw-loop) (substring src start pos)))) (define hk-skip-line-comment! (fn () (define hk-slc-loop (fn () (when (and (< pos src-len) (not (= (hk-cur) "\n"))) (do (hk-advance!) (hk-slc-loop))))) (hk-slc-loop))) (define hk-skip-block-comment! (fn () (hk-advance-n! 2) (let ((depth 1)) (define hk-sbc-loop (fn () (cond ((>= pos src-len) nil) ((and (= (hk-cur) "{") (= (hk-peek 1) "-")) (do (hk-advance-n! 2) (set! depth (+ depth 1)) (hk-sbc-loop))) ((and (= (hk-cur) "-") (= (hk-peek 1) "}")) (do (hk-advance-n! 2) (set! depth (- depth 1)) (when (> depth 0) (hk-sbc-loop)))) (:else (do (hk-advance!) (hk-sbc-loop)))))) (hk-sbc-loop)))) (define hk-read-escape (fn () (hk-advance!) (let ((c (hk-cur))) (cond ((= c "n") (do (hk-advance!) "\n")) ((= c "t") (do (hk-advance!) "\t")) ((= c "r") (do (hk-advance!) "\r")) ((= c "\\") (do (hk-advance!) "\\")) ((= c "'") (do (hk-advance!) "'")) ((= c "\"") (do (hk-advance!) "\"")) ((= c "0") (do (hk-advance!) (char-from-code 0))) ((= c "a") (do (hk-advance!) (char-from-code 7))) ((= c "b") (do (hk-advance!) (char-from-code 8))) ((= c "f") (do (hk-advance!) (char-from-code 12))) ((= c "v") (do (hk-advance!) (char-from-code 11))) ((hk-digit? c) (let ((digits (hk-read-while hk-digit?))) (char-from-code (parse-int digits)))) (:else (do (hk-advance!) (str "\\" c))))))) (define hk-read-string (fn () (let ((parts (list))) (hk-advance!) (define hk-rs-loop (fn () (cond ((>= pos src-len) nil) ((= (hk-cur) "\"") (hk-advance!)) ((= (hk-cur) "\\") (do (append! parts (hk-read-escape)) (hk-rs-loop))) (:else (do (append! parts (hk-cur)) (hk-advance!) (hk-rs-loop)))))) (hk-rs-loop) (join "" parts)))) (define hk-read-char-lit (fn () (hk-advance!) (let ((c (if (= (hk-cur) "\\") (hk-read-escape) (let ((ch (hk-cur))) (hk-advance!) ch)))) (when (= (hk-cur) "'") (hk-advance!)) c))) (define hk-read-number (fn (tok-line tok-col) (let ((start pos)) (cond ((and (= (hk-cur) "0") (or (= (hk-peek 1) "x") (= (hk-peek 1) "X"))) (do (hk-advance-n! 2) (let ((hex-start pos)) (hk-read-while hk-hex-digit?) (hk-push! "integer" (hk-parse-radix (substring src hex-start pos) 16) tok-line tok-col)))) ((and (= (hk-cur) "0") (or (= (hk-peek 1) "o") (= (hk-peek 1) "O"))) (do (hk-advance-n! 2) (let ((oct-start pos)) (hk-read-while hk-octal-digit?) (hk-push! "integer" (hk-parse-radix (substring src oct-start pos) 8) tok-line tok-col)))) (:else (do (hk-read-while hk-digit?) (let ((is-float false)) (when (and (= (hk-cur) ".") (hk-digit? (hk-peek 1))) (do (set! is-float true) (hk-advance!) (hk-read-while hk-digit?))) (when (or (= (hk-cur) "e") (= (hk-cur) "E")) (do (set! is-float true) (hk-advance!) (when (or (= (hk-cur) "+") (= (hk-cur) "-")) (hk-advance!)) (hk-read-while hk-digit?))) (let ((num-str (substring src start pos))) (if is-float (hk-push! "float" (hk-parse-float num-str) tok-line tok-col) (hk-push! "integer" (parse-int num-str) tok-line tok-col)))))))))) (define hk-read-qualified! (fn (tok-line tok-col) (let ((parts (list)) (w (hk-read-while hk-ident-char?))) (append! parts w) (let ((emitted false)) (define hk-rq-loop (fn () (when (and (not emitted) (= (hk-cur) ".") (or (hk-upper? (hk-peek 1)) (hk-lower? (hk-peek 1)) (hk-symbol-char? (hk-peek 1)))) (let ((next (hk-peek 1))) (cond ((hk-upper? next) (do (hk-advance!) (append! parts ".") (append! parts (hk-read-while hk-ident-char?)) (hk-rq-loop))) ((hk-lower? next) (do (hk-advance!) (set! emitted true) (hk-push! "qvarid" (str (join "" parts) "." (hk-read-while hk-ident-char?)) tok-line tok-col))) ((hk-symbol-char? next) (do (hk-advance!) (set! emitted true) (hk-push! "varsym" (str (join "" parts) "." (hk-read-while hk-symbol-char?)) tok-line tok-col)))))))) (hk-rq-loop) (when (not emitted) (let ((full (join "" parts))) (if (string-contains? full ".") (hk-push! "qconid" full tok-line tok-col) (hk-push! "conid" full tok-line tok-col)))))))) (define hk-scan! (fn () (cond ((>= pos src-len) nil) ((hk-space? (hk-cur)) (do (hk-advance!) (hk-scan!))) ((= (hk-cur) "\n") (do (let ((l line) (c col)) (hk-advance!) (hk-push! "newline" nil l c)) (hk-scan!))) ((and (= (hk-cur) "{") (= (hk-peek 1) "-")) (do (hk-skip-block-comment!) (hk-scan!))) ((and (= (hk-cur) "-") (= (hk-peek 1) "-") (let ((p2 (hk-peek 2))) (or (nil? p2) (= p2 "\n") (not (hk-symbol-char? p2))))) (do (hk-skip-line-comment!) (hk-scan!))) ((= (hk-cur) "\"") (do (let ((l line) (c col)) (hk-push! "string" (hk-read-string) l c)) (hk-scan!))) ((= (hk-cur) "'") (do (let ((l line) (c col)) (hk-push! "char" (hk-read-char-lit) l c)) (hk-scan!))) ((hk-digit? (hk-cur)) (do (hk-read-number line col) (hk-scan!))) ((hk-lower? (hk-cur)) (do (let ((l line) (c col)) (let ((w (hk-read-while hk-ident-char?))) (if (hk-reserved? w) (hk-push! "reserved" w l c) (hk-push! "varid" w l c)))) (hk-scan!))) ((hk-upper? (hk-cur)) (do (let ((l line) (c col)) (hk-read-qualified! l c)) (hk-scan!))) ((= (hk-cur) "(") (do (hk-push! "lparen" "(" line col) (hk-advance!) (hk-scan!))) ((= (hk-cur) ")") (do (hk-push! "rparen" ")" line col) (hk-advance!) (hk-scan!))) ((= (hk-cur) "[") (do (hk-push! "lbracket" "[" line col) (hk-advance!) (hk-scan!))) ((= (hk-cur) "]") (do (hk-push! "rbracket" "]" line col) (hk-advance!) (hk-scan!))) ((= (hk-cur) "{") (do (hk-push! "lbrace" "{" line col) (hk-advance!) (hk-scan!))) ((= (hk-cur) "}") (do (hk-push! "rbrace" "}" line col) (hk-advance!) (hk-scan!))) ((= (hk-cur) ",") (do (hk-push! "comma" "," line col) (hk-advance!) (hk-scan!))) ((= (hk-cur) ";") (do (hk-push! "semi" ";" line col) (hk-advance!) (hk-scan!))) ((= (hk-cur) "`") (do (hk-push! "backtick" "`" line col) (hk-advance!) (hk-scan!))) ((hk-symbol-char? (hk-cur)) (do (let ((l line) (c col)) (let ((first (hk-cur))) (let ((w (hk-read-while hk-symbol-char?))) (cond ((hk-reserved-op? w) (hk-push! "reservedop" w l c)) ((= first ":") (hk-push! "consym" w l c)) (:else (hk-push! "varsym" w l c)))))) (hk-scan!))) (:else (do (hk-advance!) (hk-scan!)))))) (hk-scan!) (hk-push! "eof" nil line col) tokens)))