Files
rose-ash/lib/erlang/tokenizer.sx
giles 3d80bd8ce6
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 31s
fed-sx-m1: Step 3b substrate fix #2 — $X char literals decode to char code in tokenizer (+12 eval, 750/750)
2026-06-04 22:50:35 +00:00

359 lines
12 KiB
Plaintext

;; Erlang tokenizer — produces token stream from Erlang source
;;
;; Tokens: {:type T :value V :pos P}
;; Types:
;; "atom" — foo, 'Quoted Atom'
;; "var" — X, Foo, _Bar, _ (wildcard)
;; "integer" — 42, 16#FF, $c (char literal)
;; "float" — 3.14, 1.0e10
;; "string" — "..."
;; "keyword" — case of end if when receive after fun try catch
;; begin do let module export import define andalso orelse
;; not div rem bnot band bor bxor bsl bsr
;; "punct" — ( ) { } [ ] , ; . : :: -> <- <= => | ||
;; << >>
;; "op" — + - * / = == /= =:= =/= < > =< >= ++ -- ! ?
;; "eof"
(define er-make-token (fn (type value pos) {:pos pos :value value :type type}))
(define er-digit? (fn (c) (and (>= c "0") (<= c "9"))))
(define
er-hex-digit?
(fn
(c)
(or
(er-digit? c)
(and (>= c "a") (<= c "f"))
(and (>= c "A") (<= c "F")))))
(define er-lower? (fn (c) (and (>= c "a") (<= c "z"))))
(define er-upper? (fn (c) (and (>= c "A") (<= c "Z"))))
(define er-letter? (fn (c) (or (er-lower? c) (er-upper? c))))
(define
er-ident-char?
(fn (c) (or (er-letter? c) (er-digit? c) (= c "_") (= c "@"))))
(define er-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r"))))
;; Erlang reserved words — everything else starting lowercase is an atom
(define
er-keywords
(list
"after"
"and"
"andalso"
"band"
"begin"
"bnot"
"bor"
"bsl"
"bsr"
"bxor"
"case"
"catch"
"cond"
"div"
"end"
"fun"
"if"
"let"
"not"
"of"
"or"
"orelse"
"receive"
"rem"
"try"
"when"
"xor"))
(define er-keyword? (fn (word) (some (fn (k) (= k word)) er-keywords)))
(define
er-tokenize
(fn
(src)
(let
((tokens (list)) (pos 0) (src-len (len src)))
(define
er-peek
(fn
(offset)
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
(define er-cur (fn () (er-peek 0)))
(define er-advance! (fn (n) (set! pos (+ pos n))))
(define
skip-ws!
(fn
()
(when
(and (< pos src-len) (er-ws? (er-cur)))
(er-advance! 1)
(skip-ws!))))
(define
skip-comment!
(fn
()
(when
(and (< pos src-len) (not (= (er-cur) "\n")))
(er-advance! 1)
(skip-comment!))))
(define
read-ident-chars
(fn
(start)
(when
(and (< pos src-len) (er-ident-char? (er-cur)))
(er-advance! 1)
(read-ident-chars start))
(slice src start pos)))
(define
read-integer-digits
(fn
()
(when
(and (< pos src-len) (er-digit? (er-cur)))
(er-advance! 1)
(read-integer-digits))))
(define
read-hex-digits
(fn
()
(when
(and (< pos src-len) (er-hex-digit? (er-cur)))
(er-advance! 1)
(read-hex-digits))))
(define
read-number
(fn
(start)
(read-integer-digits)
(cond
(and
(< pos src-len)
(= (er-cur) "#")
(< (+ pos 1) src-len)
(er-hex-digit? (er-peek 1)))
(do (er-advance! 1) (read-hex-digits) {:value (slice src start pos) :type "integer"})
(and
(< pos src-len)
(= (er-cur) ".")
(< (+ pos 1) src-len)
(er-digit? (er-peek 1)))
(do
(er-advance! 1)
(read-integer-digits)
(when
(and
(< pos src-len)
(or (= (er-cur) "e") (= (er-cur) "E")))
(er-advance! 1)
(when
(and
(< pos src-len)
(or (= (er-cur) "+") (= (er-cur) "-")))
(er-advance! 1))
(read-integer-digits))
{:value (slice src start pos) :type "float"})
:else {:value (slice src start pos) :type "integer"})))
(define
read-string
(fn
(quote-char)
(let
((chars (list)))
(er-advance! 1)
(define
loop
(fn
()
(cond
(>= pos src-len)
nil
(= (er-cur) "\\")
(do
(er-advance! 1)
(when
(< pos src-len)
(let
((ch (er-cur)))
(cond
(= ch "n")
(append! chars "\n")
(= ch "t")
(append! chars "\t")
(= ch "r")
(append! chars "\r")
(= ch "\\")
(append! chars "\\")
(= ch "\"")
(append! chars "\"")
(= ch "'")
(append! chars "'")
:else (append! chars ch))
(er-advance! 1)))
(loop))
(= (er-cur) quote-char)
(er-advance! 1)
:else (do (append! chars (er-cur)) (er-advance! 1) (loop)))))
(loop)
(join "" chars))))
(define
er-emit!
(fn
(type value start)
(append! tokens (er-make-token type value start))))
(define
scan!
(fn
()
(skip-ws!)
(when
(< pos src-len)
(let
((ch (er-cur)) (start pos))
(cond
(= ch "%")
(do (skip-comment!) (scan!))
(er-digit? ch)
(do
(let
((tok (read-number start)))
(er-emit! (get tok :type) (get tok :value) start))
(scan!))
(= ch "$")
(do
(er-advance! 1)
;; Emit the char's decimal code as the integer token value
;; (was: raw "$X" text — parse-number then returned nil).
(let
((code (cond
(>= pos src-len) 0
(= (er-cur) "\\")
(do
(er-advance! 1)
(let ((esc (if (< pos src-len) (er-cur) "")))
(when (< pos src-len) (er-advance! 1))
(cond
(= esc "n") 10
(= esc "t") 9
(= esc "r") 13
(= esc "s") 32
(= esc "b") 8
(= esc "e") 27
(= esc "f") 12
(= esc "v") 11
(= esc "d") 127
(= esc "0") 0
(= esc "\\") 92
(= esc "\"") 34
(= esc "'") 39
(= esc "") 0
:else (char->integer (nth (string->list esc) 0)))))
:else
(let ((c (er-cur)))
(er-advance! 1)
(char->integer (nth (string->list c) 0))))))
(er-emit! "integer" (str code) start))
(scan!))
(er-lower? ch)
(do
(let
((word (read-ident-chars start)))
(er-emit!
(if (er-keyword? word) "keyword" "atom")
word
start))
(scan!))
(or (er-upper? ch) (= ch "_"))
(do
(let
((word (read-ident-chars start)))
(er-emit! "var" word start))
(scan!))
(= ch "'")
(do (er-emit! "atom" (read-string "'") start) (scan!))
(= ch "\"")
(do (er-emit! "string" (read-string "\"") start) (scan!))
(and (= ch "<") (= (er-peek 1) "<"))
(do (er-emit! "punct" "<<" start) (er-advance! 2) (scan!))
(and (= ch ">") (= (er-peek 1) ">"))
(do (er-emit! "punct" ">>" start) (er-advance! 2) (scan!))
(and (= ch "-") (= (er-peek 1) ">"))
(do (er-emit! "punct" "->" start) (er-advance! 2) (scan!))
(and (= ch "<") (= (er-peek 1) "-"))
(do (er-emit! "punct" "<-" start) (er-advance! 2) (scan!))
(and (= ch "<") (= (er-peek 1) "="))
(do (er-emit! "punct" "<=" start) (er-advance! 2) (scan!))
(and (= ch "=") (= (er-peek 1) ">"))
(do (er-emit! "punct" "=>" start) (er-advance! 2) (scan!))
(and (= ch "=") (= (er-peek 1) ":") (= (er-peek 2) "="))
(do (er-emit! "op" "=:=" start) (er-advance! 3) (scan!))
(and (= ch "=") (= (er-peek 1) "/") (= (er-peek 2) "="))
(do (er-emit! "op" "=/=" start) (er-advance! 3) (scan!))
(and (= ch "=") (= (er-peek 1) "="))
(do (er-emit! "op" "==" start) (er-advance! 2) (scan!))
(and (= ch "/") (= (er-peek 1) "="))
(do (er-emit! "op" "/=" start) (er-advance! 2) (scan!))
(and (= ch "=") (= (er-peek 1) "<"))
(do (er-emit! "op" "=<" start) (er-advance! 2) (scan!))
(and (= ch ">") (= (er-peek 1) "="))
(do (er-emit! "op" ">=" start) (er-advance! 2) (scan!))
(and (= ch "+") (= (er-peek 1) "+"))
(do (er-emit! "op" "++" start) (er-advance! 2) (scan!))
(and (= ch "-") (= (er-peek 1) "-"))
(do (er-emit! "op" "--" start) (er-advance! 2) (scan!))
(and (= ch ":") (= (er-peek 1) ":"))
(do (er-emit! "punct" "::" start) (er-advance! 2) (scan!))
(and (= ch "|") (= (er-peek 1) "|"))
(do (er-emit! "punct" "||" start) (er-advance! 2) (scan!))
(= ch "(")
(do (er-emit! "punct" "(" start) (er-advance! 1) (scan!))
(= ch ")")
(do (er-emit! "punct" ")" start) (er-advance! 1) (scan!))
(= ch "{")
(do (er-emit! "punct" "{" start) (er-advance! 1) (scan!))
(= ch "}")
(do (er-emit! "punct" "}" start) (er-advance! 1) (scan!))
(= ch "[")
(do (er-emit! "punct" "[" start) (er-advance! 1) (scan!))
(= ch "]")
(do (er-emit! "punct" "]" start) (er-advance! 1) (scan!))
(= ch ",")
(do (er-emit! "punct" "," start) (er-advance! 1) (scan!))
(= ch ";")
(do (er-emit! "punct" ";" start) (er-advance! 1) (scan!))
(= ch ".")
(do (er-emit! "punct" "." start) (er-advance! 1) (scan!))
(= ch ":")
(do (er-emit! "punct" ":" start) (er-advance! 1) (scan!))
(= ch "|")
(do (er-emit! "punct" "|" start) (er-advance! 1) (scan!))
(= ch "+")
(do (er-emit! "op" "+" start) (er-advance! 1) (scan!))
(= ch "-")
(do (er-emit! "op" "-" start) (er-advance! 1) (scan!))
(= ch "*")
(do (er-emit! "op" "*" start) (er-advance! 1) (scan!))
(= ch "/")
(do (er-emit! "op" "/" start) (er-advance! 1) (scan!))
(= ch "=")
(do (er-emit! "op" "=" start) (er-advance! 1) (scan!))
(= ch "<")
(do (er-emit! "op" "<" start) (er-advance! 1) (scan!))
(= ch ">")
(do (er-emit! "op" ">" start) (er-advance! 1) (scan!))
(= ch "!")
(do (er-emit! "op" "!" start) (er-advance! 1) (scan!))
(= ch "?")
(do (er-emit! "op" "?" start) (er-advance! 1) (scan!))
:else (do (er-advance! 1) (scan!)))))))
(scan!)
(er-emit! "eof" nil pos)
tokens)))