Files
rose-ash/lib/erlang/tokenizer.sx
giles 99753580b4 Recover agent-loop progress: lua/prolog/forth/erlang/haskell phases 1-2
Salvaged from worktree-agent-* branches killed during sx-tree MCP outage:
- lua: tokenizer + parser + phase-2 transpile (~157 tests)
- prolog: tokenizer + parser + unification (72 tests, plan update lost to WIP)
- forth: phase-1 reader/interpreter + phase-2 colon/VARIABLE (134 tests)
- erlang: tokenizer + parser (114 tests)
- haskell: tokenizer + parse tests (43 tests)

Cherry-picked file contents only, not branch history, to avoid pulling in
unrelated ocaml-vm merge commits that were in those branches' bases.
2026-04-24 16:03:00 +00:00

335 lines
11 KiB
Plaintext

;; Erlang tokenizer — produces token stream from Erlang source
;;
;; Tokens: {:type T :value V :pos P}
;; Types:
;; "atom" — foo, 'Quoted Atom'
;; "var" — X, Foo, _Bar, _ (wildcard)
;; "integer" — 42, 16#FF, $c (char literal)
;; "float" — 3.14, 1.0e10
;; "string" — "..."
;; "keyword" — case of end if when receive after fun try catch
;; begin do let module export import define andalso orelse
;; not div rem bnot band bor bxor bsl bsr
;; "punct" — ( ) { } [ ] , ; . : :: -> <- <= => | ||
;; << >>
;; "op" — + - * / = == /= =:= =/= < > =< >= ++ -- ! ?
;; "eof"
(define er-make-token (fn (type value pos) {:pos pos :value value :type type}))
(define er-digit? (fn (c) (and (>= c "0") (<= c "9"))))
(define
er-hex-digit?
(fn
(c)
(or
(er-digit? c)
(and (>= c "a") (<= c "f"))
(and (>= c "A") (<= c "F")))))
(define er-lower? (fn (c) (and (>= c "a") (<= c "z"))))
(define er-upper? (fn (c) (and (>= c "A") (<= c "Z"))))
(define er-letter? (fn (c) (or (er-lower? c) (er-upper? c))))
(define
er-ident-char?
(fn (c) (or (er-letter? c) (er-digit? c) (= c "_") (= c "@"))))
(define er-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r"))))
;; Erlang reserved words — everything else starting lowercase is an atom
(define
er-keywords
(list
"after"
"and"
"andalso"
"band"
"begin"
"bnot"
"bor"
"bsl"
"bsr"
"bxor"
"case"
"catch"
"cond"
"div"
"end"
"fun"
"if"
"let"
"not"
"of"
"or"
"orelse"
"receive"
"rem"
"try"
"when"
"xor"))
(define er-keyword? (fn (word) (some (fn (k) (= k word)) er-keywords)))
(define
er-tokenize
(fn
(src)
(let
((tokens (list)) (pos 0) (src-len (len src)))
(define
er-peek
(fn
(offset)
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
(define er-cur (fn () (er-peek 0)))
(define er-advance! (fn (n) (set! pos (+ pos n))))
(define
skip-ws!
(fn
()
(when
(and (< pos src-len) (er-ws? (er-cur)))
(er-advance! 1)
(skip-ws!))))
(define
skip-comment!
(fn
()
(when
(and (< pos src-len) (not (= (er-cur) "\n")))
(er-advance! 1)
(skip-comment!))))
(define
read-ident-chars
(fn
(start)
(when
(and (< pos src-len) (er-ident-char? (er-cur)))
(er-advance! 1)
(read-ident-chars start))
(slice src start pos)))
(define
read-integer-digits
(fn
()
(when
(and (< pos src-len) (er-digit? (er-cur)))
(er-advance! 1)
(read-integer-digits))))
(define
read-hex-digits
(fn
()
(when
(and (< pos src-len) (er-hex-digit? (er-cur)))
(er-advance! 1)
(read-hex-digits))))
(define
read-number
(fn
(start)
(read-integer-digits)
(cond
(and
(< pos src-len)
(= (er-cur) "#")
(< (+ pos 1) src-len)
(er-hex-digit? (er-peek 1)))
(do (er-advance! 1) (read-hex-digits) {:value (slice src start pos) :type "integer"})
(and
(< pos src-len)
(= (er-cur) ".")
(< (+ pos 1) src-len)
(er-digit? (er-peek 1)))
(do
(er-advance! 1)
(read-integer-digits)
(when
(and
(< pos src-len)
(or (= (er-cur) "e") (= (er-cur) "E")))
(er-advance! 1)
(when
(and
(< pos src-len)
(or (= (er-cur) "+") (= (er-cur) "-")))
(er-advance! 1))
(read-integer-digits))
{:value (slice src start pos) :type "float"})
:else {:value (slice src start pos) :type "integer"})))
(define
read-string
(fn
(quote-char)
(let
((chars (list)))
(er-advance! 1)
(define
loop
(fn
()
(cond
(>= pos src-len)
nil
(= (er-cur) "\\")
(do
(er-advance! 1)
(when
(< pos src-len)
(let
((ch (er-cur)))
(cond
(= ch "n")
(append! chars "\n")
(= ch "t")
(append! chars "\t")
(= ch "r")
(append! chars "\r")
(= ch "\\")
(append! chars "\\")
(= ch "\"")
(append! chars "\"")
(= ch "'")
(append! chars "'")
:else (append! chars ch))
(er-advance! 1)))
(loop))
(= (er-cur) quote-char)
(er-advance! 1)
:else (do (append! chars (er-cur)) (er-advance! 1) (loop)))))
(loop)
(join "" chars))))
(define
er-emit!
(fn
(type value start)
(append! tokens (er-make-token type value start))))
(define
scan!
(fn
()
(skip-ws!)
(when
(< pos src-len)
(let
((ch (er-cur)) (start pos))
(cond
(= ch "%")
(do (skip-comment!) (scan!))
(er-digit? ch)
(do
(let
((tok (read-number start)))
(er-emit! (get tok :type) (get tok :value) start))
(scan!))
(= ch "$")
(do
(er-advance! 1)
(if
(and (< pos src-len) (= (er-cur) "\\"))
(do
(er-advance! 1)
(when (< pos src-len) (er-advance! 1)))
(when (< pos src-len) (er-advance! 1)))
(er-emit! "integer" (slice src start pos) start)
(scan!))
(er-lower? ch)
(do
(let
((word (read-ident-chars start)))
(er-emit!
(if (er-keyword? word) "keyword" "atom")
word
start))
(scan!))
(or (er-upper? ch) (= ch "_"))
(do
(let
((word (read-ident-chars start)))
(er-emit! "var" word start))
(scan!))
(= ch "'")
(do (er-emit! "atom" (read-string "'") start) (scan!))
(= ch "\"")
(do (er-emit! "string" (read-string "\"") start) (scan!))
(and (= ch "<") (= (er-peek 1) "<"))
(do (er-emit! "punct" "<<" start) (er-advance! 2) (scan!))
(and (= ch ">") (= (er-peek 1) ">"))
(do (er-emit! "punct" ">>" start) (er-advance! 2) (scan!))
(and (= ch "-") (= (er-peek 1) ">"))
(do (er-emit! "punct" "->" start) (er-advance! 2) (scan!))
(and (= ch "<") (= (er-peek 1) "-"))
(do (er-emit! "punct" "<-" start) (er-advance! 2) (scan!))
(and (= ch "<") (= (er-peek 1) "="))
(do (er-emit! "punct" "<=" start) (er-advance! 2) (scan!))
(and (= ch "=") (= (er-peek 1) ">"))
(do (er-emit! "punct" "=>" start) (er-advance! 2) (scan!))
(and (= ch "=") (= (er-peek 1) ":") (= (er-peek 2) "="))
(do (er-emit! "op" "=:=" start) (er-advance! 3) (scan!))
(and (= ch "=") (= (er-peek 1) "/") (= (er-peek 2) "="))
(do (er-emit! "op" "=/=" start) (er-advance! 3) (scan!))
(and (= ch "=") (= (er-peek 1) "="))
(do (er-emit! "op" "==" start) (er-advance! 2) (scan!))
(and (= ch "/") (= (er-peek 1) "="))
(do (er-emit! "op" "/=" start) (er-advance! 2) (scan!))
(and (= ch "=") (= (er-peek 1) "<"))
(do (er-emit! "op" "=<" start) (er-advance! 2) (scan!))
(and (= ch ">") (= (er-peek 1) "="))
(do (er-emit! "op" ">=" start) (er-advance! 2) (scan!))
(and (= ch "+") (= (er-peek 1) "+"))
(do (er-emit! "op" "++" start) (er-advance! 2) (scan!))
(and (= ch "-") (= (er-peek 1) "-"))
(do (er-emit! "op" "--" start) (er-advance! 2) (scan!))
(and (= ch ":") (= (er-peek 1) ":"))
(do (er-emit! "punct" "::" start) (er-advance! 2) (scan!))
(and (= ch "|") (= (er-peek 1) "|"))
(do (er-emit! "punct" "||" start) (er-advance! 2) (scan!))
(= ch "(")
(do (er-emit! "punct" "(" start) (er-advance! 1) (scan!))
(= ch ")")
(do (er-emit! "punct" ")" start) (er-advance! 1) (scan!))
(= ch "{")
(do (er-emit! "punct" "{" start) (er-advance! 1) (scan!))
(= ch "}")
(do (er-emit! "punct" "}" start) (er-advance! 1) (scan!))
(= ch "[")
(do (er-emit! "punct" "[" start) (er-advance! 1) (scan!))
(= ch "]")
(do (er-emit! "punct" "]" start) (er-advance! 1) (scan!))
(= ch ",")
(do (er-emit! "punct" "," start) (er-advance! 1) (scan!))
(= ch ";")
(do (er-emit! "punct" ";" start) (er-advance! 1) (scan!))
(= ch ".")
(do (er-emit! "punct" "." start) (er-advance! 1) (scan!))
(= ch ":")
(do (er-emit! "punct" ":" start) (er-advance! 1) (scan!))
(= ch "|")
(do (er-emit! "punct" "|" start) (er-advance! 1) (scan!))
(= ch "+")
(do (er-emit! "op" "+" start) (er-advance! 1) (scan!))
(= ch "-")
(do (er-emit! "op" "-" start) (er-advance! 1) (scan!))
(= ch "*")
(do (er-emit! "op" "*" start) (er-advance! 1) (scan!))
(= ch "/")
(do (er-emit! "op" "/" start) (er-advance! 1) (scan!))
(= ch "=")
(do (er-emit! "op" "=" start) (er-advance! 1) (scan!))
(= ch "<")
(do (er-emit! "op" "<" start) (er-advance! 1) (scan!))
(= ch ">")
(do (er-emit! "op" ">" start) (er-advance! 1) (scan!))
(= ch "!")
(do (er-emit! "op" "!" start) (er-advance! 1) (scan!))
(= ch "?")
(do (er-emit! "op" "?" start) (er-advance! 1) (scan!))
:else (do (er-advance! 1) (scan!)))))))
(scan!)
(er-emit! "eof" nil pos)
tokens)))