Salvaged from worktree-agent-* branches killed during sx-tree MCP outage: - lua: tokenizer + parser + phase-2 transpile (~157 tests) - prolog: tokenizer + parser + unification (72 tests, plan update lost to WIP) - forth: phase-1 reader/interpreter + phase-2 colon/VARIABLE (134 tests) - erlang: tokenizer + parser (114 tests) - haskell: tokenizer + parse tests (43 tests) Cherry-picked file contents only, not branch history, to avoid pulling in unrelated ocaml-vm merge commits that were in those branches' bases.
335 lines
11 KiB
Plaintext
335 lines
11 KiB
Plaintext
;; Erlang tokenizer — produces token stream from Erlang source
|
|
;;
|
|
;; Tokens: {:type T :value V :pos P}
|
|
;; Types:
|
|
;; "atom" — foo, 'Quoted Atom'
|
|
;; "var" — X, Foo, _Bar, _ (wildcard)
|
|
;; "integer" — 42, 16#FF, $c (char literal)
|
|
;; "float" — 3.14, 1.0e10
|
|
;; "string" — "..."
|
|
;; "keyword" — case of end if when receive after fun try catch
|
|
;; begin do let module export import define andalso orelse
|
|
;; not div rem bnot band bor bxor bsl bsr
|
|
;; "punct" — ( ) { } [ ] , ; . : :: -> <- <= => | ||
|
|
;; << >>
|
|
;; "op" — + - * / = == /= =:= =/= < > =< >= ++ -- ! ?
|
|
;; "eof"
|
|
|
|
(define er-make-token (fn (type value pos) {:pos pos :value value :type type}))
|
|
|
|
(define er-digit? (fn (c) (and (>= c "0") (<= c "9"))))
|
|
|
|
(define
|
|
er-hex-digit?
|
|
(fn
|
|
(c)
|
|
(or
|
|
(er-digit? c)
|
|
(and (>= c "a") (<= c "f"))
|
|
(and (>= c "A") (<= c "F")))))
|
|
|
|
(define er-lower? (fn (c) (and (>= c "a") (<= c "z"))))
|
|
|
|
(define er-upper? (fn (c) (and (>= c "A") (<= c "Z"))))
|
|
|
|
(define er-letter? (fn (c) (or (er-lower? c) (er-upper? c))))
|
|
|
|
(define
|
|
er-ident-char?
|
|
(fn (c) (or (er-letter? c) (er-digit? c) (= c "_") (= c "@"))))
|
|
|
|
(define er-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r"))))
|
|
|
|
;; Erlang reserved words — everything else starting lowercase is an atom
|
|
(define
|
|
er-keywords
|
|
(list
|
|
"after"
|
|
"and"
|
|
"andalso"
|
|
"band"
|
|
"begin"
|
|
"bnot"
|
|
"bor"
|
|
"bsl"
|
|
"bsr"
|
|
"bxor"
|
|
"case"
|
|
"catch"
|
|
"cond"
|
|
"div"
|
|
"end"
|
|
"fun"
|
|
"if"
|
|
"let"
|
|
"not"
|
|
"of"
|
|
"or"
|
|
"orelse"
|
|
"receive"
|
|
"rem"
|
|
"try"
|
|
"when"
|
|
"xor"))
|
|
|
|
(define er-keyword? (fn (word) (some (fn (k) (= k word)) er-keywords)))
|
|
|
|
(define
|
|
er-tokenize
|
|
(fn
|
|
(src)
|
|
(let
|
|
((tokens (list)) (pos 0) (src-len (len src)))
|
|
(define
|
|
er-peek
|
|
(fn
|
|
(offset)
|
|
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
|
|
(define er-cur (fn () (er-peek 0)))
|
|
(define er-advance! (fn (n) (set! pos (+ pos n))))
|
|
(define
|
|
skip-ws!
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (er-ws? (er-cur)))
|
|
(er-advance! 1)
|
|
(skip-ws!))))
|
|
(define
|
|
skip-comment!
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (not (= (er-cur) "\n")))
|
|
(er-advance! 1)
|
|
(skip-comment!))))
|
|
(define
|
|
read-ident-chars
|
|
(fn
|
|
(start)
|
|
(when
|
|
(and (< pos src-len) (er-ident-char? (er-cur)))
|
|
(er-advance! 1)
|
|
(read-ident-chars start))
|
|
(slice src start pos)))
|
|
(define
|
|
read-integer-digits
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (er-digit? (er-cur)))
|
|
(er-advance! 1)
|
|
(read-integer-digits))))
|
|
(define
|
|
read-hex-digits
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (er-hex-digit? (er-cur)))
|
|
(er-advance! 1)
|
|
(read-hex-digits))))
|
|
(define
|
|
read-number
|
|
(fn
|
|
(start)
|
|
(read-integer-digits)
|
|
(cond
|
|
(and
|
|
(< pos src-len)
|
|
(= (er-cur) "#")
|
|
(< (+ pos 1) src-len)
|
|
(er-hex-digit? (er-peek 1)))
|
|
(do (er-advance! 1) (read-hex-digits) {:value (slice src start pos) :type "integer"})
|
|
(and
|
|
(< pos src-len)
|
|
(= (er-cur) ".")
|
|
(< (+ pos 1) src-len)
|
|
(er-digit? (er-peek 1)))
|
|
(do
|
|
(er-advance! 1)
|
|
(read-integer-digits)
|
|
(when
|
|
(and
|
|
(< pos src-len)
|
|
(or (= (er-cur) "e") (= (er-cur) "E")))
|
|
(er-advance! 1)
|
|
(when
|
|
(and
|
|
(< pos src-len)
|
|
(or (= (er-cur) "+") (= (er-cur) "-")))
|
|
(er-advance! 1))
|
|
(read-integer-digits))
|
|
{:value (slice src start pos) :type "float"})
|
|
:else {:value (slice src start pos) :type "integer"})))
|
|
(define
|
|
read-string
|
|
(fn
|
|
(quote-char)
|
|
(let
|
|
((chars (list)))
|
|
(er-advance! 1)
|
|
(define
|
|
loop
|
|
(fn
|
|
()
|
|
(cond
|
|
(>= pos src-len)
|
|
nil
|
|
(= (er-cur) "\\")
|
|
(do
|
|
(er-advance! 1)
|
|
(when
|
|
(< pos src-len)
|
|
(let
|
|
((ch (er-cur)))
|
|
(cond
|
|
(= ch "n")
|
|
(append! chars "\n")
|
|
(= ch "t")
|
|
(append! chars "\t")
|
|
(= ch "r")
|
|
(append! chars "\r")
|
|
(= ch "\\")
|
|
(append! chars "\\")
|
|
(= ch "\"")
|
|
(append! chars "\"")
|
|
(= ch "'")
|
|
(append! chars "'")
|
|
:else (append! chars ch))
|
|
(er-advance! 1)))
|
|
(loop))
|
|
(= (er-cur) quote-char)
|
|
(er-advance! 1)
|
|
:else (do (append! chars (er-cur)) (er-advance! 1) (loop)))))
|
|
(loop)
|
|
(join "" chars))))
|
|
(define
|
|
er-emit!
|
|
(fn
|
|
(type value start)
|
|
(append! tokens (er-make-token type value start))))
|
|
(define
|
|
scan!
|
|
(fn
|
|
()
|
|
(skip-ws!)
|
|
(when
|
|
(< pos src-len)
|
|
(let
|
|
((ch (er-cur)) (start pos))
|
|
(cond
|
|
(= ch "%")
|
|
(do (skip-comment!) (scan!))
|
|
(er-digit? ch)
|
|
(do
|
|
(let
|
|
((tok (read-number start)))
|
|
(er-emit! (get tok :type) (get tok :value) start))
|
|
(scan!))
|
|
(= ch "$")
|
|
(do
|
|
(er-advance! 1)
|
|
(if
|
|
(and (< pos src-len) (= (er-cur) "\\"))
|
|
(do
|
|
(er-advance! 1)
|
|
(when (< pos src-len) (er-advance! 1)))
|
|
(when (< pos src-len) (er-advance! 1)))
|
|
(er-emit! "integer" (slice src start pos) start)
|
|
(scan!))
|
|
(er-lower? ch)
|
|
(do
|
|
(let
|
|
((word (read-ident-chars start)))
|
|
(er-emit!
|
|
(if (er-keyword? word) "keyword" "atom")
|
|
word
|
|
start))
|
|
(scan!))
|
|
(or (er-upper? ch) (= ch "_"))
|
|
(do
|
|
(let
|
|
((word (read-ident-chars start)))
|
|
(er-emit! "var" word start))
|
|
(scan!))
|
|
(= ch "'")
|
|
(do (er-emit! "atom" (read-string "'") start) (scan!))
|
|
(= ch "\"")
|
|
(do (er-emit! "string" (read-string "\"") start) (scan!))
|
|
(and (= ch "<") (= (er-peek 1) "<"))
|
|
(do (er-emit! "punct" "<<" start) (er-advance! 2) (scan!))
|
|
(and (= ch ">") (= (er-peek 1) ">"))
|
|
(do (er-emit! "punct" ">>" start) (er-advance! 2) (scan!))
|
|
(and (= ch "-") (= (er-peek 1) ">"))
|
|
(do (er-emit! "punct" "->" start) (er-advance! 2) (scan!))
|
|
(and (= ch "<") (= (er-peek 1) "-"))
|
|
(do (er-emit! "punct" "<-" start) (er-advance! 2) (scan!))
|
|
(and (= ch "<") (= (er-peek 1) "="))
|
|
(do (er-emit! "punct" "<=" start) (er-advance! 2) (scan!))
|
|
(and (= ch "=") (= (er-peek 1) ">"))
|
|
(do (er-emit! "punct" "=>" start) (er-advance! 2) (scan!))
|
|
(and (= ch "=") (= (er-peek 1) ":") (= (er-peek 2) "="))
|
|
(do (er-emit! "op" "=:=" start) (er-advance! 3) (scan!))
|
|
(and (= ch "=") (= (er-peek 1) "/") (= (er-peek 2) "="))
|
|
(do (er-emit! "op" "=/=" start) (er-advance! 3) (scan!))
|
|
(and (= ch "=") (= (er-peek 1) "="))
|
|
(do (er-emit! "op" "==" start) (er-advance! 2) (scan!))
|
|
(and (= ch "/") (= (er-peek 1) "="))
|
|
(do (er-emit! "op" "/=" start) (er-advance! 2) (scan!))
|
|
(and (= ch "=") (= (er-peek 1) "<"))
|
|
(do (er-emit! "op" "=<" start) (er-advance! 2) (scan!))
|
|
(and (= ch ">") (= (er-peek 1) "="))
|
|
(do (er-emit! "op" ">=" start) (er-advance! 2) (scan!))
|
|
(and (= ch "+") (= (er-peek 1) "+"))
|
|
(do (er-emit! "op" "++" start) (er-advance! 2) (scan!))
|
|
(and (= ch "-") (= (er-peek 1) "-"))
|
|
(do (er-emit! "op" "--" start) (er-advance! 2) (scan!))
|
|
(and (= ch ":") (= (er-peek 1) ":"))
|
|
(do (er-emit! "punct" "::" start) (er-advance! 2) (scan!))
|
|
(and (= ch "|") (= (er-peek 1) "|"))
|
|
(do (er-emit! "punct" "||" start) (er-advance! 2) (scan!))
|
|
(= ch "(")
|
|
(do (er-emit! "punct" "(" start) (er-advance! 1) (scan!))
|
|
(= ch ")")
|
|
(do (er-emit! "punct" ")" start) (er-advance! 1) (scan!))
|
|
(= ch "{")
|
|
(do (er-emit! "punct" "{" start) (er-advance! 1) (scan!))
|
|
(= ch "}")
|
|
(do (er-emit! "punct" "}" start) (er-advance! 1) (scan!))
|
|
(= ch "[")
|
|
(do (er-emit! "punct" "[" start) (er-advance! 1) (scan!))
|
|
(= ch "]")
|
|
(do (er-emit! "punct" "]" start) (er-advance! 1) (scan!))
|
|
(= ch ",")
|
|
(do (er-emit! "punct" "," start) (er-advance! 1) (scan!))
|
|
(= ch ";")
|
|
(do (er-emit! "punct" ";" start) (er-advance! 1) (scan!))
|
|
(= ch ".")
|
|
(do (er-emit! "punct" "." start) (er-advance! 1) (scan!))
|
|
(= ch ":")
|
|
(do (er-emit! "punct" ":" start) (er-advance! 1) (scan!))
|
|
(= ch "|")
|
|
(do (er-emit! "punct" "|" start) (er-advance! 1) (scan!))
|
|
(= ch "+")
|
|
(do (er-emit! "op" "+" start) (er-advance! 1) (scan!))
|
|
(= ch "-")
|
|
(do (er-emit! "op" "-" start) (er-advance! 1) (scan!))
|
|
(= ch "*")
|
|
(do (er-emit! "op" "*" start) (er-advance! 1) (scan!))
|
|
(= ch "/")
|
|
(do (er-emit! "op" "/" start) (er-advance! 1) (scan!))
|
|
(= ch "=")
|
|
(do (er-emit! "op" "=" start) (er-advance! 1) (scan!))
|
|
(= ch "<")
|
|
(do (er-emit! "op" "<" start) (er-advance! 1) (scan!))
|
|
(= ch ">")
|
|
(do (er-emit! "op" ">" start) (er-advance! 1) (scan!))
|
|
(= ch "!")
|
|
(do (er-emit! "op" "!" start) (er-advance! 1) (scan!))
|
|
(= ch "?")
|
|
(do (er-emit! "op" "?" start) (er-advance! 1) (scan!))
|
|
:else (do (er-advance! 1) (scan!)))))))
|
|
(scan!)
|
|
(er-emit! "eof" nil pos)
|
|
tokens)))
|