;; Erlang tokenizer — produces token stream from Erlang source ;; ;; Tokens: {:type T :value V :pos P} ;; Types: ;; "atom" — foo, 'Quoted Atom' ;; "var" — X, Foo, _Bar, _ (wildcard) ;; "integer" — 42, 16#FF, $c (char literal) ;; "float" — 3.14, 1.0e10 ;; "string" — "..." ;; "keyword" — case of end if when receive after fun try catch ;; begin do let module export import define andalso orelse ;; not div rem bnot band bor bxor bsl bsr ;; "punct" — ( ) { } [ ] , ; . : :: -> <- <= => | || ;; << >> ;; "op" — + - * / = == /= =:= =/= < > =< >= ++ -- ! ? ;; "eof" (define er-make-token (fn (type value pos) {:pos pos :value value :type type})) (define er-digit? (fn (c) (and (>= c "0") (<= c "9")))) (define er-hex-digit? (fn (c) (or (er-digit? c) (and (>= c "a") (<= c "f")) (and (>= c "A") (<= c "F"))))) (define er-lower? (fn (c) (and (>= c "a") (<= c "z")))) (define er-upper? (fn (c) (and (>= c "A") (<= c "Z")))) (define er-letter? (fn (c) (or (er-lower? c) (er-upper? c)))) (define er-ident-char? (fn (c) (or (er-letter? c) (er-digit? c) (= c "_") (= c "@")))) (define er-ws? (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))) ;; Erlang reserved words — everything else starting lowercase is an atom (define er-keywords (list "after" "and" "andalso" "band" "begin" "bnot" "bor" "bsl" "bsr" "bxor" "case" "catch" "cond" "div" "end" "fun" "if" "let" "not" "of" "or" "orelse" "receive" "rem" "try" "when" "xor")) (define er-keyword? (fn (word) (some (fn (k) (= k word)) er-keywords))) (define er-tokenize (fn (src) (let ((tokens (list)) (pos 0) (src-len (len src))) (define er-peek (fn (offset) (if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil))) (define er-cur (fn () (er-peek 0))) (define er-advance! (fn (n) (set! pos (+ pos n)))) (define skip-ws! (fn () (when (and (< pos src-len) (er-ws? (er-cur))) (er-advance! 1) (skip-ws!)))) (define skip-comment! (fn () (when (and (< pos src-len) (not (= (er-cur) "\n"))) (er-advance! 1) (skip-comment!)))) (define read-ident-chars (fn (start) (when (and (< pos src-len) (er-ident-char? (er-cur))) (er-advance! 1) (read-ident-chars start)) (slice src start pos))) (define read-integer-digits (fn () (when (and (< pos src-len) (er-digit? (er-cur))) (er-advance! 1) (read-integer-digits)))) (define read-hex-digits (fn () (when (and (< pos src-len) (er-hex-digit? (er-cur))) (er-advance! 1) (read-hex-digits)))) (define read-number (fn (start) (read-integer-digits) (cond (and (< pos src-len) (= (er-cur) "#") (< (+ pos 1) src-len) (er-hex-digit? (er-peek 1))) (do (er-advance! 1) (read-hex-digits) {:value (slice src start pos) :type "integer"}) (and (< pos src-len) (= (er-cur) ".") (< (+ pos 1) src-len) (er-digit? (er-peek 1))) (do (er-advance! 1) (read-integer-digits) (when (and (< pos src-len) (or (= (er-cur) "e") (= (er-cur) "E"))) (er-advance! 1) (when (and (< pos src-len) (or (= (er-cur) "+") (= (er-cur) "-"))) (er-advance! 1)) (read-integer-digits)) {:value (slice src start pos) :type "float"}) :else {:value (slice src start pos) :type "integer"}))) (define read-string (fn (quote-char) (let ((chars (list))) (er-advance! 1) (define loop (fn () (cond (>= pos src-len) nil (= (er-cur) "\\") (do (er-advance! 1) (when (< pos src-len) (let ((ch (er-cur))) (cond (= ch "n") (append! chars "\n") (= ch "t") (append! chars "\t") (= ch "r") (append! chars "\r") (= ch "\\") (append! chars "\\") (= ch "\"") (append! chars "\"") (= ch "'") (append! chars "'") :else (append! chars ch)) (er-advance! 1))) (loop)) (= (er-cur) quote-char) (er-advance! 1) :else (do (append! chars (er-cur)) (er-advance! 1) (loop))))) (loop) (join "" chars)))) (define er-emit! (fn (type value start) (append! tokens (er-make-token type value start)))) (define scan! (fn () (skip-ws!) (when (< pos src-len) (let ((ch (er-cur)) (start pos)) (cond (= ch "%") (do (skip-comment!) (scan!)) (er-digit? ch) (do (let ((tok (read-number start))) (er-emit! (get tok :type) (get tok :value) start)) (scan!)) (= ch "$") (do (er-advance! 1) (if (and (< pos src-len) (= (er-cur) "\\")) (do (er-advance! 1) (when (< pos src-len) (er-advance! 1))) (when (< pos src-len) (er-advance! 1))) (er-emit! "integer" (slice src start pos) start) (scan!)) (er-lower? ch) (do (let ((word (read-ident-chars start))) (er-emit! (if (er-keyword? word) "keyword" "atom") word start)) (scan!)) (or (er-upper? ch) (= ch "_")) (do (let ((word (read-ident-chars start))) (er-emit! "var" word start)) (scan!)) (= ch "'") (do (er-emit! "atom" (read-string "'") start) (scan!)) (= ch "\"") (do (er-emit! "string" (read-string "\"") start) (scan!)) (and (= ch "<") (= (er-peek 1) "<")) (do (er-emit! "punct" "<<" start) (er-advance! 2) (scan!)) (and (= ch ">") (= (er-peek 1) ">")) (do (er-emit! "punct" ">>" start) (er-advance! 2) (scan!)) (and (= ch "-") (= (er-peek 1) ">")) (do (er-emit! "punct" "->" start) (er-advance! 2) (scan!)) (and (= ch "<") (= (er-peek 1) "-")) (do (er-emit! "punct" "<-" start) (er-advance! 2) (scan!)) (and (= ch "<") (= (er-peek 1) "=")) (do (er-emit! "punct" "<=" start) (er-advance! 2) (scan!)) (and (= ch "=") (= (er-peek 1) ">")) (do (er-emit! "punct" "=>" start) (er-advance! 2) (scan!)) (and (= ch "=") (= (er-peek 1) ":") (= (er-peek 2) "=")) (do (er-emit! "op" "=:=" start) (er-advance! 3) (scan!)) (and (= ch "=") (= (er-peek 1) "/") (= (er-peek 2) "=")) (do (er-emit! "op" "=/=" start) (er-advance! 3) (scan!)) (and (= ch "=") (= (er-peek 1) "=")) (do (er-emit! "op" "==" start) (er-advance! 2) (scan!)) (and (= ch "/") (= (er-peek 1) "=")) (do (er-emit! "op" "/=" start) (er-advance! 2) (scan!)) (and (= ch "=") (= (er-peek 1) "<")) (do (er-emit! "op" "=<" start) (er-advance! 2) (scan!)) (and (= ch ">") (= (er-peek 1) "=")) (do (er-emit! "op" ">=" start) (er-advance! 2) (scan!)) (and (= ch "+") (= (er-peek 1) "+")) (do (er-emit! "op" "++" start) (er-advance! 2) (scan!)) (and (= ch "-") (= (er-peek 1) "-")) (do (er-emit! "op" "--" start) (er-advance! 2) (scan!)) (and (= ch ":") (= (er-peek 1) ":")) (do (er-emit! "punct" "::" start) (er-advance! 2) (scan!)) (and (= ch "|") (= (er-peek 1) "|")) (do (er-emit! "punct" "||" start) (er-advance! 2) (scan!)) (= ch "(") (do (er-emit! "punct" "(" start) (er-advance! 1) (scan!)) (= ch ")") (do (er-emit! "punct" ")" start) (er-advance! 1) (scan!)) (= ch "{") (do (er-emit! "punct" "{" start) (er-advance! 1) (scan!)) (= ch "}") (do (er-emit! "punct" "}" start) (er-advance! 1) (scan!)) (= ch "[") (do (er-emit! "punct" "[" start) (er-advance! 1) (scan!)) (= ch "]") (do (er-emit! "punct" "]" start) (er-advance! 1) (scan!)) (= ch ",") (do (er-emit! "punct" "," start) (er-advance! 1) (scan!)) (= ch ";") (do (er-emit! "punct" ";" start) (er-advance! 1) (scan!)) (= ch ".") (do (er-emit! "punct" "." start) (er-advance! 1) (scan!)) (= ch ":") (do (er-emit! "punct" ":" start) (er-advance! 1) (scan!)) (= ch "|") (do (er-emit! "punct" "|" start) (er-advance! 1) (scan!)) (= ch "+") (do (er-emit! "op" "+" start) (er-advance! 1) (scan!)) (= ch "-") (do (er-emit! "op" "-" start) (er-advance! 1) (scan!)) (= ch "*") (do (er-emit! "op" "*" start) (er-advance! 1) (scan!)) (= ch "/") (do (er-emit! "op" "/" start) (er-advance! 1) (scan!)) (= ch "=") (do (er-emit! "op" "=" start) (er-advance! 1) (scan!)) (= ch "<") (do (er-emit! "op" "<" start) (er-advance! 1) (scan!)) (= ch ">") (do (er-emit! "op" ">" start) (er-advance! 1) (scan!)) (= ch "!") (do (er-emit! "op" "!" start) (er-advance! 1) (scan!)) (= ch "?") (do (er-emit! "op" "?" start) (er-advance! 1) (scan!)) :else (do (er-advance! 1) (scan!))))))) (scan!) (er-emit! "eof" nil pos) tokens)))