Lexer: js-regex-context? disambiguates / based on prior token;
read-regex handles [...] classes and \ escapes. Emits
{:type "regex" :value {:pattern :flags}}.
Parser: new primary branch → (js-regex pat flags).
Transpile: (js-regex-new pat flags).
Runtime: js-regex? predicate, js-regex-new builds tagged dict with
source/flags/global/ignoreCase/multiline/sticky/unicode/dotAll/
hasIndices/lastIndex. js-regex-invoke-method dispatches .test/.exec/
.toString. js-invoke-method detects regex receivers. Stub engine
uses js-string-index-of; __js_regex_platform__ + override! let a
real engine plug in later.
Runner: repeatable --filter flags (OR'd).
308/310 unit (+30 regex tests), 148/148 slice unchanged.
610 lines
22 KiB
Plaintext
610 lines
22 KiB
Plaintext
;; lib/js/lexer.sx — JavaScript source → token stream
|
|
;;
|
|
;; Tokens: {:type T :value V :pos P}
|
|
;; Types:
|
|
;; "number" — numeric literals (decoded into value as number)
|
|
;; "string" — string literals (decoded, escape sequences processed)
|
|
;; "template" — template literal: a plain decoded string when there is no ${...},
;;              otherwise a list of ("str" text) / ("expr" source) parts
|
|
;; "ident" — identifier (not a reserved word)
|
|
;; "keyword" — reserved word
|
|
;; "punct" — ( ) [ ] { } , ; : . ...
|
|
;; "op" — all operator tokens (incl. = == === !== < > etc.)
|
|
;; "regex" — regex literal; value is {:pattern P :flags F}
;; "eof" — end of input
|
|
;;
|
|
;; NOTE: `cond` clauses take exactly ONE body expression — multi-body
|
|
;; clauses must wrap their body in `(do ...)`.
|
|
|
|
;; ── Token constructor ─────────────────────────────────────────────
|
|
;; Build one token record. `type` is the token kind string, `value` the
;; payload for that token, `pos` the source offset where the token starts.
(define
  js-make-token
  (fn
    (type value pos)
    {:type type :value value :pos pos}))
|
|
|
|
;; ── Character predicates ──────────────────────────────────────────
|
|
;; True when `c` sorts between "0" and "9" inclusive, i.e. is a decimal digit.
(define
  js-digit?
  (fn
    (c)
    (not (or (< c "0") (> c "9")))))
|
|
|
|
;; True when `c` is a hexadecimal digit: 0-9, a-f or A-F.
(define
  js-hex-digit?
  (fn
    (c)
    (cond
      ((js-digit? c) true)
      ((and (>= c "a") (<= c "f")) true)
      (else (and (>= c "A") (<= c "F"))))))
|
|
|
|
;; True when `c` is an ASCII letter (a-z or A-Z).
(define
  js-letter?
  (fn
    (c)
    (if
      (and (>= c "a") (<= c "z"))
      true
      (and (>= c "A") (<= c "Z")))))
|
|
|
|
;; True when `c` may begin an identifier: a letter, "_" or "$".
(define
  js-ident-start?
  (fn
    (c)
    (if
      (js-letter? c)
      true
      (or (= c "_") (= c "$")))))
|
|
|
|
;; True when `c` may continue an identifier: an identifier-start character
;; or a decimal digit.
(define
  js-ident-char?
  (fn
    (c)
    (if
      (js-ident-start? c)
      true
      (js-digit? c))))
|
|
|
|
;; True when `c` is one of the four whitespace characters the lexer skips:
;; space, tab, newline, carriage return.
(define
  js-ws?
  (fn
    (c)
    (contains? (list " " "\t" "\n" "\r") c)))
|
|
|
|
;; ── Reserved words ────────────────────────────────────────────────
|
|
;; Words that js-keyword? reports as reserved; the tokenizer emits these as
;; "keyword" tokens instead of "ident". Order is kept as originally listed.
(define
  js-keywords
  (list
    "break" "case" "catch" "class" "const" "continue"
    "debugger" "default" "delete" "do" "else" "export"
    "extends" "false" "finally" "for" "function" "if"
    "import" "in" "instanceof" "new" "null" "return"
    "super" "switch" "this" "throw" "true" "try"
    "typeof" "undefined" "var" "void" "while" "with"
    "yield" "let" "static" "async" "await" "of"))
|
|
|
|
;; Membership test against js-keywords: is `word` treated as reserved?
(define
  js-keyword?
  (fn
    (word)
    (contains? js-keywords word)))
|
|
|
|
;; ── Main tokenizer ────────────────────────────────────────────────
|
|
(define
|
|
js-tokenize
|
|
(fn
|
|
(src)
|
|
(let
|
|
((tokens (list)) (pos 0) (src-len (len src)))
|
|
;; Character at `offset` positions past the cursor, or nil when that index
;; is at or beyond the end of the source.
(define
  js-peek
  (fn
    (offset)
    (let
      ((idx (+ pos offset)))
      (if (>= idx src-len) nil (nth src idx)))))
|
|
;; Character under the cursor, or nil once the cursor is at end of input.
(define
  cur
  (fn
    ()
    (if (< pos src-len) (nth src pos) nil)))
|
|
;; Move the cursor forward by `n` characters (no bounds check is done here).
(define
  advance!
  (fn
    (n)
    (let
      ((next-pos (+ pos n)))
      (set! pos next-pos))))
|
|
;; True when the source text starting at the cursor is exactly `s`.
;; Returns false (without slicing) when fewer than (len s) characters remain.
(define
  at?
  (fn
    (s)
    (let
      ((stop (+ pos (len s))))
      (if
        (> stop src-len)
        false
        (= (slice src pos stop) s)))))
|
|
;; Append a new token of kind `type` with payload `value`, positioned at
;; source offset `start`, to the output token list.
(define
  js-emit!
  (fn
    (type value start)
    (let
      ((tok (js-make-token type value start)))
      (append! tokens tok))))
|
|
;; Advance the cursor up to (not past) the next "\n", or to end of input.
;; The caller has already consumed the leading "//".
(define
  skip-line-comment!
  (fn
    ()
    (cond
      ((>= pos src-len) nil)
      ((= (cur) "\n") nil)
      (else (do (advance! 1) (skip-line-comment!))))))
|
|
;; Advance the cursor past the closing "*/" of a block comment. An
;; unterminated comment simply runs to end of input. The caller has already
;; consumed the opening "/*".
(define
  skip-block-comment!
  (fn
    ()
    (when
      (< pos src-len)
      (if
        (at? "*/")
        (advance! 2)
        (do (advance! 1) (skip-block-comment!))))))
|
|
;; Skip any run of whitespace, "//" line comments and "/* */" block comments,
;; leaving the cursor on the first character of the next token (or at end).
(define
  skip-ws!
  (fn
    ()
    (when
      (< pos src-len)
      (cond
        ((js-ws? (cur)) (do (advance! 1) (skip-ws!)))
        ((at? "//")
          (do (advance! 2) (skip-line-comment!) (skip-ws!)))
        ((at? "/*")
          (do (advance! 2) (skip-block-comment!) (skip-ws!)))
        (else nil)))))
|
|
;; Advance the cursor past every consecutive identifier character.
;; Kept separate from read-ident so the recursion is a pure tail call and
;; does no slicing of its own.
(define
  skip-ident-chars!
  (fn
    ()
    (when
      (and (< pos src-len) (js-ident-char? (cur)))
      (do (advance! 1) (skip-ident-chars!)))))
;; Consume an identifier and return its text, i.e. the source from `start`
;; up to the first non-identifier character at or after the cursor.
;; The text is sliced exactly once; the previous version re-sliced the
;; source in every recursive frame, which made long identifiers quadratic.
(define
  read-ident
  (fn
    (start)
    (do (skip-ident-chars!) (slice src start pos))))
|
|
;; Advance the cursor past a (possibly empty) run of decimal digits.
(define
  read-decimal-digits!
  (fn
    ()
    (if
      (and (< pos src-len) (js-digit? (cur)))
      (do (advance! 1) (read-decimal-digits!))
      nil)))
|
|
;; Advance the cursor past a (possibly empty) run of hexadecimal digits.
(define
  read-hex-digits!
  (fn
    ()
    (cond
      ((>= pos src-len) nil)
      ((js-hex-digit? (cur)) (do (advance! 1) (read-hex-digits!)))
      (else nil))))
|
|
;; Consume an optional exponent suffix: "e" or "E", an optional sign, then
;; digits. Nothing is consumed unless the marker is followed by a digit, or
;; by a sign that is itself followed by a digit.
(define
  read-exp-part!
  (fn
    ()
    (let
      ((marker (cur)) (after (js-peek 1)))
      (when
        (or (= marker "e") (= marker "E"))
        (let
          ((signed (or (= after "+") (= after "-"))))
          (when
            (if
              signed
              ;; sign must be backed by at least one digit
              (and (< (+ pos 2) src-len) (js-digit? (js-peek 2)))
              (and (not (= after nil)) (js-digit? after)))
            (do
              (advance! 1)
              (when signed (advance! 1))
              (read-decimal-digits!))))))))
|
|
;; Consume a numeric literal beginning at `start` (cursor on its first
;; digit) and return the result of parse-number on its text.
;;   - "0x"/"0X" prefix: hex digits, parsed as "0x" + digits.
;;   - otherwise: integer digits, an optional ".digits" fraction (the dot is
;;     only taken when a digit follows it), and an optional exponent.
(define
  read-number
  (fn
    (start)
    (if
      (or (at? "0x") (at? "0X"))
      (do
        (advance! 2)
        (read-hex-digits!)
        (parse-number (str "0x" (slice src (+ start 2) pos))))
      (do
        (read-decimal-digits!)
        (let
          ((after-dot (js-peek 1)))
          (when
            (and
              (= (cur) ".")
              (not (= after-dot nil))
              (js-digit? after-dot))
            (do (advance! 1) (read-decimal-digits!))))
        (read-exp-part!)
        (parse-number (slice src start pos))))))
|
|
;; Consume a literal of the form ".digits[exponent]" whose leading dot is at
;; `start` (and under the cursor), returning parse-number of its text.
(define
  read-dot-number
  (fn
    (start)
    (do
      (advance! 1)
      (read-decimal-digits!)
      (read-exp-part!)
      (let
        ((text (slice src start pos)))
        (parse-number text)))))
|
|
;; Consume a quoted string literal whose opening quote is under the cursor
;; and return its decoded contents. `quote-char` is the delimiter to stop at.
;; An unterminated string runs to end of input.
(define
  read-string
  (fn
    (quote-char)
    (let
      ((out (list)))
      (advance! 1)
      ;; Text contributed by the character following a backslash.
      ;; NOTE(review): \0 \b \f \v are kept as a backslash plus the letter
      ;; rather than decoded to control characters; any other character
      ;; (including quotes and backslash) stands for itself.
      (define
        string-escape-text
        (fn
          (ch)
          (cond
            ((= ch "n") "\n")
            ((= ch "t") "\t")
            ((= ch "r") "\r")
            ((= ch "0") "\\0")
            ((= ch "b") "\\b")
            ((= ch "f") "\\f")
            ((= ch "v") "\\v")
            (else ch))))
      (define
        string-loop
        (fn
          ()
          (when
            (< pos src-len)
            (let
              ((c (cur)))
              (cond
                ((= c "\\")
                  (do
                    (advance! 1)
                    ;; a trailing backslash at end of input contributes nothing
                    (when
                      (< pos src-len)
                      (do
                        (append! out (string-escape-text (cur)))
                        (advance! 1)))
                    (string-loop)))
                ((= c quote-char) (advance! 1))
                (else (do (append! out c) (advance! 1) (string-loop))))))))
      (string-loop)
      (join "" out))))
|
|
;; Consume a template literal whose opening backtick is under the cursor.
;; Returns:
;;   - ""                      when the template produced no parts,
;;   - the decoded string      when it is a single text run with no ${...},
;;   - a list of parts         otherwise, each ("str" text) or ("expr" source),
;;     where `source` is the raw, un-lexed text between "${" and its "}".
(define
  read-template
  (fn
    ()
    (let
      ((parts (list)) (pending (list)))
      (advance! 1)
      ;; Turn the accumulated text characters into a "str" part, if any.
      (define
        flush-pending!
        (fn
          ()
          (if
            (= (len pending) 0)
            nil
            (do
              (append! parts (list "str" (join "" pending)))
              (set! pending (list))))))
      ;; Collect the raw source of one ${...} body. The cursor starts just
      ;; after "${" and ends just after the matching "}". Braces are counted;
      ;; single- and double-quoted strings are copied verbatim so braces
      ;; inside them are not counted.
      (define
        read-expr-source!
        (fn
          ()
          (let
            ((out (list)) (depth 1))
            ;; copy the current character to the buffer and step past it
            (define
              take!
              (fn () (do (append! out (cur)) (advance! 1))))
            ;; copy the remainder of a quoted string, through closing `q`
            (define
              copy-quoted!
              (fn
                (q)
                (cond
                  ((>= pos src-len) nil)
                  ((= (cur) "\\")
                    (do
                      (take!)
                      (when (< pos src-len) (take!))
                      (copy-quoted! q)))
                  ((= (cur) q) (take!))
                  (else (do (take!) (copy-quoted! q))))))
            (define
              expr-loop
              (fn
                ()
                (when
                  (< pos src-len)
                  (let
                    ((c (cur)))
                    (cond
                      ((= c "}")
                        (if
                          (= depth 1)
                          ;; the brace closing ${...}: consume, do not copy
                          (advance! 1)
                          (do
                            (set! depth (- depth 1))
                            (take!)
                            (expr-loop))))
                      ((= c "{")
                        (do (set! depth (+ depth 1)) (take!) (expr-loop)))
                      ((or (= c "\"") (= c "'"))
                        (do (take!) (copy-quoted! c) (expr-loop)))
                      (else (do (take!) (expr-loop))))))))
            (expr-loop)
            (join "" out))))
      (define
        template-loop
        (fn
          ()
          (cond
            ((>= pos src-len) nil)
            ((= (cur) "`") (advance! 1))
            ((at? "${")
              (do
                (flush-pending!)
                (advance! 2)
                (append! parts (list "expr" (read-expr-source!)))
                (template-loop)))
            ((= (cur) "\\")
              (do
                (advance! 1)
                (when
                  (< pos src-len)
                  (let
                    ((ch (cur)))
                    (do
                      ;; Only \n \t \r are decoded; every other escaped
                      ;; character (quotes, backslash, "$", and also
                      ;; 0 b f v) contributes itself.
                      (append!
                        pending
                        (cond
                          ((= ch "n") "\n")
                          ((= ch "t") "\t")
                          ((= ch "r") "\r")
                          (else ch)))
                      (advance! 1))))
                (template-loop)))
            (else
              (do (append! pending (cur)) (advance! 1) (template-loop))))))
      (template-loop)
      (flush-pending!)
      (cond
        ((= (len parts) 0) "")
        ((and (= (len parts) 1) (= (nth (nth parts 0) 0) "str"))
          (nth (nth parts 0) 1))
        (else parts)))))
|
|
;; True when a "/" directly after token `tk` would begin a regex literal,
;; judged from that single token:
;;   - punct: yes, except after ")" and "]"
;;   - op: yes
;;   - keyword: only after the listed expression-introducing keywords
;;   - anything else (ident, number, string, template, regex): no
(define
  js-regex-allowed-after?
  (fn
    (tk)
    (let
      ((ty (dict-get tk "type")) (vv (dict-get tk "value")))
      (cond
        ((= ty "punct") (and (not (= vv ")")) (not (= vv "]"))))
        ((= ty "op") true)
        ((= ty "keyword")
          (contains?
            (list
              "return"
              "typeof"
              "in"
              "of"
              "throw"
              "new"
              "delete"
              "instanceof"
              "void"
              "yield"
              "await"
              "case"
              "do"
              "else")
            vv))
        (else false)))))
;; Decide whether a "/" at the cursor starts a regex literal (true) or is a
;; division operator (false), based on the tokens emitted so far.
;; With no prior token a regex is assumed.
;; "++" and "--" are special-cased: when the token before them ends an
;; operand they are postfix, so `i++ / 2` is division; otherwise they are
;; prefix and the ordinary rule for operators applies.
(define
  js-regex-context?
  (fn
    ()
    (let
      ((n (len tokens)))
      (if
        (= n 0)
        true
        (let
          ((tk (nth tokens (- n 1))))
          (let
            ((ty (dict-get tk "type")) (vv (dict-get tk "value")))
            (if
              (and (= ty "op") (or (= vv "++") (= vv "--")) (> n 1))
              ;; postfix iff the preceding token could not precede a regex
              (js-regex-allowed-after? (nth tokens (- n 2)))
              (js-regex-allowed-after? tk))))))))
|
|
;; Consume a regex literal whose opening "/" is under the cursor and return
;; {:pattern P :flags F}. P is the raw body text (escapes kept verbatim),
;; F the run of identifier characters after the closing "/".
;; A "/" inside a [...] class does not end the body, and a backslash always
;; takes the following character with it.
;; The body also ends, without consuming anything further, at a line
;; terminator or end of input: a regex literal cannot span lines, so this
;; keeps a misdetected or unterminated regex from swallowing the rest of the
;; source.
(define
  read-regex
  (fn
    ()
    (let
      ((buf (list)) (in-class false))
      (advance! 1)
      (define
        regex-line-end?
        (fn () (or (= (cur) "\n") (= (cur) "\r"))))
      (define
        body-loop
        (fn
          ()
          (cond
            ((>= pos src-len) nil)
            ((regex-line-end?) nil)
            ((= (cur) "\\")
              (do
                (append! buf (cur))
                (advance! 1)
                ;; keep the escaped character, but never a line terminator
                (when
                  (and (< pos src-len) (not (regex-line-end?)))
                  (do (append! buf (cur)) (advance! 1)))
                (body-loop)))
            ((= (cur) "[")
              (do
                (set! in-class true)
                (append! buf (cur))
                (advance! 1)
                (body-loop)))
            ((= (cur) "]")
              (do
                (set! in-class false)
                (append! buf (cur))
                (advance! 1)
                (body-loop)))
            ((and (= (cur) "/") (not in-class)) (advance! 1))
            (else (do (append! buf (cur)) (advance! 1) (body-loop))))))
      (body-loop)
      (let
        ((flags-buf (list)))
        (define
          flags-loop
          (fn
            ()
            (when
              (and (< pos src-len) (js-ident-char? (cur)))
              (do
                (append! flags-buf (cur))
                (advance! 1)
                (flags-loop)))))
        (flags-loop)
        {:pattern (join "" buf) :flags (join "" flags-buf)}))))
|
|
;; Try to lex the only four-character operator, ">>>=", at the cursor.
;; On a match, emits it as an "op" token at `start`, advances past it and
;; returns true; otherwise returns false and consumes nothing.
(define
  try-op-4!
  (fn
    (start)
    (if
      (at? ">>>=")
      (do (js-emit! "op" ">>>=" start) (advance! 4) true)
      false)))
|
|
;; Try to lex a three-character token at the cursor. On a match, emits it at
;; `start` ("..." as "punct", everything else as "op"), advances three
;; characters and returns true; otherwise returns false and consumes nothing.
(define
  try-op-3!
  (fn
    (start)
    (let
      ((candidates
         (list "===" "!==" "**=" "<<=" ">>=" ">>>" "&&=" "||=" "??=" "...")))
      ;; first candidate, scanning from index i, that matches at the cursor
      (define
        first-match-3
        (fn
          (i)
          (cond
            ((>= i (len candidates)) nil)
            ((at? (nth candidates i)) (nth candidates i))
            (else (first-match-3 (+ i 1))))))
      (let
        ((hit (first-match-3 0)))
        (if
          (= hit nil)
          false
          (do
            (js-emit! (if (= hit "...") "punct" "op") hit start)
            (advance! 3)
            true))))))
|
|
;; Try to lex a two-character operator at the cursor. On a match, emits it
;; as an "op" token at `start`, advances two characters and returns true;
;; otherwise returns false and consumes nothing.
;; "?." is only accepted when it is NOT followed by a decimal digit, so that
;; `a?.5:b` lexes as `?` followed by the number `.5` (the ECMAScript
;; lookahead restriction on the optional-chaining punctuator).
(define
  try-op-2!
  (fn
    (start)
    (let
      ((candidates
         (list
           "==" "!=" "<=" ">=" "&&" "||" "??" "=>" "**" "<<" ">>"
           "++" "--" "+=" "-=" "*=" "/=" "%=" "&=" "|=" "^=")))
      ;; first candidate, scanning from index i, that matches at the cursor
      (define
        first-match-2
        (fn
          (i)
          (cond
            ((>= i (len candidates)) nil)
            ((at? (nth candidates i)) (nth candidates i))
            (else (first-match-2 (+ i 1))))))
      (let
        ((hit (first-match-2 0)))
        (cond
          ((not (= hit nil))
            (do (js-emit! "op" hit start) (advance! 2) true))
          ((and
             (at? "?.")
             (not (and (< (+ pos 2) src-len) (js-digit? (js-peek 2)))))
            (do (js-emit! "op" "?." start) (advance! 2) true))
          (else false))))))
|
|
;; Lex the single character `ch` at the cursor: emit it at `start` as a
;; "punct" or "op" token according to the two tables below, then advance one
;; character. A character in neither table is skipped without emitting
;; anything.
(define
  emit-one-op!
  (fn
    (ch start)
    (let
      ((kind
         (cond
           ((contains? (list "(" ")" "[" "]" "{" "}" "," ";" ":" ".") ch)
             "punct")
           ((contains?
              (list "?" "+" "-" "*" "/" "%" "=" "<" ">" "!" "&" "|" "^" "~")
              ch)
             "op")
           (else nil))))
      (do
        (when (not (= kind nil)) (js-emit! kind ch start))
        (advance! 1)))))
|
|
;; Main loop: skip whitespace/comments, lex exactly one token at the cursor,
;; then recurse until end of input. Dispatch order matters:
;; strings, templates, numbers (including ".5"), identifiers/keywords,
;; regex literals (only where js-regex-context? allows), then operators from
;; longest (4 chars) to shortest (1 char).
(define
  scan!
  (fn
    ()
    (do
      (skip-ws!)
      (if
        (>= pos src-len)
        nil
        (let
          ((start pos) (ch (cur)))
          (do
            (cond
              ((or (= ch "\"") (= ch "'"))
                (js-emit! "string" (read-string ch) start))
              ((= ch "`") (js-emit! "template" (read-template) start))
              ((js-digit? ch) (js-emit! "number" (read-number start) start))
              ((and (= ch ".") (< (+ pos 1) src-len) (js-digit? (js-peek 1)))
                (js-emit! "number" (read-dot-number start) start))
              ((js-ident-start? ch)
                (let
                  ((word (read-ident start)))
                  (js-emit!
                    (if (js-keyword? word) "keyword" "ident")
                    word
                    start)))
              ((and (= ch "/") (js-regex-context?))
                (js-emit! "regex" (read-regex) start))
              ;; the try-op-N! helpers emit and advance themselves on success
              ((try-op-4! start) nil)
              ((try-op-3! start) nil)
              ((try-op-2! start) nil)
              (else (emit-one-op! ch start)))
            (scan!)))))))
|
|
(scan!)
|
|
(js-emit! "eof" nil pos)
|
|
tokens)))
|