Files
rose-ash/lib/js/lexer.sx
giles 081f934cad
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 1m3s
js-on-sx: lexer handles \uXXXX and \xXX string escapes
read-string fell through to the literal-char branch for \u and \x,
silently stripping the backslash ("\u0041".length returned 5 instead
of 1). Added js-hex-value helper and two cond clauses that read the
hex digits via js-peek + js-hex-digit?, compute the code point, and
emit it via char-from-code. Invalid escapes fall through to the
literal-char behaviour. built-ins/String (with --restart-every 1):
65/99 → 68/99. conformance.sh: 148/148.
2026-05-07 12:02:30 +00:00

729 lines
25 KiB
Plaintext

;; lib/js/lexer.sx — JavaScript source → token stream
;;
;; Tokens: {:type T :value V :pos P :nl B}
;; (:nl is true when a line break was skipped just before the token)
;; Types:
;; "number" — numeric literals (decoded into value as number)
;; "string" — string literals (decoded, escape sequences processed)
;; "template"— template literal; value is a plain string when there is no
;; ${...} interpolation, otherwise a list of ("str" text) and
;; ("expr" source) parts
;; "regex" — regex literal; value is {:pattern P :flags F}
;; "ident" — identifier (not a reserved word)
;; "keyword" — reserved word
;; "punct" — ( ) [ ] { } , ; : . ...
;; "op" — all operator tokens (incl. = == === !== < > etc.)
;; "eof" — end of input
;;
;; NOTE: `cond` clauses take exactly ONE body expression — multi-body
;; clauses must wrap their body in `(do ...)`.
;; ── Token constructor ─────────────────────────────────────────────
;; Build a token dict from its three fields.
;;   type  — token type string (see the type list in the file header)
;;   value — decoded token value
;;   pos   — source offset of the token
(define js-make-token (fn (type value pos) {:pos pos :value value :type type}))
;; ── Character predicates ──────────────────────────────────────────
;; True when the one-character string `c` lies in the range "0".."9".
(define
  js-digit?
  (fn
    (c)
    (and (<= "0" c) (<= c "9"))))
;; True when `c` is a hexadecimal digit: 0-9, a-f or A-F.
(define
  js-hex-digit?
  (fn
    (c)
    (cond
      ((js-digit? c) true)
      ((and (<= "a" c) (<= c "f")) true)
      (else (and (<= "A" c) (<= c "F"))))))
;; Numeric value (0-15) of the hexadecimal digit `c`.
;; Any character that is not a hex digit yields 0.
(define
  js-hex-value
  (fn
    (c)
    (if
      (and (<= "0" c) (<= c "9"))
      ;; 48 is the char code of "0"
      (- (char-code c) 48)
      (if
        (and (<= "a" c) (<= c "f"))
        ;; 87 = char code of "a" (97) minus 10
        (- (char-code c) 87)
        (if
          (and (<= "A" c) (<= c "F"))
          ;; 55 = char code of "A" (65) minus 10
          (- (char-code c) 55)
          0)))))
;; True when `c` is an ASCII letter (a-z or A-Z).
(define
  js-letter?
  (fn
    (c)
    (cond
      ((and (<= "a" c) (<= c "z")) true)
      (else (and (<= "A" c) (<= c "Z"))))))
;; True when `c` may begin an identifier: an ASCII letter, "_" or "$".
(define
  js-ident-start?
  (fn
    (c)
    (if
      (js-letter? c)
      true
      (or (= c "_") (= c "$")))))
;; True when `c` may appear inside an identifier: anything accepted by
;; js-ident-start?, or a decimal digit.
(define
  js-ident-char?
  (fn
    (c)
    (if
      (js-ident-start? c)
      true
      (js-digit? c))))
;; ── Whitespace predicate, then reserved words ─────────────────────
;; True when `c` is a space, tab, line feed or carriage return.
(define
  js-ws?
  (fn
    (c)
    (contains? (list " " "\t" "\n" "\r") c)))
;; Words the lexer emits as "keyword" tokens instead of "ident".
;; Consulted by js-keyword?.
(define
  js-keywords
  (list
    "break" "case" "catch" "class" "const" "continue"
    "debugger" "default" "delete" "do" "else" "export"
    "extends" "false" "finally" "for" "function" "if"
    "import" "in" "instanceof" "new" "null" "return"
    "super" "switch" "this" "throw" "true" "try"
    "typeof" "undefined" "var" "void" "while" "with"
    "yield" "let" "static" "async" "await" "of"))
;; ── Keyword lookup, then main tokenizer ───────────────────────────
;; True when `word` is one of the strings listed in js-keywords.
(define js-keyword? (fn (word) (contains? js-keywords word)))
(define
js-tokenize
(fn
(src)
(let
((tokens (list))
(pos 0)
(src-len (len src))
(nl-before false))
;; Character `offset` positions ahead of the current scan position,
;; or nil when that index is at or past the end of the source.
(define
  js-peek
  (fn
    (offset)
    (let
      ((idx (+ pos offset)))
      (if (>= idx src-len) nil (nth src idx)))))
;; Character at the current scan position (js-peek with offset 0);
;; nil once the position has reached the end of the source.
(define cur (fn () (js-peek 0)))
;; Move the scan position forward by `n` characters. No bounds check is
;; made here; callers test pos against src-len themselves.
(define advance! (fn (n) (set! pos (+ pos n))))
;; True when the source text starting at the current position equals `s`.
;; Returns false without slicing when fewer than (len s) characters remain.
(define
  at?
  (fn
    (s)
    (let
      ((end (+ pos (len s))))
      (if
        (> end src-len)
        false
        (= (slice src pos end) s)))))
;; Append a token to `tokens`. Besides type, value and start position the
;; token records the current nl-before flag under :nl.
(define js-emit! (fn (type value start) (append! tokens {:nl nl-before :type type :value value :pos start})))
;; Skip the body of a `//` comment (the caller has already consumed the
;; two slashes). Stops in front of the line terminator so that skip-ws!
;; consumes it and records the line break in nl-before.
;; Both "\n" and "\r" end the comment: a CR-only line ending must not let
;; the comment swallow the following line.
(define
  skip-line-comment!
  (fn
    ()
    (when
      (and
        (< pos src-len)
        (not (= (cur) "\n"))
        (not (= (cur) "\r")))
      (do (advance! 1) (skip-line-comment!)))))
;; Skip the body of a `/* ... */` comment (the caller has already consumed
;; the opening "/*"), including the closing "*/". An unterminated comment
;; simply runs to the end of the source.
;; A line terminator inside the comment sets nl-before, because JavaScript
;; treats a multi-line comment that contains a line break as a line break
;; for automatic semicolon insertion.
(define
  skip-block-comment!
  (fn
    ()
    (cond
      ((>= pos src-len) nil)
      ((and (= (cur) "*") (< (+ pos 1) src-len) (= (js-peek 1) "/"))
        (advance! 2))
      (else
        (do
          (when
            (or (= (cur) "\n") (= (cur) "\r"))
            (set! nl-before true))
          (advance! 1)
          (skip-block-comment!))))))
;; Skip whitespace, `//` comments and `/* */` comments in front of the next
;; token. Sets nl-before when a "\n" or "\r" whitespace character is skipped.
(define
  skip-ws!
  (fn
    ()
    (when
      (< pos src-len)
      (let
        ((c (cur)) (following (js-peek 1)))
        (cond
          ((js-ws? c)
            (do
              (when
                (contains? (list "\n" "\r") c)
                (set! nl-before true))
              (advance! 1)
              (skip-ws!)))
          ;; js-peek yields nil past the end, so a trailing "/" matches neither
          ((and (= c "/") (= following "/"))
            (do (advance! 2) (skip-line-comment!) (skip-ws!)))
          ((and (= c "/") (= following "*"))
            (do (advance! 2) (skip-block-comment!) (skip-ws!)))
          (else nil))))))
;; Consume identifier characters from the current position and return the
;; identifier text, i.e. the source from `start` up to the first character
;; that is not an identifier character.
;; The slice is taken once, after the last character is consumed, and the
;; recursive call is in tail position.
(define
  read-ident
  (fn
    (start)
    (if
      (and (< pos src-len) (js-ident-char? (cur)))
      (do (advance! 1) (read-ident start))
      (slice src start pos))))
;; Advance past a (possibly empty) run of decimal digits.
(define
  read-decimal-digits!
  (fn
    ()
    (cond
      ((>= pos src-len) nil)
      ((js-digit? (cur))
        (do (advance! 1) (read-decimal-digits!)))
      (else nil))))
;; Advance past a (possibly empty) run of hexadecimal digits.
(define
  read-hex-digits!
  (fn
    ()
    (cond
      ((>= pos src-len) nil)
      ((js-hex-digit? (cur))
        (do (advance! 1) (read-hex-digits!)))
      (else nil))))
;; Consume an optional exponent suffix: "e" or "E", an optional sign, then
;; digits. Nothing is consumed unless at least one digit follows the marker
;; (and sign), so a bare trailing "e" is left for the next token.
(define
  read-exp-part!
  (fn
    ()
    (when
      (and (< pos src-len) (contains? (list "e" "E") (cur)))
      (let
        ((nxt (js-peek 1)))
        (let
          ((has-sign (or (= nxt "+") (= nxt "-"))))
          (cond
            ;; e<digit>...
            ((and (not (= nxt nil)) (js-digit? nxt))
              (do (advance! 1) (read-decimal-digits!)))
            ;; e+<digit>... / e-<digit>...
            ((and has-sign (< (+ pos 2) src-len) (js-digit? (js-peek 2)))
              (do (advance! 2) (read-decimal-digits!)))
            (else nil)))))))
;; Read a numeric literal that begins with a digit at `start` and return
;; its value via parse-number.
;; Handles 0x / 0X hexadecimal literals, and decimal literals with an
;; optional fraction (only when a digit follows the ".") and exponent.
(define
  read-number
  (fn
    (start)
    (if
      (and
        (= (cur) "0")
        (< (+ pos 1) src-len)
        (contains? (list "x" "X") (js-peek 1)))
      (do
        (advance! 2)
        (read-hex-digits!)
        ;; the prefix is always handed to parse-number as lowercase "0x"
        (parse-number (str "0x" (slice src (+ start 2) pos))))
      (do
        (read-decimal-digits!)
        (when
          (and
            (< (+ pos 1) src-len)
            (= (cur) ".")
            (js-digit? (js-peek 1)))
          (do (advance! 1) (read-decimal-digits!)))
        (read-exp-part!)
        (parse-number (slice src start pos))))))
;; Read a numeric literal that begins with "." (e.g. `.5`) at `start`.
;; The caller has already checked that a digit follows the dot.
(define
  read-dot-number
  (fn
    (start)
    (do
      ;; step over the leading "."
      (advance! 1)
      (read-decimal-digits!)
      (read-exp-part!)
      (let
        ((text (slice src start pos)))
        (parse-number text)))))
;; Read a quoted string literal whose opening quote (`quote-char`) is at the
;; current position, and return the decoded string. The closing quote is
;; consumed; an unterminated string runs to the end of the source.
;;
;; Escapes decoded: \n \t \r \\ \' \" \` \0 \b \f \v, \xHH, \uHHHH, and a
;; backslash followed by a line break (line continuation, contributes
;; nothing). An incomplete \x or \u escape, and any other escaped
;; character, yields the character itself.
(define
  read-string
  (fn
    (quote-char)
    (let
      ((chars (list)))
      (advance! 1)
      ;; True when the characters at offsets i..n are all hex digits.
      (define
        hex-run?
        (fn
          (i n)
          (cond
            ((> i n) true)
            ((js-hex-digit? (js-peek i)) (hex-run? (+ i 1) n))
            (else false))))
      ;; Value of the hex digits at offsets i..n, accumulated onto acc.
      (define
        hex-run-value
        (fn
          (i n acc)
          (if
            (> i n)
            acc
            (hex-run-value
              (+ i 1)
              n
              (+ (* 16 acc) (js-hex-value (js-peek i)))))))
      ;; Decode an escape letter `ch` followed by exactly n hex digits.
      ;; Leaves pos on the last hex digit; the caller advances past it.
      (define
        read-hex-escape!
        (fn
          (ch n)
          (if
            (and (< (+ pos n) src-len) (hex-run? 1 n))
            (do
              (append! chars (char-from-code (hex-run-value 1 n 0)))
              (advance! n))
            (append! chars ch))))
      (define
        loop
        (fn
          ()
          (cond
            ((>= pos src-len) nil)
            ((= (cur) "\\")
              (do
                (advance! 1)
                (when
                  (< pos src-len)
                  (let
                    ((ch (cur)))
                    (do
                      (cond
                        ((= ch "n") (append! chars "\n"))
                        ((= ch "t") (append! chars "\t"))
                        ((= ch "r") (append! chars "\r"))
                        ((= ch "\\") (append! chars "\\"))
                        ((= ch "'") (append! chars "'"))
                        ((= ch "\"") (append! chars "\""))
                        ((= ch "`") (append! chars "`"))
                        ;; control characters, built from their code points
                        ((= ch "0") (append! chars (char-from-code 0)))
                        ((= ch "b") (append! chars (char-from-code 8)))
                        ((= ch "f") (append! chars (char-from-code 12)))
                        ((= ch "v") (append! chars (char-from-code 11)))
                        ((= ch "u") (read-hex-escape! ch 4))
                        ((= ch "x") (read-hex-escape! ch 2))
                        ;; line continuation: backslash + line break adds nothing
                        ((= ch "\n") nil)
                        ((= ch "\r")
                          (when (= (js-peek 1) "\n") (advance! 1)))
                        (else (append! chars ch)))
                      (advance! 1))))
                (loop)))
            ((= (cur) quote-char) (advance! 1))
            (else
              (do (append! chars (cur)) (advance! 1) (loop))))))
      (loop)
      (join "" chars))))
;; Read a template literal whose opening backtick is at the current
;; position. The closing backtick is consumed; an unterminated template
;; runs to the end of the source.
;;
;; Returns "" for an empty template, a plain string when the template has
;; no ${...} interpolation, and otherwise a list of ("str" text) and
;; ("expr" source) parts in source order. The source of each ${...}
;; expression is captured as raw text, not tokenized here.
;;
;; Escapes in the text portions are decoded the same way as in string
;; literals (\n \t \r \\ \' \" \` \$ \0 \b \f \v, \xHH, \uHHHH, line
;; continuation); any other escaped character yields itself.
(define
  read-template
  (fn
    ()
    (let
      ((parts (list)) (chars (list)))
      (advance! 1)
      ;; Move the pending literal characters into parts as one "str" part.
      (define
        flush-chars!
        (fn
          ()
          (when
            (> (len chars) 0)
            (do
              (append! parts (list "str" (join "" chars)))
              (set! chars (list))))))
      ;; Collect the raw text of a ${...} expression up to its matching "}",
      ;; tracking nested braces and skipping over quoted strings.
      (define
        read-expr-source!
        (fn
          ()
          (let
            ((buf (list)) (depth 1))
            (define
              expr-loop
              (fn
                ()
                (cond
                  ((>= pos src-len) nil)
                  ((and (= (cur) "}") (= depth 1))
                    (advance! 1))
                  ((= (cur) "}")
                    (do
                      (append! buf (cur))
                      (set! depth (- depth 1))
                      (advance! 1)
                      (expr-loop)))
                  ((= (cur) "{")
                    (do
                      (append! buf (cur))
                      (set! depth (+ depth 1))
                      (advance! 1)
                      (expr-loop)))
                  ((or (= (cur) "\"") (= (cur) "'"))
                    (let
                      ((q (cur)))
                      (do
                        (append! buf q)
                        (advance! 1)
                        ;; copy the quoted string verbatim, escapes included
                        (define
                          sloop
                          (fn
                            ()
                            (cond
                              ((>= pos src-len) nil)
                              ((= (cur) "\\")
                                (do
                                  (append! buf (cur))
                                  (advance! 1)
                                  (when
                                    (< pos src-len)
                                    (do
                                      (append! buf (cur))
                                      (advance! 1)))
                                  (sloop)))
                              ((= (cur) q)
                                (do
                                  (append! buf (cur))
                                  (advance! 1)))
                              (else
                                (do
                                  (append! buf (cur))
                                  (advance! 1)
                                  (sloop))))))
                        (sloop)
                        (expr-loop))))
                  (else
                    (do
                      (append! buf (cur))
                      (advance! 1)
                      (expr-loop))))))
            (expr-loop)
            (join "" buf))))
      ;; True when the characters at offsets i..n are all hex digits.
      (define
        hex-run?
        (fn
          (i n)
          (cond
            ((> i n) true)
            ((js-hex-digit? (js-peek i)) (hex-run? (+ i 1) n))
            (else false))))
      ;; Value of the hex digits at offsets i..n, accumulated onto acc.
      (define
        hex-run-value
        (fn
          (i n acc)
          (if
            (> i n)
            acc
            (hex-run-value
              (+ i 1)
              n
              (+ (* 16 acc) (js-hex-value (js-peek i)))))))
      ;; Decode an escape letter `ch` followed by exactly n hex digits.
      ;; Leaves pos on the last hex digit; the caller advances past it.
      (define
        read-hex-escape!
        (fn
          (ch n)
          (if
            (and (< (+ pos n) src-len) (hex-run? 1 n))
            (do
              (append! chars (char-from-code (hex-run-value 1 n 0)))
              (advance! n))
            (append! chars ch))))
      (define
        loop
        (fn
          ()
          (cond
            ((>= pos src-len) nil)
            ((= (cur) "`") (advance! 1))
            ((and (= (cur) "$") (< (+ pos 1) src-len) (= (js-peek 1) "{"))
              (do
                (flush-chars!)
                (advance! 2)
                ;; named expr-src so the tokenizer's own `src` is not shadowed
                (let
                  ((expr-src (read-expr-source!)))
                  (append! parts (list "expr" expr-src)))
                (loop)))
            ((= (cur) "\\")
              (do
                (advance! 1)
                (when
                  (< pos src-len)
                  (let
                    ((ch (cur)))
                    (do
                      (cond
                        ((= ch "n") (append! chars "\n"))
                        ((= ch "t") (append! chars "\t"))
                        ((= ch "r") (append! chars "\r"))
                        ((= ch "\\") (append! chars "\\"))
                        ((= ch "'") (append! chars "'"))
                        ((= ch "\"") (append! chars "\""))
                        ((= ch "`") (append! chars "`"))
                        ((= ch "$") (append! chars "$"))
                        ;; control characters, built from their code points
                        ((= ch "0") (append! chars (char-from-code 0)))
                        ((= ch "b") (append! chars (char-from-code 8)))
                        ((= ch "f") (append! chars (char-from-code 12)))
                        ((= ch "v") (append! chars (char-from-code 11)))
                        ((= ch "u") (read-hex-escape! ch 4))
                        ((= ch "x") (read-hex-escape! ch 2))
                        ;; line continuation: backslash + line break adds nothing
                        ((= ch "\n") nil)
                        ((= ch "\r")
                          (when (= (js-peek 1) "\n") (advance! 1)))
                        (else (append! chars ch)))
                      (advance! 1))))
                (loop)))
            (else
              (do (append! chars (cur)) (advance! 1) (loop))))))
      (loop)
      (flush-chars!)
      (if
        (= (len parts) 0)
        ""
        (if
          (and
            (= (len parts) 1)
            (= (nth (nth parts 0) 0) "str"))
          (nth (nth parts 0) 1)
          parts)))))
;; Decide whether a "/" at the current position starts a regex literal,
;; judging by the most recently emitted token:
;;   - no tokens yet              → regex
;;   - any operator               → regex
;;   - punctuation except ) and ] → regex
;;   - one of the keywords listed below → regex
;;   - anything else              → not a regex (division)
(define
  js-regex-context?
  (fn
    ()
    (let
      ((count (len tokens)))
      (if
        (= count 0)
        true
        (let
          ((prev (nth tokens (- count 1))))
          (let
            ((kind (dict-get prev "type")) (text (dict-get prev "value")))
            (cond
              ((= kind "op") true)
              ((= kind "punct")
                (not (or (= text ")") (= text "]"))))
              ((= kind "keyword")
                (contains?
                  (list
                    "return" "typeof" "in" "of" "throw" "new" "delete"
                    "instanceof" "void" "yield" "await" "case" "do" "else")
                  text))
              (else false))))))))
;; Read a regex literal whose opening "/" is at the current position.
;; Returns {:flags F :pattern P}: the pattern text between the slashes
;; (escapes kept verbatim) and the run of identifier characters that
;; follows the closing slash. A "/" inside a [...] class does not end the
;; pattern. An unterminated regex runs to the end of the source.
(define
  read-regex
  (fn
    ()
    (let
      ((pattern-chars (list)) (inside-class false) (flag-chars (list)))
      (advance! 1)
      ;; Copy the current character into the pattern and step past it.
      (define
        take!
        (fn
          ()
          (begin (append! pattern-chars (cur)) (advance! 1))))
      (define
        body-loop
        (fn
          ()
          (when
            (< pos src-len)
            (let
              ((c (cur)))
              (cond
                ;; backslash: keep it and the escaped character as-is
                ((= c "\\")
                  (begin
                    (take!)
                    (when (< pos src-len) (take!))
                    (body-loop)))
                ;; "[" opens a character class, "]" closes it
                ((or (= c "[") (= c "]"))
                  (begin
                    (set! inside-class (= c "["))
                    (take!)
                    (body-loop)))
                ((and (= c "/") (not inside-class))
                  (advance! 1))
                (else (begin (take!) (body-loop))))))))
      (define
        flags-loop
        (fn
          ()
          (when
            (and (< pos src-len) (js-ident-char? (cur)))
            (begin
              (append! flag-chars (cur))
              (advance! 1)
              (flags-loop)))))
      (body-loop)
      (flags-loop)
      {:flags (join "" flag-chars) :pattern (join "" pattern-chars)})))
;; Try to match the only four-character operator, ">>>=", at the current
;; position. On a match, emit it as an "op" token, consume it and return
;; true; otherwise return false and consume nothing.
(define
  try-op-4!
  (fn
    (start)
    (if
      (at? ">>>=")
      (do
        (js-emit! "op" ">>>=" start)
        (advance! 4)
        true)
      false)))
;; Try to match a three-character token at the current position. On a
;; match, emit it ("..." as "punct", everything else as "op"), consume it
;; and return true; otherwise return false and consume nothing.
(define
  try-op-3!
  (fn
    (start)
    (let
      ((candidates
         (list "===" "!==" "**=" "<<=" ">>=" ">>>" "&&=" "||=" "??=" "...")))
      ;; Test candidates in order, starting at index i.
      (define
        try-from
        (fn
          (i)
          (if
            (>= i (len candidates))
            false
            (let
              ((text (nth candidates i)))
              (if
                (at? text)
                (do
                  (js-emit! (if (= text "...") "punct" "op") text start)
                  (advance! 3)
                  true)
                (try-from (+ i 1)))))))
      (try-from 0))))
;; Try to match a two-character operator at the current position. On a
;; match, emit it as an "op" token, consume it and return true; otherwise
;; return false and consume nothing.
(define
  try-op-2!
  (fn
    (start)
    (let
      ((candidates
         (list
           "==" "!=" "<=" ">=" "&&" "||" "??" "=>" "**" "<<" ">>"
           "++" "--" "+=" "-=" "*=" "/=" "%=" "&=" "|=" "^=" "?.")))
      ;; Test candidates in order, starting at index i.
      (define
        try-from
        (fn
          (i)
          (if
            (>= i (len candidates))
            false
            (let
              ((text (nth candidates i)))
              (if
                (at? text)
                (do
                  (js-emit! "op" text start)
                  (advance! 2)
                  true)
                (try-from (+ i 1)))))))
      (try-from 0))))
;; Emit the single character `ch` as a "punct" or "op" token and consume
;; it. A character in neither table is consumed without emitting a token.
(define
  emit-one-op!
  (fn
    (ch start)
    (do
      (cond
        ((contains? (list "(" ")" "[" "]" "{" "}" "," ";" ":" ".") ch)
          (js-emit! "punct" ch start))
        ((contains?
           (list "?" "+" "-" "*" "/" "%" "=" "<" ">" "!" "&" "|" "^" "~")
           ch)
          (js-emit! "op" ch start))
        (else nil))
      (advance! 1))))
;; Main scanning loop: skip whitespace and comments, read one token at the
;; current position, then repeat until the source is exhausted.
;; nl-before is cleared before each token so that it reflects only the
;; whitespace skipped immediately in front of that token.
(define
  scan!
  (fn
    ()
    (do
      (set! nl-before false)
      (skip-ws!)
      (when
        (< pos src-len)
        (let
          ((ch (cur)) (start pos))
          (do
            ;; consume exactly one token (or one unrecognised character)
            (cond
              ((or (= ch "\"") (= ch "'"))
                (js-emit! "string" (read-string ch) start))
              ((= ch "`")
                (js-emit! "template" (read-template) start))
              ((js-digit? ch)
                (js-emit! "number" (read-number start) start))
              ((and (= ch ".") (< (+ pos 1) src-len) (js-digit? (js-peek 1)))
                (js-emit! "number" (read-dot-number start) start))
              ((js-ident-start? ch)
                (let
                  ((word (read-ident start)))
                  (js-emit!
                    (if (js-keyword? word) "keyword" "ident")
                    word
                    start)))
              ((and (= ch "/") (js-regex-context?))
                (js-emit! "regex" (read-regex) start))
              ;; longest operators first; each try-op emits and consumes on success
              ((try-op-4! start) nil)
              ((try-op-3! start) nil)
              ((try-op-2! start) nil)
              (else (emit-one-op! ch start)))
            (scan!)))))))
(scan!)
(js-emit! "eof" nil pos)
tokens)))