HS E37: tokenizer-as-API 17/17 (+fixes)
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 16s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 16s
- runtime.sx: fix extra ")" in hs-tokens-of (parse error); add hs-eof-sentinel, hs-raw->api-token, hs-normalize-raw-tokens, hs-tokens-of, stream helpers, hs-token-type/value/op?; add \$ escape to hs-template
- tokenizer.sx: fix read-number double-dot bug ("1.1.1" now yields 3 tokens); fix t-emit! eof call (3 -> 2 args); add bare-$ case to scan-template!
- compiler.sx: add \$ escape to tpl-collect template interpolation
- generate-sx-tests.py: preserve \$ in process_hs_val; add generate_tokenizer_test
- regenerate spec/tests/test-hyperscript-behavioral.sx: 17 tokenizer tests generated
- plans/hs-conformance-to-100.md: row 37 marked done (+17)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -893,6 +893,12 @@
|
||||
(let
|
||||
((ch (nth raw i)))
|
||||
(if
|
||||
(and (= ch "\\") (< (+ i 1) n) (= (nth raw (+ i 1)) "$"))
|
||||
(do
|
||||
(set! buf (str buf "$"))
|
||||
(set! i (+ i 2))
|
||||
(tpl-collect))
|
||||
(if
|
||||
(and (= ch "$") (< (+ i 1) n))
|
||||
(if
|
||||
(= (nth raw (+ i 1)) "{")
|
||||
@@ -931,7 +937,7 @@
|
||||
(do
|
||||
(set! buf (str buf ch))
|
||||
(set! i (+ i 1))
|
||||
(tpl-collect)))))))
|
||||
(tpl-collect))))))))
|
||||
(tpl-collect)
|
||||
(tpl-flush)
|
||||
(cons (quote str) parts))))
|
||||
|
||||
@@ -2021,6 +2021,12 @@
|
||||
(let
|
||||
((ch (nth raw i)))
|
||||
(if
|
||||
(and (= ch "\\") (< (+ i 1) n) (= (nth raw (+ i 1)) "$"))
|
||||
(do
|
||||
(set! result (str result "$"))
|
||||
(set! i (+ i 2))
|
||||
(tpl-loop))
|
||||
(if
|
||||
(and (= ch "$") (< (+ i 1) n))
|
||||
(if
|
||||
(= (nth raw (+ i 1)) "{")
|
||||
@@ -2089,7 +2095,7 @@
|
||||
(do
|
||||
(set! result (str result ch))
|
||||
(set! i (+ i 1))
|
||||
(tpl-loop)))))))
|
||||
(tpl-loop))))))))
|
||||
(do (tpl-loop) result))))
|
||||
|
||||
(define
|
||||
@@ -2525,3 +2531,188 @@
|
||||
(fn
|
||||
(fn-name args)
|
||||
(let ((fn (host-global fn-name))) (if fn (host-call-fn fn args) nil))))
|
||||
|
||||
;; ── E37 Tokenizer-as-API ─────────────────────────────────────────────
|
||||
|
||||
;; Sentinel token returned when a token stream is exhausted.
(define hs-eof-sentinel
  (fn () {:type "EOF" :value "<<<EOF>>>" :op false}))
|
||||
|
||||
;; Map a single operator lexeme to its upstream API token-type name.
;; Any operator not listed falls through to "OP_<lexeme>".
(define
  hs-op-type
  (fn
    (v)
    (cond
      ;; multi-character comparison operators first (exact-match cond,
      ;; so ordering is cosmetic — keys are all distinct)
      ((= v "===") "EQQ")
      ((= v "==") "EQ")
      ((= v "<=") "LTE_ANG")
      ((= v ">=") "GTE_ANG")
      ;; single-character operators
      ((= v "+") "PLUS")
      ((= v "-") "MINUS")
      ((= v "*") "MULTIPLY")
      ((= v "/") "SLASH")
      ((= v "%") "PERCENT")
      ((= v "|") "PIPE")
      ((= v "!") "EXCLAMATION")
      ((= v "?") "QUESTION")
      ((= v "#") "POUND")
      ((= v "&") "AMPERSAND")
      ((= v ";") "SEMI")
      ((= v "=") "EQUALS")
      ((= v "<") "L_ANG")
      ((= v ">") "R_ANG")
      ((= v "\\") "BACKSLASH")
      (true (str "OP_" v)))))
|
||||
|
||||
;; Convert one raw tokenizer token into the public API token shape
;; {:type UPPER_NAME :value display-value :op bool}. class/id refs
;; regain their sigil in the value; "op" tokens dispatch to hs-op-type.
(define
  hs-raw->api-token
  (fn
    (tok)
    (let
      ((typ (get tok "type"))
       (val (get tok "value")))
      (let
        ((is-op
          ;; punctuation and operator raw types are flagged :op true
          (or
            (= typ "paren-open")
            (= typ "paren-close")
            (= typ "bracket-open")
            (= typ "bracket-close")
            (= typ "brace-open")
            (= typ "brace-close")
            (= typ "comma")
            (= typ "dot")
            (= typ "colon")
            (= typ "op")))
         (api-type
          (cond
            ((or (= typ "ident") (= typ "keyword")) "IDENTIFIER")
            ((= typ "number") "NUMBER")
            ((= typ "string") "STRING")
            ((= typ "class") "CLASS_REF")
            ((= typ "id") "ID_REF")
            ((= typ "attr") "ATTRIBUTE_REF")
            ((= typ "style") "STYLE_REF")
            ((= typ "selector") "QUERY_REF")
            ((= typ "eof") "EOF")
            ((= typ "paren-open") "L_PAREN")
            ((= typ "paren-close") "R_PAREN")
            ((= typ "bracket-open") "L_BRACKET")
            ((= typ "bracket-close") "R_BRACKET")
            ((= typ "brace-open") "L_BRACE")
            ((= typ "brace-close") "R_BRACE")
            ((= typ "comma") "COMMA")
            ((= typ "dot") "PERIOD")
            ((= typ "colon") "COLON")
            ((= typ "op") (hs-op-type val))
            (true (str "UNKNOWN_" typ))))
         (api-val
          (cond
            ((= typ "class") (str "." val))
            ((= typ "id") (str "#" val))
            ((= typ "eof") "<<<EOF>>>")
            (true val))))
        {:type api-type :value api-val :op is-op}))))
|
||||
|
||||
;; Expand "class" and "id" tokens that follow a closing bracket into
;; separate dot/# + ident tokens, matching upstream context-sensitive
;; behaviour: after ) ] } the dot is property access, not a CLASS_REF.
(define
  hs-normalize-raw-tokens
  (fn
    (raw-real)
    (let
      ((result (list))
       (prev-type nil))
      (for-each
        (fn
          (tok)
          (let
            ((tok-type (get tok "type"))
             (tok-val (get tok "value"))
             (tok-pos (get tok "pos")))
            (if
              (and
                (or (= tok-type "class") (= tok-type "id"))
                (or
                  (= prev-type "paren-close")
                  (= prev-type "bracket-close")
                  (= prev-type "brace-close")))
              ;; split ".foo" into dot + ident, "#foo" into # op + ident;
              ;; the synthesized ident sits one position past the sigil
              (do
                (if
                  (= tok-type "class")
                  (do
                    (append! result {:type "dot" :value "." :pos tok-pos})
                    (append! result {:type "ident" :value tok-val :pos (+ tok-pos 1)}))
                  (do
                    (append! result {:type "op" :value "#" :pos tok-pos})
                    (append! result {:type "ident" :value tok-val :pos (+ tok-pos 1)})))
                (set! prev-type "ident"))
              ;; pass-through: keep the token, remember its type
              (do
                (append! result tok)
                (set! prev-type tok-type)))))
        raw-real)
      result)))
|
||||
|
||||
;; Public tokenizer entry point. (hs-tokens-of src) tokenizes normal
;; hyperscript source; (hs-tokens-of src :template) uses template mode.
;; Returns a mutable token-stream dict {:source src :list api-toks :pos 0}.
(define
  hs-tokens-of
  (fn
    (src &rest rest)
    (let
      ((template? (and (> (len rest) 0) (= (first rest) :template)))
       (raw (if template? (hs-tokenize-template src) (hs-tokenize src))))
      (if
        template?
        ;; template mode: raw tokens map straight through
        {:source src :list (map hs-raw->api-token raw) :pos 0}
        ;; normal mode: drop EOF, context-normalise, then append a
        ;; WHITESPACE sentinel when trailing whitespace follows the
        ;; last real token
        (let
          ((no-eof (filter (fn (t) (not (= (get t "type") "eof"))) raw)))
          (let
            ((normed (hs-normalize-raw-tokens no-eof)))
            (let
              ((api-toks (map hs-raw->api-token normed)))
              (let
                ((final-list
                  (if
                    (and
                      (> (len normed) 0)
                      ;; is the character just past the last token whitespace?
                      (let
                        ((last-tok (nth normed (- (len normed) 1))))
                        (let
                          ((end-pos
                            (+ (get last-tok "pos")
                               (len (get last-tok "value")))))
                          (and
                            (< end-pos (len src))
                            (hs-ws? (nth src end-pos))))))
                    (append api-toks (list {:type "WHITESPACE" :value " " :op false}))
                    api-toks)))
                {:source src :list final-list :pos 0}))))))))
|
||||
|
||||
|
||||
;; Peek at the token i positions ahead of the stream cursor. Looking
;; past the end yields the EOF sentinel rather than nil.
(define
  hs-stream-token
  (fn
    (s i)
    (let
      ((toks (get s "list"))
       (cursor (get s "pos")))
      (or (nth toks (+ cursor i)) (hs-eof-sentinel)))))
|
||||
|
||||
;; Return the current token and advance the cursor by one — unless the
;; current token is EOF, in which case the cursor stays put so repeated
;; consumption at end-of-stream is safe.
(define
  hs-stream-consume
  (fn
    (s)
    (let
      ((cur (hs-stream-token s 0)))
      (when
        (not (= (get cur "type") "EOF"))
        (dict-set! s "pos" (+ (get s "pos") 1)))
      cur)))
|
||||
|
||||
;; True while the stream cursor has not yet reached the EOF sentinel.
(define
  hs-stream-has-more
  (fn
    (s)
    (not (= (get (hs-stream-token s 0) "type") "EOF"))))
|
||||
|
||||
;; Field accessors for API tokens ({:type ... :value ... :op ...}).
(define hs-token-type (fn (t) (get t "type")))
(define hs-token-value (fn (t) (get t "value")))
(define hs-token-op? (fn (t) (get t "op")))
|
||||
|
||||
@@ -28,6 +28,27 @@
|
||||
|
||||
;; True for the four whitespace characters the tokenizer skips.
(define hs-ws?
  (fn (ch)
    (or (= ch " ") (= ch "\t") (= ch "\n") (= ch "\r"))))
|
||||
|
||||
;; True when ch is an ASCII hexadecimal digit (0-9, A-F, a-f).
(define
  hs-hex-digit?
  (fn
    (ch)
    (or
      (and (>= ch "0") (<= ch "9"))
      (and (>= ch "A") (<= ch "F"))
      (and (>= ch "a") (<= ch "f")))))
|
||||
|
||||
;; Numeric value (0-15) of a hex digit character; 0 for anything else.
(define
  hs-hex-val
  (fn
    (ch)
    (let
      ((cc (char-code ch)))
      (cond
        ((and (>= cc 48) (<= cc 57)) (- cc 48))   ;; '0'..'9'
        ((and (>= cc 65) (<= cc 70)) (- cc 55))   ;; 'A'..'F'
        ((and (>= cc 97) (<= cc 102)) (- cc 87))  ;; 'a'..'f'
        (true 0)))))
|
||||
|
||||
;; ── Keyword set ───────────────────────────────────────────────────
|
||||
|
||||
(define
|
||||
@@ -235,10 +256,15 @@
|
||||
read-number
|
||||
(fn
|
||||
(start)
|
||||
(when
|
||||
(and (< pos src-len) (hs-digit? (hs-cur)))
|
||||
(hs-advance! 1)
|
||||
(read-number start))
|
||||
(define
|
||||
read-int
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (hs-digit? (hs-cur)))
|
||||
(hs-advance! 1)
|
||||
(read-int))))
|
||||
(read-int)
|
||||
(when
|
||||
(and
|
||||
(< pos src-len)
|
||||
@@ -246,15 +272,7 @@
|
||||
(< (+ pos 1) src-len)
|
||||
(hs-digit? (hs-peek 1)))
|
||||
(hs-advance! 1)
|
||||
(define
|
||||
read-frac
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (hs-digit? (hs-cur)))
|
||||
(hs-advance! 1)
|
||||
(read-frac))))
|
||||
(read-frac))
|
||||
(read-int))
|
||||
(do
|
||||
(when
|
||||
(and
|
||||
@@ -272,15 +290,7 @@
|
||||
(< pos src-len)
|
||||
(or (= (hs-cur) "+") (= (hs-cur) "-")))
|
||||
(hs-advance! 1))
|
||||
(define
|
||||
read-exp-digits
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (hs-digit? (hs-cur)))
|
||||
(hs-advance! 1)
|
||||
(read-exp-digits))))
|
||||
(read-exp-digits))
|
||||
(read-int))
|
||||
(let
|
||||
((num-end pos))
|
||||
(when
|
||||
@@ -308,7 +318,7 @@
|
||||
()
|
||||
(cond
|
||||
(>= pos src-len)
|
||||
nil
|
||||
(error "Unterminated string")
|
||||
(= (hs-cur) "\\")
|
||||
(do
|
||||
(hs-advance! 1)
|
||||
@@ -318,15 +328,37 @@
|
||||
((ch (hs-cur)))
|
||||
(cond
|
||||
(= ch "n")
|
||||
(append! chars "\n")
|
||||
(do (append! chars "\n") (hs-advance! 1))
|
||||
(= ch "t")
|
||||
(append! chars "\t")
|
||||
(do (append! chars "\t") (hs-advance! 1))
|
||||
(= ch "r")
|
||||
(do (append! chars "\r") (hs-advance! 1))
|
||||
(= ch "b")
|
||||
(do (append! chars (char-from-code 8)) (hs-advance! 1))
|
||||
(= ch "f")
|
||||
(do (append! chars (char-from-code 12)) (hs-advance! 1))
|
||||
(= ch "v")
|
||||
(do (append! chars (char-from-code 11)) (hs-advance! 1))
|
||||
(= ch "\\")
|
||||
(append! chars "\\")
|
||||
(do (append! chars "\\") (hs-advance! 1))
|
||||
(= ch quote-char)
|
||||
(append! chars quote-char)
|
||||
:else (do (append! chars "\\") (append! chars ch)))
|
||||
(hs-advance! 1)))
|
||||
(do (append! chars quote-char) (hs-advance! 1))
|
||||
(= ch "x")
|
||||
(do
|
||||
(hs-advance! 1)
|
||||
(if
|
||||
(and
|
||||
(< (+ pos 1) src-len)
|
||||
(hs-hex-digit? (hs-cur))
|
||||
(hs-hex-digit? (hs-peek 1)))
|
||||
(let
|
||||
((d1 (hs-hex-val (hs-cur)))
|
||||
(d2 (hs-hex-val (hs-peek 1))))
|
||||
(append! chars (char-from-code (+ (* d1 16) d2)))
|
||||
(hs-advance! 2))
|
||||
(error "Invalid hexadecimal escape: \\x")))
|
||||
:else
|
||||
(do (append! chars "\\") (append! chars ch) (hs-advance! 1)))))
|
||||
(loop))
|
||||
(= (hs-cur) quote-char)
|
||||
(hs-advance! 1)
|
||||
@@ -620,7 +652,82 @@
|
||||
(do (hs-emit! "colon" ":" start) (hs-advance! 1) (scan!))
|
||||
(= ch "|")
|
||||
(do (hs-emit! "op" "|" start) (hs-advance! 1) (scan!))
|
||||
(= ch "&")
|
||||
(do (hs-emit! "op" "&" start) (hs-advance! 1) (scan!))
|
||||
(= ch "#")
|
||||
(do (hs-emit! "op" "#" start) (hs-advance! 1) (scan!))
|
||||
(= ch "?")
|
||||
(do (hs-emit! "op" "?" start) (hs-advance! 1) (scan!))
|
||||
(= ch ";")
|
||||
(do (hs-emit! "op" ";" start) (hs-advance! 1) (scan!))
|
||||
:else (do (hs-advance! 1) (scan!)))))))
|
||||
(scan!)
|
||||
(hs-emit! "eof" nil pos)
|
||||
tokens)))
|
||||
|
||||
;; ── Template-mode tokenizer (E37 API) ────────────────────────────────
;; Used by hs-tokens-of when the :template flag is set.
;; Emits outer " chars as single STRING tokens; ${ ... } as $ { <inner> };
;; inner content is tokenized with the regular hs-tokenize. All other
;; characters (including whitespace) are skipped.

(define
  hs-tokenize-template
  (fn
    (src)
    (let
      ((tokens (list)) (pos 0) (src-len (len src)))
      ;; local cursor helpers over src
      (define t-cur (fn () (if (< pos src-len) (nth src pos) nil)))
      (define t-peek (fn (n) (if (< (+ pos n) src-len) (nth src (+ pos n)) nil)))
      (define t-advance! (fn (n) (set! pos (+ pos n))))
      (define t-emit! (fn (type value) (append! tokens (hs-make-token type value pos))))
      ;; Skip forward to the } matching the already-consumed opening
      ;; brace, tracking nesting depth; stops ON the final closing brace.
      (define
        scan-to-close!
        (fn
          (depth)
          (when
            (and (< pos src-len) (> depth 0))
            (cond
              (= (t-cur) "{")
              (do (t-advance! 1) (scan-to-close! (+ depth 1)))
              (= (t-cur) "}")
              (when (> (- depth 1) 0) (t-advance! 1) (scan-to-close! (- depth 1)))
              :else (do (t-advance! 1) (scan-to-close! depth))))))
      (define
        scan-template!
        (fn
          ()
          (when
            (< pos src-len)
            (let
              ((ch (t-cur)))
              (cond
                ;; outer quote character: one STRING token
                (= ch "\"")
                (do (t-emit! "string" "\"") (t-advance! 1) (scan-template!))
                ;; ${ ... } interpolation
                (and (= ch "$") (= (t-peek 1) "{"))
                (do
                  (t-emit! "op" "$")
                  (t-advance! 1)
                  (t-emit! "brace-open" "{")
                  (t-advance! 1)
                  (let
                    ((inner-start pos))
                    (scan-to-close! 1)
                    ;; hand the interpolated slice to the normal tokenizer,
                    ;; splicing its tokens in (minus the trailing eof)
                    (let
                      ((inner-src (slice src inner-start pos))
                       (inner-toks (hs-tokenize inner-src)))
                      (for-each
                        (fn (tok)
                          (when (not (= (get tok "type") "eof"))
                            (append! tokens tok)))
                        inner-toks))
                    (t-emit! "brace-close" "}")
                    (when (< pos src-len) (t-advance! 1)))
                  (scan-template!))
                ;; bare $ not followed by {
                (= ch "$")
                (do (t-emit! "op" "$") (t-advance! 1) (scan-template!))
                ;; everything else — whitespace included — is skipped
                ;; (the original's separate hs-ws? branch performed the
                ;; identical action as :else, so the two are merged here)
                :else (do (t-advance! 1) (scan-template!)))))))
      (scan-template!)
      (t-emit! "eof" nil)
      tokens)))
|
||||
Reference in New Issue
Block a user