Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 52s
Idents, ctors, 51 keywords, numbers (int/float/hex/exp/underscored), strings + chars with escapes, type variables, 26 op/punct tokens, and nested (* ... *) block comments. Tests via epoch protocol against sx_server.exe.
383 lines
12 KiB
Plaintext
383 lines
12 KiB
Plaintext
;; lib/ocaml/tokenizer.sx — OCaml lexer.
|
|
;;
|
|
;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, op, eof.
|
|
;; Token shape: {:type :value :pos} via lex-make-token.
|
|
;; OCaml is not indentation-sensitive — no layout pass.
|
|
;; Block comments (* ... *) nest. There is no line-comment syntax.
|
|
|
|
(prefix-rename
|
|
"ocaml-"
|
|
(quote
|
|
((make-token lex-make-token)
|
|
(digit? lex-digit?)
|
|
(hex-digit? lex-hex-digit?)
|
|
(alpha? lex-alpha?)
|
|
(alnum? lex-alnum?)
|
|
(ident-start? lex-ident-start?)
|
|
(ident-char? lex-ident-char?)
|
|
(ws? lex-whitespace?))))
|
|
|
|
(define
|
|
ocaml-keywords
|
|
(list
|
|
"and"
|
|
"as"
|
|
"assert"
|
|
"begin"
|
|
"class"
|
|
"constraint"
|
|
"do"
|
|
"done"
|
|
"downto"
|
|
"else"
|
|
"end"
|
|
"exception"
|
|
"external"
|
|
"false"
|
|
"for"
|
|
"fun"
|
|
"function"
|
|
"functor"
|
|
"if"
|
|
"in"
|
|
"include"
|
|
"inherit"
|
|
"initializer"
|
|
"lazy"
|
|
"let"
|
|
"match"
|
|
"method"
|
|
"module"
|
|
"mutable"
|
|
"new"
|
|
"nonrec"
|
|
"object"
|
|
"of"
|
|
"open"
|
|
"or"
|
|
"private"
|
|
"rec"
|
|
"sig"
|
|
"struct"
|
|
"then"
|
|
"to"
|
|
"true"
|
|
"try"
|
|
"type"
|
|
"val"
|
|
"virtual"
|
|
"when"
|
|
"while"
|
|
"with"
|
|
"land"
|
|
"lor"
|
|
"lxor"
|
|
"lsl"
|
|
"lsr"
|
|
"asr"
|
|
"mod"))
|
|
|
|
(define ocaml-keyword? (fn (word) (contains? ocaml-keywords word)))
|
|
|
|
(define
|
|
ocaml-upper?
|
|
(fn (c) (and (not (= c nil)) (>= c "A") (<= c "Z"))))
|
|
|
|
(define
|
|
ocaml-tokenize
|
|
(fn
|
|
(src)
|
|
(let
|
|
((tokens (list)) (pos 0) (src-len (len src)))
|
|
(define
|
|
ocaml-peek
|
|
(fn
|
|
(offset)
|
|
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
|
|
(define cur (fn () (ocaml-peek 0)))
|
|
(define advance! (fn (n) (set! pos (+ pos n))))
|
|
(define
|
|
push!
|
|
(fn
|
|
(type value start)
|
|
(append! tokens (ocaml-make-token type value start))))
|
|
(define
|
|
skip-block-comment!
|
|
(fn
|
|
(depth)
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((and (= (cur) "*") (= (ocaml-peek 1) ")"))
|
|
(begin
|
|
(advance! 2)
|
|
(when
|
|
(> depth 1)
|
|
(skip-block-comment! (- depth 1)))))
|
|
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
|
|
(begin
|
|
(advance! 2)
|
|
(skip-block-comment! (+ depth 1))))
|
|
(else (begin (advance! 1) (skip-block-comment! depth))))))
|
|
(define
|
|
skip-ws!
|
|
(fn
|
|
()
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((ocaml-ws? (cur)) (begin (advance! 1) (skip-ws!)))
|
|
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
|
|
(begin
|
|
(advance! 2)
|
|
(skip-block-comment! 1)
|
|
(skip-ws!)))
|
|
(else nil))))
|
|
(define
|
|
read-ident
|
|
(fn
|
|
(start)
|
|
(begin
|
|
(when
|
|
(and (< pos src-len) (ocaml-ident-char? (cur)))
|
|
(begin (advance! 1) (read-ident start)))
|
|
(when
|
|
(and (< pos src-len) (= (cur) "'"))
|
|
(begin (advance! 1) (read-ident start)))
|
|
(slice src start pos))))
|
|
(define
|
|
read-decimal-digits!
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (or (ocaml-digit? (cur)) (= (cur) "_")))
|
|
(begin (advance! 1) (read-decimal-digits!)))))
|
|
(define
|
|
read-hex-digits!
|
|
(fn
|
|
()
|
|
(when
|
|
(and
|
|
(< pos src-len)
|
|
(or (ocaml-hex-digit? (cur)) (= (cur) "_")))
|
|
(begin (advance! 1) (read-hex-digits!)))))
|
|
(define
|
|
read-exp-part!
|
|
(fn
|
|
()
|
|
(when
|
|
(and (< pos src-len) (or (= (cur) "e") (= (cur) "E")))
|
|
(let
|
|
((p1 (ocaml-peek 1)))
|
|
(when
|
|
(or
|
|
(and (not (= p1 nil)) (ocaml-digit? p1))
|
|
(and
|
|
(or (= p1 "+") (= p1 "-"))
|
|
(< (+ pos 2) src-len)
|
|
(ocaml-digit? (ocaml-peek 2))))
|
|
(begin
|
|
(advance! 1)
|
|
(when
|
|
(and
|
|
(< pos src-len)
|
|
(or (= (cur) "+") (= (cur) "-")))
|
|
(advance! 1))
|
|
(read-decimal-digits!)))))))
|
|
(define
|
|
strip-underscores
|
|
(fn
|
|
(s)
|
|
(let
|
|
((out (list)) (i 0) (n (len s)))
|
|
(begin
|
|
(define
|
|
loop
|
|
(fn
|
|
()
|
|
(when
|
|
(< i n)
|
|
(begin
|
|
(when
|
|
(not (= (nth s i) "_"))
|
|
(append! out (nth s i)))
|
|
(set! i (+ i 1))
|
|
(loop)))))
|
|
(loop)
|
|
(join "" out)))))
|
|
(define
|
|
read-number
|
|
(fn
|
|
(start)
|
|
(cond
|
|
((and (= (cur) "0") (< (+ pos 1) src-len) (or (= (ocaml-peek 1) "x") (= (ocaml-peek 1) "X")))
|
|
(begin
|
|
(advance! 2)
|
|
(read-hex-digits!)
|
|
(let
|
|
((raw (slice src (+ start 2) pos)))
|
|
(parse-number (str "0x" (strip-underscores raw))))))
|
|
(else
|
|
(begin
|
|
(read-decimal-digits!)
|
|
(when
|
|
(and
|
|
(< pos src-len)
|
|
(= (cur) ".")
|
|
(or
|
|
(>= (+ pos 1) src-len)
|
|
(not (= (ocaml-peek 1) "."))))
|
|
(begin (advance! 1) (read-decimal-digits!)))
|
|
(read-exp-part!)
|
|
(parse-number (strip-underscores (slice src start pos))))))))
|
|
(define
|
|
read-string-literal
|
|
(fn
|
|
()
|
|
(let
|
|
((chars (list)))
|
|
(begin
|
|
(advance! 1)
|
|
(define
|
|
loop
|
|
(fn
|
|
()
|
|
(cond
|
|
((>= pos src-len) nil)
|
|
((= (cur) "\\")
|
|
(begin
|
|
(advance! 1)
|
|
(when
|
|
(< pos src-len)
|
|
(let
|
|
((ch (cur)))
|
|
(begin
|
|
(cond
|
|
((= ch "n") (append! chars "\n"))
|
|
((= ch "t") (append! chars "\t"))
|
|
((= ch "r") (append! chars "\r"))
|
|
((= ch "b") (append! chars "\\b"))
|
|
((= ch "\\") (append! chars "\\"))
|
|
((= ch "'") (append! chars "'"))
|
|
((= ch "\"") (append! chars "\""))
|
|
((= ch " ") nil)
|
|
(else (append! chars ch)))
|
|
(advance! 1))))
|
|
(loop)))
|
|
((= (cur) "\"") (advance! 1))
|
|
(else
|
|
(begin
|
|
(append! chars (cur))
|
|
(advance! 1)
|
|
(loop))))))
|
|
(loop)
|
|
(join "" chars)))))
|
|
(define
|
|
read-char-literal
|
|
(fn
|
|
()
|
|
(begin
|
|
(advance! 1)
|
|
(let
|
|
((value (cond ((= (cur) "\\") (begin (advance! 1) (let ((ch (cur))) (begin (advance! 1) (cond ((= ch "n") "\n") ((= ch "t") "\t") ((= ch "r") "\r") ((= ch "b") "\\b") ((= ch "\\") "\\") ((= ch "'") "'") ((= ch "\"") "\"") (else ch)))))) (else (let ((ch (cur))) (begin (advance! 1) ch))))))
|
|
(begin
|
|
(when
|
|
(and (< pos src-len) (= (cur) "'"))
|
|
(advance! 1))
|
|
value)))))
|
|
(define
|
|
try-punct
|
|
(fn
|
|
(start)
|
|
(let
|
|
((c (cur))
|
|
(c1 (ocaml-peek 1))
|
|
(c2 (ocaml-peek 2)))
|
|
(cond
|
|
((and (= c ";") (= c1 ";"))
|
|
(begin (advance! 2) (push! "op" ";;" start) true))
|
|
((and (= c "-") (= c1 ">"))
|
|
(begin (advance! 2) (push! "op" "->" start) true))
|
|
((and (= c "<") (= c1 "-"))
|
|
(begin (advance! 2) (push! "op" "<-" start) true))
|
|
((and (= c ":") (= c1 "="))
|
|
(begin (advance! 2) (push! "op" ":=" start) true))
|
|
((and (= c ":") (= c1 ":"))
|
|
(begin (advance! 2) (push! "op" "::" start) true))
|
|
((and (= c "|") (= c1 "|"))
|
|
(begin (advance! 2) (push! "op" "||" start) true))
|
|
((and (= c "&") (= c1 "&"))
|
|
(begin (advance! 2) (push! "op" "&&" start) true))
|
|
((and (= c "<") (= c1 "="))
|
|
(begin (advance! 2) (push! "op" "<=" start) true))
|
|
((and (= c ">") (= c1 "="))
|
|
(begin (advance! 2) (push! "op" ">=" start) true))
|
|
((and (= c "<") (= c1 ">"))
|
|
(begin (advance! 2) (push! "op" "<>" start) true))
|
|
((and (= c "=") (= c1 "="))
|
|
(begin (advance! 2) (push! "op" "==" start) true))
|
|
((and (= c "!") (= c1 "="))
|
|
(begin (advance! 2) (push! "op" "!=" start) true))
|
|
((and (= c "|") (= c1 ">"))
|
|
(begin (advance! 2) (push! "op" "|>" start) true))
|
|
((and (= c "<") (= c1 "|"))
|
|
(begin (advance! 2) (push! "op" "<|" start) true))
|
|
((and (= c "@") (= c1 "@"))
|
|
(begin (advance! 2) (push! "op" "@@" start) true))
|
|
((and (= c "*") (= c1 "*"))
|
|
(begin (advance! 2) (push! "op" "**" start) true))
|
|
((or (= c "+") (= c "-") (= c "*") (= c "/") (= c "%") (= c "^") (= c "<") (= c ">") (= c "=") (= c "(") (= c ")") (= c "{") (= c "}") (= c "[") (= c "]") (= c ";") (= c ":") (= c ",") (= c ".") (= c "|") (= c "!") (= c "&") (= c "@") (= c "?") (= c "~") (= c "#"))
|
|
(begin (advance! 1) (push! "op" c start) true))
|
|
(else false)))))
|
|
(define
|
|
step
|
|
(fn
|
|
()
|
|
(begin
|
|
(skip-ws!)
|
|
(when
|
|
(< pos src-len)
|
|
(let
|
|
((start pos) (c (cur)))
|
|
(cond
|
|
((ocaml-ident-start? c)
|
|
(let
|
|
((word (read-ident start)))
|
|
(begin
|
|
(cond
|
|
((ocaml-keyword? word)
|
|
(push! "keyword" word start))
|
|
((ocaml-upper? c) (push! "ctor" word start))
|
|
(else (push! "ident" word start)))
|
|
(step))))
|
|
((ocaml-digit? c)
|
|
(let
|
|
((v (read-number start)))
|
|
(begin (push! "number" v start) (step))))
|
|
((= c "\"")
|
|
(let
|
|
((s (read-string-literal)))
|
|
(begin (push! "string" s start) (step))))
|
|
((and (= c "'") (< (+ pos 1) src-len) (or (and (= (ocaml-peek 1) "\\") (< (+ pos 3) src-len) (= (ocaml-peek 3) "'")) (and (not (= (ocaml-peek 1) "\\")) (< (+ pos 2) src-len) (= (ocaml-peek 2) "'"))))
|
|
(let
|
|
((v (read-char-literal)))
|
|
(begin (push! "char" v start) (step))))
|
|
((= c "'")
|
|
(begin
|
|
(advance! 1)
|
|
(when
|
|
(and (< pos src-len) (ocaml-ident-start? (cur)))
|
|
(begin
|
|
(advance! 1)
|
|
(read-ident (+ start 1))))
|
|
(push!
|
|
"tyvar"
|
|
(slice src (+ start 1) pos)
|
|
start)
|
|
(step)))
|
|
((try-punct start) (step))
|
|
(else
|
|
(error
|
|
(str "ocaml-tokenize: unexpected char " c " at " pos)))))))))
|
|
(step)
|
|
(push! "eof" nil pos)
|
|
tokens)))
|