ocaml: phase 1 tokenizer (+58 tests) — consumes lib/guest/lex.sx
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 52s
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 52s
Idents, ctors, 51 keywords, numbers (int/float/hex/exp/underscored), strings + chars with escapes, type variables, 26 op/punct tokens, and nested (* ... *) block comments. Tests via epoch protocol against sx_server.exe.
This commit is contained in:
382
lib/ocaml/tokenizer.sx
Normal file
382
lib/ocaml/tokenizer.sx
Normal file
@@ -0,0 +1,382 @@
|
||||
;; lib/ocaml/tokenizer.sx — OCaml lexer.
|
||||
;;
|
||||
;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, op, eof.
|
||||
;; Token shape: {:type :value :pos} via lex-make-token.
|
||||
;; OCaml is not indentation-sensitive — no layout pass.
|
||||
;; Block comments (* ... *) nest. There is no line-comment syntax.
|
||||
|
||||
(prefix-rename
|
||||
"ocaml-"
|
||||
(quote
|
||||
((make-token lex-make-token)
|
||||
(digit? lex-digit?)
|
||||
(hex-digit? lex-hex-digit?)
|
||||
(alpha? lex-alpha?)
|
||||
(alnum? lex-alnum?)
|
||||
(ident-start? lex-ident-start?)
|
||||
(ident-char? lex-ident-char?)
|
||||
(ws? lex-whitespace?))))
|
||||
|
||||
(define
|
||||
ocaml-keywords
|
||||
(list
|
||||
"and"
|
||||
"as"
|
||||
"assert"
|
||||
"begin"
|
||||
"class"
|
||||
"constraint"
|
||||
"do"
|
||||
"done"
|
||||
"downto"
|
||||
"else"
|
||||
"end"
|
||||
"exception"
|
||||
"external"
|
||||
"false"
|
||||
"for"
|
||||
"fun"
|
||||
"function"
|
||||
"functor"
|
||||
"if"
|
||||
"in"
|
||||
"include"
|
||||
"inherit"
|
||||
"initializer"
|
||||
"lazy"
|
||||
"let"
|
||||
"match"
|
||||
"method"
|
||||
"module"
|
||||
"mutable"
|
||||
"new"
|
||||
"nonrec"
|
||||
"object"
|
||||
"of"
|
||||
"open"
|
||||
"or"
|
||||
"private"
|
||||
"rec"
|
||||
"sig"
|
||||
"struct"
|
||||
"then"
|
||||
"to"
|
||||
"true"
|
||||
"try"
|
||||
"type"
|
||||
"val"
|
||||
"virtual"
|
||||
"when"
|
||||
"while"
|
||||
"with"
|
||||
"land"
|
||||
"lor"
|
||||
"lxor"
|
||||
"lsl"
|
||||
"lsr"
|
||||
"asr"
|
||||
"mod"))
|
||||
|
||||
(define ocaml-keyword? (fn (word) (contains? ocaml-keywords word)))
|
||||
|
||||
(define
|
||||
ocaml-upper?
|
||||
(fn (c) (and (not (= c nil)) (>= c "A") (<= c "Z"))))
|
||||
|
||||
(define
|
||||
ocaml-tokenize
|
||||
(fn
|
||||
(src)
|
||||
(let
|
||||
((tokens (list)) (pos 0) (src-len (len src)))
|
||||
(define
|
||||
ocaml-peek
|
||||
(fn
|
||||
(offset)
|
||||
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
|
||||
(define cur (fn () (ocaml-peek 0)))
|
||||
(define advance! (fn (n) (set! pos (+ pos n))))
|
||||
(define
|
||||
push!
|
||||
(fn
|
||||
(type value start)
|
||||
(append! tokens (ocaml-make-token type value start))))
|
||||
(define
|
||||
skip-block-comment!
|
||||
(fn
|
||||
(depth)
|
||||
(cond
|
||||
((>= pos src-len) nil)
|
||||
((and (= (cur) "*") (= (ocaml-peek 1) ")"))
|
||||
(begin
|
||||
(advance! 2)
|
||||
(when
|
||||
(> depth 1)
|
||||
(skip-block-comment! (- depth 1)))))
|
||||
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
|
||||
(begin
|
||||
(advance! 2)
|
||||
(skip-block-comment! (+ depth 1))))
|
||||
(else (begin (advance! 1) (skip-block-comment! depth))))))
|
||||
(define
|
||||
skip-ws!
|
||||
(fn
|
||||
()
|
||||
(cond
|
||||
((>= pos src-len) nil)
|
||||
((ocaml-ws? (cur)) (begin (advance! 1) (skip-ws!)))
|
||||
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
|
||||
(begin
|
||||
(advance! 2)
|
||||
(skip-block-comment! 1)
|
||||
(skip-ws!)))
|
||||
(else nil))))
|
||||
(define
|
||||
read-ident
|
||||
(fn
|
||||
(start)
|
||||
(begin
|
||||
(when
|
||||
(and (< pos src-len) (ocaml-ident-char? (cur)))
|
||||
(begin (advance! 1) (read-ident start)))
|
||||
(when
|
||||
(and (< pos src-len) (= (cur) "'"))
|
||||
(begin (advance! 1) (read-ident start)))
|
||||
(slice src start pos))))
|
||||
(define
|
||||
read-decimal-digits!
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (or (ocaml-digit? (cur)) (= (cur) "_")))
|
||||
(begin (advance! 1) (read-decimal-digits!)))))
|
||||
(define
|
||||
read-hex-digits!
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and
|
||||
(< pos src-len)
|
||||
(or (ocaml-hex-digit? (cur)) (= (cur) "_")))
|
||||
(begin (advance! 1) (read-hex-digits!)))))
|
||||
(define
|
||||
read-exp-part!
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(and (< pos src-len) (or (= (cur) "e") (= (cur) "E")))
|
||||
(let
|
||||
((p1 (ocaml-peek 1)))
|
||||
(when
|
||||
(or
|
||||
(and (not (= p1 nil)) (ocaml-digit? p1))
|
||||
(and
|
||||
(or (= p1 "+") (= p1 "-"))
|
||||
(< (+ pos 2) src-len)
|
||||
(ocaml-digit? (ocaml-peek 2))))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(when
|
||||
(and
|
||||
(< pos src-len)
|
||||
(or (= (cur) "+") (= (cur) "-")))
|
||||
(advance! 1))
|
||||
(read-decimal-digits!)))))))
|
||||
(define
|
||||
strip-underscores
|
||||
(fn
|
||||
(s)
|
||||
(let
|
||||
((out (list)) (i 0) (n (len s)))
|
||||
(begin
|
||||
(define
|
||||
loop
|
||||
(fn
|
||||
()
|
||||
(when
|
||||
(< i n)
|
||||
(begin
|
||||
(when
|
||||
(not (= (nth s i) "_"))
|
||||
(append! out (nth s i)))
|
||||
(set! i (+ i 1))
|
||||
(loop)))))
|
||||
(loop)
|
||||
(join "" out)))))
|
||||
(define
|
||||
read-number
|
||||
(fn
|
||||
(start)
|
||||
(cond
|
||||
((and (= (cur) "0") (< (+ pos 1) src-len) (or (= (ocaml-peek 1) "x") (= (ocaml-peek 1) "X")))
|
||||
(begin
|
||||
(advance! 2)
|
||||
(read-hex-digits!)
|
||||
(let
|
||||
((raw (slice src (+ start 2) pos)))
|
||||
(parse-number (str "0x" (strip-underscores raw))))))
|
||||
(else
|
||||
(begin
|
||||
(read-decimal-digits!)
|
||||
(when
|
||||
(and
|
||||
(< pos src-len)
|
||||
(= (cur) ".")
|
||||
(or
|
||||
(>= (+ pos 1) src-len)
|
||||
(not (= (ocaml-peek 1) "."))))
|
||||
(begin (advance! 1) (read-decimal-digits!)))
|
||||
(read-exp-part!)
|
||||
(parse-number (strip-underscores (slice src start pos))))))))
|
||||
(define
|
||||
read-string-literal
|
||||
(fn
|
||||
()
|
||||
(let
|
||||
((chars (list)))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(define
|
||||
loop
|
||||
(fn
|
||||
()
|
||||
(cond
|
||||
((>= pos src-len) nil)
|
||||
((= (cur) "\\")
|
||||
(begin
|
||||
(advance! 1)
|
||||
(when
|
||||
(< pos src-len)
|
||||
(let
|
||||
((ch (cur)))
|
||||
(begin
|
||||
(cond
|
||||
((= ch "n") (append! chars "\n"))
|
||||
((= ch "t") (append! chars "\t"))
|
||||
((= ch "r") (append! chars "\r"))
|
||||
((= ch "b") (append! chars "\\b"))
|
||||
((= ch "\\") (append! chars "\\"))
|
||||
((= ch "'") (append! chars "'"))
|
||||
((= ch "\"") (append! chars "\""))
|
||||
((= ch " ") nil)
|
||||
(else (append! chars ch)))
|
||||
(advance! 1))))
|
||||
(loop)))
|
||||
((= (cur) "\"") (advance! 1))
|
||||
(else
|
||||
(begin
|
||||
(append! chars (cur))
|
||||
(advance! 1)
|
||||
(loop))))))
|
||||
(loop)
|
||||
(join "" chars)))))
|
||||
(define
|
||||
read-char-literal
|
||||
(fn
|
||||
()
|
||||
(begin
|
||||
(advance! 1)
|
||||
(let
|
||||
((value (cond ((= (cur) "\\") (begin (advance! 1) (let ((ch (cur))) (begin (advance! 1) (cond ((= ch "n") "\n") ((= ch "t") "\t") ((= ch "r") "\r") ((= ch "b") "\\b") ((= ch "\\") "\\") ((= ch "'") "'") ((= ch "\"") "\"") (else ch)))))) (else (let ((ch (cur))) (begin (advance! 1) ch))))))
|
||||
(begin
|
||||
(when
|
||||
(and (< pos src-len) (= (cur) "'"))
|
||||
(advance! 1))
|
||||
value)))))
|
||||
(define
|
||||
try-punct
|
||||
(fn
|
||||
(start)
|
||||
(let
|
||||
((c (cur))
|
||||
(c1 (ocaml-peek 1))
|
||||
(c2 (ocaml-peek 2)))
|
||||
(cond
|
||||
((and (= c ";") (= c1 ";"))
|
||||
(begin (advance! 2) (push! "op" ";;" start) true))
|
||||
((and (= c "-") (= c1 ">"))
|
||||
(begin (advance! 2) (push! "op" "->" start) true))
|
||||
((and (= c "<") (= c1 "-"))
|
||||
(begin (advance! 2) (push! "op" "<-" start) true))
|
||||
((and (= c ":") (= c1 "="))
|
||||
(begin (advance! 2) (push! "op" ":=" start) true))
|
||||
((and (= c ":") (= c1 ":"))
|
||||
(begin (advance! 2) (push! "op" "::" start) true))
|
||||
((and (= c "|") (= c1 "|"))
|
||||
(begin (advance! 2) (push! "op" "||" start) true))
|
||||
((and (= c "&") (= c1 "&"))
|
||||
(begin (advance! 2) (push! "op" "&&" start) true))
|
||||
((and (= c "<") (= c1 "="))
|
||||
(begin (advance! 2) (push! "op" "<=" start) true))
|
||||
((and (= c ">") (= c1 "="))
|
||||
(begin (advance! 2) (push! "op" ">=" start) true))
|
||||
((and (= c "<") (= c1 ">"))
|
||||
(begin (advance! 2) (push! "op" "<>" start) true))
|
||||
((and (= c "=") (= c1 "="))
|
||||
(begin (advance! 2) (push! "op" "==" start) true))
|
||||
((and (= c "!") (= c1 "="))
|
||||
(begin (advance! 2) (push! "op" "!=" start) true))
|
||||
((and (= c "|") (= c1 ">"))
|
||||
(begin (advance! 2) (push! "op" "|>" start) true))
|
||||
((and (= c "<") (= c1 "|"))
|
||||
(begin (advance! 2) (push! "op" "<|" start) true))
|
||||
((and (= c "@") (= c1 "@"))
|
||||
(begin (advance! 2) (push! "op" "@@" start) true))
|
||||
((and (= c "*") (= c1 "*"))
|
||||
(begin (advance! 2) (push! "op" "**" start) true))
|
||||
((or (= c "+") (= c "-") (= c "*") (= c "/") (= c "%") (= c "^") (= c "<") (= c ">") (= c "=") (= c "(") (= c ")") (= c "{") (= c "}") (= c "[") (= c "]") (= c ";") (= c ":") (= c ",") (= c ".") (= c "|") (= c "!") (= c "&") (= c "@") (= c "?") (= c "~") (= c "#"))
|
||||
(begin (advance! 1) (push! "op" c start) true))
|
||||
(else false)))))
|
||||
(define
|
||||
step
|
||||
(fn
|
||||
()
|
||||
(begin
|
||||
(skip-ws!)
|
||||
(when
|
||||
(< pos src-len)
|
||||
(let
|
||||
((start pos) (c (cur)))
|
||||
(cond
|
||||
((ocaml-ident-start? c)
|
||||
(let
|
||||
((word (read-ident start)))
|
||||
(begin
|
||||
(cond
|
||||
((ocaml-keyword? word)
|
||||
(push! "keyword" word start))
|
||||
((ocaml-upper? c) (push! "ctor" word start))
|
||||
(else (push! "ident" word start)))
|
||||
(step))))
|
||||
((ocaml-digit? c)
|
||||
(let
|
||||
((v (read-number start)))
|
||||
(begin (push! "number" v start) (step))))
|
||||
((= c "\"")
|
||||
(let
|
||||
((s (read-string-literal)))
|
||||
(begin (push! "string" s start) (step))))
|
||||
((and (= c "'") (< (+ pos 1) src-len) (or (and (= (ocaml-peek 1) "\\") (< (+ pos 3) src-len) (= (ocaml-peek 3) "'")) (and (not (= (ocaml-peek 1) "\\")) (< (+ pos 2) src-len) (= (ocaml-peek 2) "'"))))
|
||||
(let
|
||||
((v (read-char-literal)))
|
||||
(begin (push! "char" v start) (step))))
|
||||
((= c "'")
|
||||
(begin
|
||||
(advance! 1)
|
||||
(when
|
||||
(and (< pos src-len) (ocaml-ident-start? (cur)))
|
||||
(begin
|
||||
(advance! 1)
|
||||
(read-ident (+ start 1))))
|
||||
(push!
|
||||
"tyvar"
|
||||
(slice src (+ start 1) pos)
|
||||
start)
|
||||
(step)))
|
||||
((try-punct start) (step))
|
||||
(else
|
||||
(error
|
||||
(str "ocaml-tokenize: unexpected char " c " at " pos)))))))))
|
||||
(step)
|
||||
(push! "eof" nil pos)
|
||||
tokens)))
|
||||
Reference in New Issue
Block a user