Files
rose-ash/lib/ocaml/tokenizer.sx
giles 9f539ab392
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 46s
ocaml: phase 3 polymorphic variants (+4 tests, 382 total)
Tokenizer recognises backtick followed by an upper ident, emitting a
ctor token identical to a nominal ctor. Parser and evaluator treat
polyvariants as ctors — same tagged-list runtime. So:

  `Red                   -> ("Red")
  `Some 42               -> ("Some" 42)
  match `Red with | `Red -> 1 | `Green -> 2 | `Blue -> 3
                          -> 1
  `Pair (1,2)            -> ("Pair" 1 2) (with tuple-arg flatten)

Proper row types in HM deferred.
2026-05-08 14:03:09 +00:00

405 lines
13 KiB
Plaintext

;; lib/ocaml/tokenizer.sx — OCaml lexer.
;;
;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, op, eof.
;; Token shape: {:type :value :pos} via lex-make-token.
;; OCaml is not indentation-sensitive — no layout pass.
;; Block comments (* ... *) nest. There is no line-comment syntax.
(prefix-rename
"ocaml-"
(quote
((make-token lex-make-token)
(digit? lex-digit?)
(hex-digit? lex-hex-digit?)
(alpha? lex-alpha?)
(alnum? lex-alnum?)
(ident-start? lex-ident-start?)
(ident-char? lex-ident-char?)
(ws? lex-whitespace?))))
(define
ocaml-keywords
(list
"and"
"as"
"assert"
"begin"
"class"
"constraint"
"do"
"done"
"downto"
"else"
"end"
"exception"
"external"
"false"
"for"
"fun"
"function"
"functor"
"if"
"in"
"include"
"inherit"
"initializer"
"lazy"
"let"
"match"
"method"
"module"
"mutable"
"new"
"nonrec"
"object"
"of"
"open"
"or"
"private"
"rec"
"sig"
"struct"
"then"
"to"
"true"
"try"
"type"
"val"
"virtual"
"when"
"while"
"with"
"land"
"lor"
"lxor"
"lsl"
"lsr"
"asr"
"mod"))
(define ocaml-keyword? (fn (word) (contains? ocaml-keywords word)))
(define
ocaml-upper?
(fn (c) (and (not (= c nil)) (>= c "A") (<= c "Z"))))
(define
ocaml-tokenize
(fn
(src)
(let
((tokens (list)) (pos 0) (src-len (len src)))
(define
ocaml-peek
(fn
(offset)
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
(define cur (fn () (ocaml-peek 0)))
(define advance! (fn (n) (set! pos (+ pos n))))
(define
push!
(fn
(type value start)
(append! tokens (ocaml-make-token type value start))))
(define
skip-block-comment!
(fn
(depth)
(cond
((>= pos src-len) nil)
((and (= (cur) "*") (= (ocaml-peek 1) ")"))
(begin
(advance! 2)
(when
(> depth 1)
(skip-block-comment! (- depth 1)))))
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
(begin
(advance! 2)
(skip-block-comment! (+ depth 1))))
(else (begin (advance! 1) (skip-block-comment! depth))))))
(define
skip-ws!
(fn
()
(cond
((>= pos src-len) nil)
((ocaml-ws? (cur)) (begin (advance! 1) (skip-ws!)))
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
(begin
(advance! 2)
(skip-block-comment! 1)
(skip-ws!)))
(else nil))))
(define
read-ident
(fn
(start)
(begin
(when
(and (< pos src-len) (ocaml-ident-char? (cur)))
(begin (advance! 1) (read-ident start)))
(when
(and (< pos src-len) (= (cur) "'"))
(begin (advance! 1) (read-ident start)))
(slice src start pos))))
(define
read-decimal-digits!
(fn
()
(when
(and (< pos src-len) (or (ocaml-digit? (cur)) (= (cur) "_")))
(begin (advance! 1) (read-decimal-digits!)))))
(define
read-hex-digits!
(fn
()
(when
(and
(< pos src-len)
(or (ocaml-hex-digit? (cur)) (= (cur) "_")))
(begin (advance! 1) (read-hex-digits!)))))
(define
read-exp-part!
(fn
()
(when
(and (< pos src-len) (or (= (cur) "e") (= (cur) "E")))
(let
((p1 (ocaml-peek 1)))
(when
(or
(and (not (= p1 nil)) (ocaml-digit? p1))
(and
(or (= p1 "+") (= p1 "-"))
(< (+ pos 2) src-len)
(ocaml-digit? (ocaml-peek 2))))
(begin
(advance! 1)
(when
(and
(< pos src-len)
(or (= (cur) "+") (= (cur) "-")))
(advance! 1))
(read-decimal-digits!)))))))
(define
strip-underscores
(fn
(s)
(let
((out (list)) (i 0) (n (len s)))
(begin
(define
loop
(fn
()
(when
(< i n)
(begin
(when
(not (= (nth s i) "_"))
(append! out (nth s i)))
(set! i (+ i 1))
(loop)))))
(loop)
(join "" out)))))
(define
read-number
(fn
(start)
(cond
((and (= (cur) "0") (< (+ pos 1) src-len) (or (= (ocaml-peek 1) "x") (= (ocaml-peek 1) "X")))
(begin
(advance! 2)
(read-hex-digits!)
(let
((raw (slice src (+ start 2) pos)))
(parse-number (str "0x" (strip-underscores raw))))))
(else
(begin
(read-decimal-digits!)
(when
(and
(< pos src-len)
(= (cur) ".")
(or
(>= (+ pos 1) src-len)
(not (= (ocaml-peek 1) "."))))
(begin (advance! 1) (read-decimal-digits!)))
(read-exp-part!)
(parse-number (strip-underscores (slice src start pos))))))))
(define
read-string-literal
(fn
()
(let
((chars (list)))
(begin
(advance! 1)
(define
loop
(fn
()
(cond
((>= pos src-len) nil)
((= (cur) "\\")
(begin
(advance! 1)
(when
(< pos src-len)
(let
((ch (cur)))
(begin
(cond
((= ch "n") (append! chars "\n"))
((= ch "t") (append! chars "\t"))
((= ch "r") (append! chars "\r"))
((= ch "b") (append! chars "\\b"))
((= ch "\\") (append! chars "\\"))
((= ch "'") (append! chars "'"))
((= ch "\"") (append! chars "\""))
((= ch " ") nil)
(else (append! chars ch)))
(advance! 1))))
(loop)))
((= (cur) "\"") (advance! 1))
(else
(begin
(append! chars (cur))
(advance! 1)
(loop))))))
(loop)
(join "" chars)))))
(define
read-char-literal
(fn
()
(begin
(advance! 1)
(let
((value (cond ((= (cur) "\\") (begin (advance! 1) (let ((ch (cur))) (begin (advance! 1) (cond ((= ch "n") "\n") ((= ch "t") "\t") ((= ch "r") "\r") ((= ch "b") "\\b") ((= ch "\\") "\\") ((= ch "'") "'") ((= ch "\"") "\"") (else ch)))))) (else (let ((ch (cur))) (begin (advance! 1) ch))))))
(begin
(when
(and (< pos src-len) (= (cur) "'"))
(advance! 1))
value)))))
(define
try-punct
(fn
(start)
(let
((c (cur))
(c1 (ocaml-peek 1))
(c2 (ocaml-peek 2)))
(cond
((and (= c ";") (= c1 ";"))
(begin (advance! 2) (push! "op" ";;" start) true))
((and (= c "+") (= c1 "."))
(begin (advance! 2) (push! "op" "+." start) true))
((and (= c "-") (= c1 ".") (not (and (not (= c2 nil)) (ocaml-digit? c2))))
(begin (advance! 2) (push! "op" "-." start) true))
((and (= c "*") (= c1 "."))
(begin (advance! 2) (push! "op" "*." start) true))
((and (= c "/") (= c1 "."))
(begin (advance! 2) (push! "op" "/." start) true))
((and (= c "-") (= c1 ">"))
(begin (advance! 2) (push! "op" "->" start) true))
((and (= c "<") (= c1 "-"))
(begin (advance! 2) (push! "op" "<-" start) true))
((and (= c ":") (= c1 "="))
(begin (advance! 2) (push! "op" ":=" start) true))
((and (= c ":") (= c1 ":"))
(begin (advance! 2) (push! "op" "::" start) true))
((and (= c "|") (= c1 "|"))
(begin (advance! 2) (push! "op" "||" start) true))
((and (= c "&") (= c1 "&"))
(begin (advance! 2) (push! "op" "&&" start) true))
((and (= c "<") (= c1 "="))
(begin (advance! 2) (push! "op" "<=" start) true))
((and (= c ">") (= c1 "="))
(begin (advance! 2) (push! "op" ">=" start) true))
((and (= c "<") (= c1 ">"))
(begin (advance! 2) (push! "op" "<>" start) true))
((and (= c "=") (= c1 "="))
(begin (advance! 2) (push! "op" "==" start) true))
((and (= c "!") (= c1 "="))
(begin (advance! 2) (push! "op" "!=" start) true))
((and (= c "|") (= c1 ">"))
(begin (advance! 2) (push! "op" "|>" start) true))
((and (= c "<") (= c1 "|"))
(begin (advance! 2) (push! "op" "<|" start) true))
((and (= c "@") (= c1 "@"))
(begin (advance! 2) (push! "op" "@@" start) true))
((and (= c "*") (= c1 "*"))
(begin (advance! 2) (push! "op" "**" start) true))
((or (= c "+") (= c "-") (= c "*") (= c "/") (= c "%") (= c "^") (= c "<") (= c ">") (= c "=") (= c "(") (= c ")") (= c "{") (= c "}") (= c "[") (= c "]") (= c ";") (= c ":") (= c ",") (= c ".") (= c "|") (= c "!") (= c "&") (= c "@") (= c "?") (= c "~") (= c "#"))
(begin (advance! 1) (push! "op" c start) true))
(else false)))))
(define
step
(fn
()
(begin
(skip-ws!)
(when
(< pos src-len)
(let
((start pos) (c (cur)))
(cond
((ocaml-ident-start? c)
(let
((word (read-ident start)))
(begin
(cond
((ocaml-keyword? word)
(push! "keyword" word start))
((ocaml-upper? c) (push! "ctor" word start))
(else (push! "ident" word start)))
(step))))
((ocaml-digit? c)
(let
((v (read-number start)))
(begin (push! "number" v start) (step))))
((= c "\"")
(let
((s (read-string-literal)))
(begin (push! "string" s start) (step))))
((and (= c "'") (< (+ pos 1) src-len) (or (and (= (ocaml-peek 1) "\\") (< (+ pos 3) src-len) (= (ocaml-peek 3) "'")) (and (not (= (ocaml-peek 1) "\\")) (< (+ pos 2) src-len) (= (ocaml-peek 2) "'"))))
(let
((v (read-char-literal)))
(begin (push! "char" v start) (step))))
((= c "'")
(begin
(advance! 1)
(when
(and (< pos src-len) (ocaml-ident-start? (cur)))
(begin
(advance! 1)
(read-ident (+ start 1))))
(push!
"tyvar"
(slice src (+ start 1) pos)
start)
(step)))
;; Polymorphic variant tag: `Tag — emits a ctor token
;; identical to a nominal ctor. Runtime is dynamic, so
;; the distinction only matters for HM (deferred).
((and (= c "`")
(< (+ pos 1) src-len)
(ocaml-upper? (ocaml-peek 1)))
(begin
(advance! 1)
(let ((ctor-start pos))
(begin
(when (and (< pos src-len) (ocaml-ident-char? (cur)))
(begin (advance! 1) (read-ident ctor-start)))
(push! "ctor" (slice src ctor-start pos) start)))
(step)))
((try-punct start) (step))
(else
(error
(str "ocaml-tokenize: unexpected char " c " at " pos)))))))))
(step)
(push! "eof" nil pos)
tokens)))