Files
rose-ash/lib/ocaml/tokenizer.sx
giles ee002f2e02
Some checks failed
Test, Build, and Deploy / test-build-deploy (push) Failing after 27s
ocaml: phase 1/5/6 float arithmetic +./-./*./. (+5 tests, 372 total)
Tokenizer: +. -. *. /. (with -. avoiding clash with negative float
literals). Parser table places dotted ops at int-precedence levels.
Eval routes to host SX +/-/*//. HM types them Float->Float->Float;
literal floats now infer as Float (was Int).

OCaml-style 1.5 +. 2.5 : Float works end-to-end through tokenize +
parse + eval + infer.
2026-05-08 13:55:04 +00:00

391 lines
13 KiB
Plaintext

;; lib/ocaml/tokenizer.sx — OCaml lexer.
;;
;; Tokens: ident, ctor (uppercase ident), keyword, number, string, char, op, eof.
;; Token shape: {:type :value :pos} via lex-make-token.
;; OCaml is not indentation-sensitive — no layout pass.
;; Block comments (* ... *) nest. There is no line-comment syntax.
(prefix-rename
"ocaml-"
(quote
((make-token lex-make-token)
(digit? lex-digit?)
(hex-digit? lex-hex-digit?)
(alpha? lex-alpha?)
(alnum? lex-alnum?)
(ident-start? lex-ident-start?)
(ident-char? lex-ident-char?)
(ws? lex-whitespace?))))
(define
ocaml-keywords
(list
"and"
"as"
"assert"
"begin"
"class"
"constraint"
"do"
"done"
"downto"
"else"
"end"
"exception"
"external"
"false"
"for"
"fun"
"function"
"functor"
"if"
"in"
"include"
"inherit"
"initializer"
"lazy"
"let"
"match"
"method"
"module"
"mutable"
"new"
"nonrec"
"object"
"of"
"open"
"or"
"private"
"rec"
"sig"
"struct"
"then"
"to"
"true"
"try"
"type"
"val"
"virtual"
"when"
"while"
"with"
"land"
"lor"
"lxor"
"lsl"
"lsr"
"asr"
"mod"))
(define ocaml-keyword? (fn (word) (contains? ocaml-keywords word)))
(define
ocaml-upper?
(fn (c) (and (not (= c nil)) (>= c "A") (<= c "Z"))))
(define
ocaml-tokenize
(fn
(src)
(let
((tokens (list)) (pos 0) (src-len (len src)))
(define
ocaml-peek
(fn
(offset)
(if (< (+ pos offset) src-len) (nth src (+ pos offset)) nil)))
(define cur (fn () (ocaml-peek 0)))
(define advance! (fn (n) (set! pos (+ pos n))))
(define
push!
(fn
(type value start)
(append! tokens (ocaml-make-token type value start))))
(define
skip-block-comment!
(fn
(depth)
(cond
((>= pos src-len) nil)
((and (= (cur) "*") (= (ocaml-peek 1) ")"))
(begin
(advance! 2)
(when
(> depth 1)
(skip-block-comment! (- depth 1)))))
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
(begin
(advance! 2)
(skip-block-comment! (+ depth 1))))
(else (begin (advance! 1) (skip-block-comment! depth))))))
(define
skip-ws!
(fn
()
(cond
((>= pos src-len) nil)
((ocaml-ws? (cur)) (begin (advance! 1) (skip-ws!)))
((and (= (cur) "(") (= (ocaml-peek 1) "*"))
(begin
(advance! 2)
(skip-block-comment! 1)
(skip-ws!)))
(else nil))))
(define
read-ident
(fn
(start)
(begin
(when
(and (< pos src-len) (ocaml-ident-char? (cur)))
(begin (advance! 1) (read-ident start)))
(when
(and (< pos src-len) (= (cur) "'"))
(begin (advance! 1) (read-ident start)))
(slice src start pos))))
(define
read-decimal-digits!
(fn
()
(when
(and (< pos src-len) (or (ocaml-digit? (cur)) (= (cur) "_")))
(begin (advance! 1) (read-decimal-digits!)))))
(define
read-hex-digits!
(fn
()
(when
(and
(< pos src-len)
(or (ocaml-hex-digit? (cur)) (= (cur) "_")))
(begin (advance! 1) (read-hex-digits!)))))
(define
read-exp-part!
(fn
()
(when
(and (< pos src-len) (or (= (cur) "e") (= (cur) "E")))
(let
((p1 (ocaml-peek 1)))
(when
(or
(and (not (= p1 nil)) (ocaml-digit? p1))
(and
(or (= p1 "+") (= p1 "-"))
(< (+ pos 2) src-len)
(ocaml-digit? (ocaml-peek 2))))
(begin
(advance! 1)
(when
(and
(< pos src-len)
(or (= (cur) "+") (= (cur) "-")))
(advance! 1))
(read-decimal-digits!)))))))
(define
strip-underscores
(fn
(s)
(let
((out (list)) (i 0) (n (len s)))
(begin
(define
loop
(fn
()
(when
(< i n)
(begin
(when
(not (= (nth s i) "_"))
(append! out (nth s i)))
(set! i (+ i 1))
(loop)))))
(loop)
(join "" out)))))
(define
read-number
(fn
(start)
(cond
((and (= (cur) "0") (< (+ pos 1) src-len) (or (= (ocaml-peek 1) "x") (= (ocaml-peek 1) "X")))
(begin
(advance! 2)
(read-hex-digits!)
(let
((raw (slice src (+ start 2) pos)))
(parse-number (str "0x" (strip-underscores raw))))))
(else
(begin
(read-decimal-digits!)
(when
(and
(< pos src-len)
(= (cur) ".")
(or
(>= (+ pos 1) src-len)
(not (= (ocaml-peek 1) "."))))
(begin (advance! 1) (read-decimal-digits!)))
(read-exp-part!)
(parse-number (strip-underscores (slice src start pos))))))))
(define
read-string-literal
(fn
()
(let
((chars (list)))
(begin
(advance! 1)
(define
loop
(fn
()
(cond
((>= pos src-len) nil)
((= (cur) "\\")
(begin
(advance! 1)
(when
(< pos src-len)
(let
((ch (cur)))
(begin
(cond
((= ch "n") (append! chars "\n"))
((= ch "t") (append! chars "\t"))
((= ch "r") (append! chars "\r"))
((= ch "b") (append! chars "\\b"))
((= ch "\\") (append! chars "\\"))
((= ch "'") (append! chars "'"))
((= ch "\"") (append! chars "\""))
((= ch " ") nil)
(else (append! chars ch)))
(advance! 1))))
(loop)))
((= (cur) "\"") (advance! 1))
(else
(begin
(append! chars (cur))
(advance! 1)
(loop))))))
(loop)
(join "" chars)))))
(define
read-char-literal
(fn
()
(begin
(advance! 1)
(let
((value (cond ((= (cur) "\\") (begin (advance! 1) (let ((ch (cur))) (begin (advance! 1) (cond ((= ch "n") "\n") ((= ch "t") "\t") ((= ch "r") "\r") ((= ch "b") "\\b") ((= ch "\\") "\\") ((= ch "'") "'") ((= ch "\"") "\"") (else ch)))))) (else (let ((ch (cur))) (begin (advance! 1) ch))))))
(begin
(when
(and (< pos src-len) (= (cur) "'"))
(advance! 1))
value)))))
(define
try-punct
(fn
(start)
(let
((c (cur))
(c1 (ocaml-peek 1))
(c2 (ocaml-peek 2)))
(cond
((and (= c ";") (= c1 ";"))
(begin (advance! 2) (push! "op" ";;" start) true))
((and (= c "+") (= c1 "."))
(begin (advance! 2) (push! "op" "+." start) true))
((and (= c "-") (= c1 ".") (not (and (not (= c2 nil)) (ocaml-digit? c2))))
(begin (advance! 2) (push! "op" "-." start) true))
((and (= c "*") (= c1 "."))
(begin (advance! 2) (push! "op" "*." start) true))
((and (= c "/") (= c1 "."))
(begin (advance! 2) (push! "op" "/." start) true))
((and (= c "-") (= c1 ">"))
(begin (advance! 2) (push! "op" "->" start) true))
((and (= c "<") (= c1 "-"))
(begin (advance! 2) (push! "op" "<-" start) true))
((and (= c ":") (= c1 "="))
(begin (advance! 2) (push! "op" ":=" start) true))
((and (= c ":") (= c1 ":"))
(begin (advance! 2) (push! "op" "::" start) true))
((and (= c "|") (= c1 "|"))
(begin (advance! 2) (push! "op" "||" start) true))
((and (= c "&") (= c1 "&"))
(begin (advance! 2) (push! "op" "&&" start) true))
((and (= c "<") (= c1 "="))
(begin (advance! 2) (push! "op" "<=" start) true))
((and (= c ">") (= c1 "="))
(begin (advance! 2) (push! "op" ">=" start) true))
((and (= c "<") (= c1 ">"))
(begin (advance! 2) (push! "op" "<>" start) true))
((and (= c "=") (= c1 "="))
(begin (advance! 2) (push! "op" "==" start) true))
((and (= c "!") (= c1 "="))
(begin (advance! 2) (push! "op" "!=" start) true))
((and (= c "|") (= c1 ">"))
(begin (advance! 2) (push! "op" "|>" start) true))
((and (= c "<") (= c1 "|"))
(begin (advance! 2) (push! "op" "<|" start) true))
((and (= c "@") (= c1 "@"))
(begin (advance! 2) (push! "op" "@@" start) true))
((and (= c "*") (= c1 "*"))
(begin (advance! 2) (push! "op" "**" start) true))
((or (= c "+") (= c "-") (= c "*") (= c "/") (= c "%") (= c "^") (= c "<") (= c ">") (= c "=") (= c "(") (= c ")") (= c "{") (= c "}") (= c "[") (= c "]") (= c ";") (= c ":") (= c ",") (= c ".") (= c "|") (= c "!") (= c "&") (= c "@") (= c "?") (= c "~") (= c "#"))
(begin (advance! 1) (push! "op" c start) true))
(else false)))))
(define
step
(fn
()
(begin
(skip-ws!)
(when
(< pos src-len)
(let
((start pos) (c (cur)))
(cond
((ocaml-ident-start? c)
(let
((word (read-ident start)))
(begin
(cond
((ocaml-keyword? word)
(push! "keyword" word start))
((ocaml-upper? c) (push! "ctor" word start))
(else (push! "ident" word start)))
(step))))
((ocaml-digit? c)
(let
((v (read-number start)))
(begin (push! "number" v start) (step))))
((= c "\"")
(let
((s (read-string-literal)))
(begin (push! "string" s start) (step))))
((and (= c "'") (< (+ pos 1) src-len) (or (and (= (ocaml-peek 1) "\\") (< (+ pos 3) src-len) (= (ocaml-peek 3) "'")) (and (not (= (ocaml-peek 1) "\\")) (< (+ pos 2) src-len) (= (ocaml-peek 2) "'"))))
(let
((v (read-char-literal)))
(begin (push! "char" v start) (step))))
((= c "'")
(begin
(advance! 1)
(when
(and (< pos src-len) (ocaml-ident-start? (cur)))
(begin
(advance! 1)
(read-ident (+ start 1))))
(push!
"tyvar"
(slice src (+ start 1) pos)
start)
(step)))
((try-punct start) (step))
(else
(error
(str "ocaml-tokenize: unexpected char " c " at " pos)))))))))
(step)
(push! "eof" nil pos)
tokens)))