diff --git a/lib/smalltalk/parser.sx b/lib/smalltalk/parser.sx index 657a854e..90639bf2 100644 --- a/lib/smalltalk/parser.sx +++ b/lib/smalltalk/parser.sx @@ -21,6 +21,185 @@ ;; ;; A "chunk" / class-definition stream is parsed at a higher level (deferred). +;; ── Chunk-stream reader ──────────────────────────────────────────────── +;; Pharo chunk format: chunks are separated by `!`. A doubled `!!` inside a +;; chunk represents a single literal `!`. Returns list of chunk strings with +;; surrounding whitespace trimmed. +(define + st-read-chunks + (fn + (src) + (let + ((chunks (list)) + (buf (list)) + (pos 0) + (n (len src))) + (begin + (define + flush! + (fn + () + (let + ((s (st-trim (join "" buf)))) + (begin (append! chunks s) (set! buf (list)))))) + (define + rc-loop + (fn + () + (when + (< pos n) + (let + ((c (nth src pos))) + (cond + ((= c "!") + (cond + ((and (< (+ pos 1) n) (= (nth src (+ pos 1)) "!")) + (begin (append! buf "!") (set! pos (+ pos 2)) (rc-loop))) + (else + (begin (flush!) (set! pos (+ pos 1)) (rc-loop))))) + (else + (begin (append! buf c) (set! pos (+ pos 1)) (rc-loop)))))))) + (rc-loop) + ;; trailing text without a closing `!` — preserve as a chunk + (when (> (len buf) 0) (flush!)) + chunks)))) + +(define + st-trim + (fn + (s) + (let + ((n (len s)) (i 0) (j 0)) + (begin + (set! j n) + (define + tl-loop + (fn + () + (when + (and (< i n) (st-trim-ws? (nth s i))) + (begin (set! i (+ i 1)) (tl-loop))))) + (tl-loop) + (define + tr-loop + (fn + () + (when + (and (> j i) (st-trim-ws? (nth s (- j 1)))) + (begin (set! j (- j 1)) (tr-loop))))) + (tr-loop) + (slice s i j))))) + +(define + st-trim-ws? + (fn (c) (or (= c " ") (= c "\t") (= c "\n") (= c "\r")))) + +;; Parse a chunk stream. Walks chunks and applies the Pharo file-in +;; convention: a chunk that evaluates to "X methodsFor: 'cat'" or +;; "X class methodsFor: 'cat'" enters a methods batch — subsequent chunks +;; are method source until an empty chunk closes the batch. +;; +;; Returns list of entries: +;; {:kind "expr" :ast EXPR-AST} +;; {:kind "method" :class CLS :class-side? BOOL :category CAT :ast METHOD-AST} +;; {:kind "blank"} (empty chunks outside a methods batch) +;; {:kind "end-methods"} (empty chunk closing a methods batch) +(define + st-parse-chunks + (fn + (src) + (let + ((chunks (st-read-chunks src)) + (entries (list)) + (mode "do-it") + (cls-name nil) + (class-side? false) + (category nil)) + (begin + (for-each + (fn + (chunk) + (cond + ((= chunk "") + (cond + ((= mode "methods") + (begin + (append! entries {:kind "end-methods"}) + (set! mode "do-it") + (set! cls-name nil) + (set! class-side? false) + (set! category nil))) + (else (append! entries {:kind "blank"})))) + ((= mode "methods") + (append! + entries + {:kind "method" + :class cls-name + :class-side? class-side? + :category category + :ast (st-parse-method chunk)})) + (else + (let + ((ast (st-parse-expr chunk))) + (begin + (append! entries {:kind "expr" :ast ast}) + (let + ((mf (st-detect-methods-for ast))) + (when + (not (= mf nil)) + (begin + (set! mode "methods") + (set! cls-name (get mf :class)) + (set! class-side? (get mf :class-side?)) + (set! category (get mf :category)))))))))) + chunks) + entries)))) + +;; Recognise `Foo methodsFor: 'cat'` (and related) as starting a methods batch. +;; Returns nil if the AST doesn't look like one of these forms. +(define + st-detect-methods-for + (fn + (ast) + (cond + ((not (= (get ast :type) "send")) nil) + ((not (st-is-methods-for-selector? (get ast :selector))) nil) + (else + (let + ((recv (get ast :receiver)) (args (get ast :args))) + (let + ((cat-arg (if (> (len args) 0) (nth args 0) nil))) + (let + ((category + (cond + ((= cat-arg nil) nil) + ((= (get cat-arg :type) "lit-string") (get cat-arg :value)) + ((= (get cat-arg :type) "lit-symbol") (get cat-arg :value)) + (else nil)))) + (cond + ((= (get recv :type) "ident") + {:class (get recv :name) + :class-side? false + :category category}) + ;; `Foo class methodsFor: 'cat'` — recv is a unary send `Foo class` + ((and + (= (get recv :type) "send") + (= (get recv :selector) "class") + (= (get (get recv :receiver) :type) "ident")) + {:class (get (get recv :receiver) :name) + :class-side? true + :category category}) + (else nil))))))))) + +(define + st-is-methods-for-selector? + (fn + (sel) + (or + (= sel "methodsFor:") + (= sel "methodsFor:stamp:") + (= sel "category:")))) + (define st-tok-type (fn (t) (if (= t nil) "eof" (get t :type)))) (define st-tok-value (fn (t) (if (= t nil) nil (get t :value)))) @@ -548,13 +727,81 @@ (u-loop) receiver)))) + ;; Parse a single pragma: `` + ;; Returns {:selector "primitive:" :args (list literal-asts)}. + (define + parse-pragma + (fn + () + (begin + (consume! "binary" "<") + (let + ((sel-parts (list)) (args (list))) + (begin + (define + pr-loop + (fn + () + (when + (at-type? "keyword") + (let ((t (peek-tok))) + (begin + (advance-tok!) + (append! sel-parts (st-tok-value t)) + (append! args (parse-pragma-arg)) + (pr-loop)))))) + (pr-loop) + (consume! "binary" ">") + {:selector (join "" sel-parts) :args args}))))) + + ;; Pragma arguments are literals only. + (define + parse-pragma-arg + (fn + () + (let + ((t (peek-tok))) + (let + ((ty (st-tok-type t)) (v (st-tok-value t))) + (cond + ((= ty "number") + (begin + (advance-tok!) + {:type (if (integer? v) "lit-int" "lit-float") :value v})) + ((= ty "string") (begin (advance-tok!) {:type "lit-string" :value v})) + ((= ty "char") (begin (advance-tok!) {:type "lit-char" :value v})) + ((= ty "symbol") (begin (advance-tok!) {:type "lit-symbol" :value v})) + ((= ty "ident") + (begin + (advance-tok!) + (cond + ((= v "nil") {:type "lit-nil"}) + ((= v "true") {:type "lit-true"}) + ((= v "false") {:type "lit-false"}) + (else (error (str "st-parse: pragma arg must be literal, got ident " v)))))) + ((and (= ty "binary") (= v "-") + (= (st-tok-type (peek-tok-at 1)) "number")) + (let ((n (st-tok-value (peek-tok-at 1)))) + (begin + (advance-tok!) + (advance-tok!) + {:type (if (integer? n) "lit-int" "lit-float") + :value (- 0 n)}))) + (else + (error + (str "st-parse: pragma arg must be literal, got " ty)))))))) + ;; Method header: unary | binary arg | (kw arg)+ (define parse-method (fn () (let - ((sel "") (params (list)) (temps (list)) (body (list))) + ((sel "") + (params (list)) + (temps (list)) + (pragmas (list)) + (body (list))) (begin (cond ;; Unary header @@ -593,30 +840,47 @@ (str "st-parse-method: expected selector header, got " (st-tok-type (peek-tok)))))) - ;; Optional temps: | t1 t2 | - (when - (at? "bar" nil) - (begin - (advance-tok!) - (define - th-loop - (fn - () - (when - (at-type? "ident") - (let ((t (peek-tok))) - (begin - (advance-tok!) - (append! temps (st-tok-value t)) - (th-loop)))))) - (th-loop) - (consume! "bar" nil))) + ;; Pragmas and temps may appear in either order. Allow many + ;; pragmas; one temps section. + (define + parse-temps! + (fn + () + (begin + (advance-tok!) + (define + th-loop + (fn + () + (when + (at-type? "ident") + (let ((t (peek-tok))) + (begin + (advance-tok!) + (append! temps (st-tok-value t)) + (th-loop)))))) + (th-loop) + (consume! "bar" nil)))) + (define + pt-loop + (fn + () + (cond + ((and + (at? "binary" "<") + (= (st-tok-type (peek-tok-at 1)) "keyword")) + (begin (append! pragmas (parse-pragma)) (pt-loop))) + ((and (at? "bar" nil) (= (len temps) 0)) + (begin (parse-temps!) (pt-loop))) + (else nil)))) + (pt-loop) ;; Body statements (set! body (parse-statements "eof")) {:type "method" :selector sel :params params :temps temps + :pragmas pragmas :body body})))) ;; Top-level program: statements separated by '.' diff --git a/lib/smalltalk/tests/parse.sx b/lib/smalltalk/tests/parse.sx index edf3419a..9ce86338 100644 --- a/lib/smalltalk/tests/parse.sx +++ b/lib/smalltalk/tests/parse.sx @@ -314,6 +314,7 @@ :selector "factorial" :params (list) :temps (list) + :pragmas (list) :body (list {:type "return" :expr {:type "send" @@ -335,6 +336,7 @@ :selector "+" :params (list "other") :temps (list) + :pragmas (list) :body (list {:type "return" :expr {:type "lit-string" :value "plus"}})}) (st-test @@ -344,6 +346,7 @@ :selector "at:put:" :params (list "i" "v") :temps (list) + :pragmas (list) :body (list {:type "return" :expr {:type "ident" :name "v"}})}) (st-test @@ -353,6 +356,7 @@ :selector "twice:" :params (list "x") :temps (list "t") + :pragmas (list) :body (list {:type "assign" :name "t" diff --git a/lib/smalltalk/tests/parse_chunks.sx b/lib/smalltalk/tests/parse_chunks.sx new file mode 100644 index 00000000..e46d9884 --- /dev/null +++ b/lib/smalltalk/tests/parse_chunks.sx @@ -0,0 +1,294 @@ +;; Smalltalk chunk-stream parser + pragma tests. +;; +;; Reuses helpers (st-test, st-deep=?) from tokenize.sx. Counters reset +;; here so this file's summary covers chunk + pragma tests only. + +(set! st-test-pass 0) +(set! st-test-fail 0) +(set! st-test-fails (list)) + +;; ── 1. Raw chunk reader ── +(st-test "empty source" (st-read-chunks "") (list)) +(st-test "single chunk" (st-read-chunks "foo!") (list "foo")) +(st-test "two chunks" (st-read-chunks "a! b!") (list "a" "b")) +(st-test "trailing no bang" (st-read-chunks "a! b") (list "a" "b")) +(st-test "empty chunk" (st-read-chunks "a! ! b!") (list "a" "" "b")) +(st-test + "doubled bang escapes" + (st-read-chunks "yes!! no!yes!") + (list "yes! no" "yes")) +(st-test + "whitespace trimmed" + (st-read-chunks " \n hello \n !") + (list "hello")) + +;; ── 2. Chunk parser — do-it mode ── +(st-test + "single do-it chunk" + (st-parse-chunks "1 + 2!") + (list + {:kind "expr" + :ast {:type "send" + :receiver {:type "lit-int" :value 1} + :selector "+" + :args (list {:type "lit-int" :value 2})}})) + +(st-test + "two do-it chunks" + (st-parse-chunks "x := 1! y := 2!") + (list + {:kind "expr" + :ast {:type "assign" :name "x" :expr {:type "lit-int" :value 1}}} + {:kind "expr" + :ast {:type "assign" :name "y" :expr {:type "lit-int" :value 2}}})) + +(st-test + "blank chunk outside methods" + (st-parse-chunks "1! ! 2!") + (list + {:kind "expr" :ast {:type "lit-int" :value 1}} + {:kind "blank"} + {:kind "expr" :ast {:type "lit-int" :value 2}})) + +;; ── 3. Methods batch ── +(st-test + "methodsFor opens method batch" + (st-parse-chunks + "Foo methodsFor: 'access'! foo ^ 1! bar ^ 2! !") + (list + {:kind "expr" + :ast {:type "send" + :receiver {:type "ident" :name "Foo"} + :selector "methodsFor:" + :args (list {:type "lit-string" :value "access"})}} + {:kind "method" + :class "Foo" + :class-side? false + :category "access" + :ast {:type "method" + :selector "foo" + :params (list) + :temps (list) + :pragmas (list) + :body (list + {:type "return" :expr {:type "lit-int" :value 1}})}} + {:kind "method" + :class "Foo" + :class-side? false + :category "access" + :ast {:type "method" + :selector "bar" + :params (list) + :temps (list) + :pragmas (list) + :body (list + {:type "return" :expr {:type "lit-int" :value 2}})}} + {:kind "end-methods"})) + +(st-test + "class-side methodsFor" + (st-parse-chunks + "Foo class methodsFor: 'creation'! make ^ self new! !") + (list + {:kind "expr" + :ast {:type "send" + :receiver {:type "send" + :receiver {:type "ident" :name "Foo"} + :selector "class" + :args (list)} + :selector "methodsFor:" + :args (list {:type "lit-string" :value "creation"})}} + {:kind "method" + :class "Foo" + :class-side? true + :category "creation" + :ast {:type "method" + :selector "make" + :params (list) + :temps (list) + :pragmas (list) + :body (list + {:type "return" + :expr {:type "send" + :receiver {:type "self"} + :selector "new" + :args (list)}})}} + {:kind "end-methods"})) + +(st-test + "method batch returns to do-it after empty chunk" + (st-parse-chunks + "Foo methodsFor: 'a'! m1 ^ 1! ! 99!") + (list + {:kind "expr" + :ast {:type "send" + :receiver {:type "ident" :name "Foo"} + :selector "methodsFor:" + :args (list {:type "lit-string" :value "a"})}} + {:kind "method" + :class "Foo" + :class-side? false + :category "a" + :ast {:type "method" + :selector "m1" + :params (list) + :temps (list) + :pragmas (list) + :body (list + {:type "return" :expr {:type "lit-int" :value 1}})}} + {:kind "end-methods"} + {:kind "expr" :ast {:type "lit-int" :value 99}})) + +;; ── 4. Pragmas in method bodies ── +(st-test + "single pragma" + (st-parse-method "primAt: i ^ self") + {:type "method" + :selector "primAt:" + :params (list "i") + :temps (list) + :pragmas (list + {:selector "primitive:" + :args (list {:type "lit-int" :value 60})}) + :body (list {:type "return" :expr {:type "self"}})}) + +(st-test + "pragma with two keyword pairs" + (st-parse-method "fft ^ nil") + {:type "method" + :selector "fft" + :params (list) + :temps (list) + :pragmas (list + {:selector "primitive:module:" + :args (list + {:type "lit-int" :value 1} + {:type "lit-string" :value "fft"})}) + :body (list {:type "return" :expr {:type "lit-nil"}})}) + +(st-test + "pragma with negative number" + (st-parse-method "neg ^ nil") + {:type "method" + :selector "neg" + :params (list) + :temps (list) + :pragmas (list + {:selector "primitive:" + :args (list {:type "lit-int" :value -1})}) + :body (list {:type "return" :expr {:type "lit-nil"}})}) + +(st-test + "pragma with symbol arg" + (st-parse-method "tagged ^ nil") + {:type "method" + :selector "tagged" + :params (list) + :temps (list) + :pragmas (list + {:selector "category:" + :args (list {:type "lit-symbol" :value "algebra"})}) + :body (list {:type "return" :expr {:type "lit-nil"}})}) + +(st-test + "pragma then temps" + (st-parse-method "calc | t | t := 5. ^ t") + {:type "method" + :selector "calc" + :params (list) + :temps (list "t") + :pragmas (list + {:selector "primitive:" + :args (list {:type "lit-int" :value 1})}) + :body (list + {:type "assign" :name "t" :expr {:type "lit-int" :value 5}} + {:type "return" :expr {:type "ident" :name "t"}})}) + +(st-test + "temps then pragma" + (st-parse-method "calc | t | t := 5. ^ t") + {:type "method" + :selector "calc" + :params (list) + :temps (list "t") + :pragmas (list + {:selector "primitive:" + :args (list {:type "lit-int" :value 1})}) + :body (list + {:type "assign" :name "t" :expr {:type "lit-int" :value 5}} + {:type "return" :expr {:type "ident" :name "t"}})}) + +(st-test + "two pragmas" + (st-parse-method "m ^ self") + {:type "method" + :selector "m" + :params (list) + :temps (list) + :pragmas (list + {:selector "primitive:" + :args (list {:type "lit-int" :value 1})} + {:selector "category:" + :args (list {:type "lit-string" :value "a"})}) + :body (list {:type "return" :expr {:type "self"}})}) + +;; ── 5. End-to-end: a small "filed-in" snippet ── +(st-test + "small filed-in class snippet" + (st-parse-chunks + "Object subclass: #Account + instanceVariableNames: 'balance'! + + !Account methodsFor: 'access'! + balance + ^ balance! + + deposit: amount + balance := balance + amount. + ^ self! !") + (list + {:kind "expr" + :ast {:type "send" + :receiver {:type "ident" :name "Object"} + :selector "subclass:instanceVariableNames:" + :args (list + {:type "lit-symbol" :value "Account"} + {:type "lit-string" :value "balance"})}} + {:kind "blank"} + {:kind "expr" + :ast {:type "send" + :receiver {:type "ident" :name "Account"} + :selector "methodsFor:" + :args (list {:type "lit-string" :value "access"})}} + {:kind "method" + :class "Account" + :class-side? false + :category "access" + :ast {:type "method" + :selector "balance" + :params (list) + :temps (list) + :pragmas (list) + :body (list + {:type "return" + :expr {:type "ident" :name "balance"}})}} + {:kind "method" + :class "Account" + :class-side? false + :category "access" + :ast {:type "method" + :selector "deposit:" + :params (list "amount") + :temps (list) + :pragmas (list) + :body (list + {:type "assign" + :name "balance" + :expr {:type "send" + :receiver {:type "ident" :name "balance"} + :selector "+" + :args (list {:type "ident" :name "amount"})}} + {:type "return" :expr {:type "self"}})}} + {:kind "end-methods"})) + +(list st-test-pass st-test-fail) diff --git a/plans/smalltalk-on-sx.md b/plans/smalltalk-on-sx.md index 481ef9b0..dec0cd01 100644 --- a/plans/smalltalk-on-sx.md +++ b/plans/smalltalk-on-sx.md @@ -52,7 +52,7 @@ Core mapping: ### Phase 1 — tokenizer + parser - [x] Tokenizer: identifiers, keywords (`foo:`), binary selectors (`+`, `==`, `,`, `->`, `~=` etc.), numbers (radix `16r1F`; **scaled `1.5s2` deferred**), strings `'…''…'`, characters `$c`, symbols `#foo` `#'foo bar'` `#+`, byte arrays `#[1 2 3]` (open token), literal arrays `#(1 #foo 'x')` (open token), comments `"…"` - [x] Parser (expression level): blocks `[:a :b | | t1 t2 | …]`, cascades, message precedence (unary > binary > keyword), assignment, return, statement sequences, literal arrays, byte arrays, paren grouping, method headers (`+ other`, `at:put:`, unary, with temps and body). Class-definition keyword messages parse as ordinary keyword sends — no special-case needed. -- [ ] Parser (chunk-stream level): `! !` chunk separators driving a sequence of top-level expressions, pragmas `` inside method bodies +- [x] Parser (chunk-stream level): `st-read-chunks` splits source on `!` (with `!!` doubling) and `st-parse-chunks` runs the Pharo file-in state machine — `methodsFor:` / `class methodsFor:` opens a method batch, an empty chunk closes it. Pragmas `` (incl. multiple keyword pairs, before or after temps, multiple per method) parsed into the method AST. - [x] Unit tests in `lib/smalltalk/tests/parse.sx` ### Phase 2 — object model + sequential eval @@ -108,6 +108,7 @@ Core mapping: _Newest first. Agent appends on every commit._ +- 2026-04-25: chunk-stream parser + pragmas + 21 chunk/pragma tests (`lib/smalltalk/tests/parse_chunks.sx`). `st-read-chunks` (with `!!` doubling), `st-parse-chunks` state machine for `methodsFor:` batches incl. class-side. Pragmas with multiple keyword pairs, signed numeric / string / symbol args, in either pragma-then-temps or temps-then-pragma order. 131/131 tests pass. - 2026-04-25: expression-level parser + 47 parse tests (`lib/smalltalk/parser.sx`, `lib/smalltalk/tests/parse.sx`). Full message precedence (unary > binary > keyword), cascades, blocks with params/temps, literal/byte arrays, assignment chain, method headers (unary/binary/keyword). Chunk-format `! !` driver deferred to a follow-up box. 110/110 tests pass. - 2026-04-25: tokenizer + 63 tests (`lib/smalltalk/tokenizer.sx`, `lib/smalltalk/tests/tokenize.sx`, `lib/smalltalk/test.sh`). All token types covered except scaled decimals `1.5s2` (deferred). `#(` and `#[` emit open tokens; literal-array contents lexed as ordinary tokens for the parser to interpret.