commit 45cbd139772d47b5c7d128cbc0cd6243b9581a1c
parent 099992839e3d04488bd9815b8f450677432c2dc1
Author: Virgil Dupras <hsoft@hardcoded.net>
Date: Thu, 16 Jun 2022 13:43:51 -0400
cc/tok: properly tokenize symbols
Previously, it would erronously tokenize an ambiguous stream like "a+++b"
(the correct tokenization being "a ++ + b")
Diffstat:
2 files changed, 67 insertions(+), 16 deletions(-)
diff --git a/fs/cc/tok.fs b/fs/cc/tok.fs
@@ -1,5 +1,49 @@
\ C compiler tokenization
-\ Throughout the CC code, "tok" means a string representing a token.
+\ Throughout the CC code, "tok" means a Token structure, which is:
+
+\ TODO: actually implement this. As of now, tok is the string that was read.
+\ 1b token type id
+\ xb string having been read
+
+0 const TOK_KEYWORD
+1 const TOK_IDENTIFIER
+2 const TOK_CONSTANT
+3 const TOK_STRLIT
+4 const TOK_SYMBOL
+
+create keywords 5 c, ," break" 4 c, ," case" 4 c, ," char"
+ 5 c, ," const" 8 c, ," continue" 7 c, ," default"
+ 2 c, ," do" 6 c, ," double" 4 c, ," else"
+ 4 c, ," enum" 6 c, ," extern" 5 c, ," float"
+ 3 c, ," for" 4 c, ," goto" 2 c, ," if"
+ 6 c, ," inline" 3 c, ," int" 8 c, ," register"
+ 6 c, ," return" 5 c, ," short" 6 c, ," signed"
+ 6 c, ," sizeof" 6 c, ," static" 6 c, ," struct"
+ 6 c, ," switch" 7 c, ," typedef" 5 c, ," union"
+ 8 c, ," unsigned" 4 c, ," void" 8 c, ," volatile"
+ 5 c, ," while" 0 c,
+
+: isKeyword? ( s -- f ) keywords sfind 0>= ;
+
+\ For symbol parsing, we exploit one particularity: all 2 chars symbols start
+\ with a symbol that is also a 1 char symbol and all 3 chars symbols begin with
+\ 2 chars that are also a 2 chars symbol.
+\ list of 1 char symbols
+create symbols1 ," +-*/~&<>=[](){}.%^?:;,"
+
+: isSym1? ( c -- f ) symbols1 22 [c]? 0>= ;
+
+\ list of 2 chars symbols
+create symbols2 ," <=>===!=&&||++---><<>>+=-=*=/=%=&=^=|="
+
+: isSym2? ( c1 c2 -- f )
+ A>r 19 >r symbols2 >A begin ( c1 c2 )
+ over Ac@+ = over Ac@+ = and if 2drop r~ r>A 1 exit then
+ next 2drop 0 r>A ;
+
+\ are c1/c2 either << or >>?
+: is<<>>? ( c1 c2 -- f )
+ dup '<' = over '>' = or rot> ( f1 c1 c2 ) = and ( f ) ;
alias in< cc<
0 value putback
@@ -12,25 +56,32 @@ create _ 6 c, ," AZaz__"
create _ 8 c, ," 09AZaz__"
: identifier? ( c -- f ) _ rmatch ;
-\ list of possible first chars for "special stuff"
-create special1st ," (){}!~+-*/<>=&|;," \ 17
-\ list of possible second chars for "special stuff"
-create special2nd ," =&|" \ 3
-
\ advance to the next non-whitespace and return the char encountered.
\ if end of stream is reached, c is 0
: tonws ( -- c ) 0 begin ( c )
drop _cc< dup dup EOF <= swap ws? not or until ( c )
dup EOF <= if drop 0 then ;
+: _writesym ( c3? c2? c1 len -- str )
+ 4 scratchallot dup >r ( c3? c2? c1 len a )
+ over >r c!+ ( c a ) begin c!+ next drop r> ( str ) ;
+
\ Returns the next token as a string or 0 when there's no more token to consume.
-: nextt ( -- tok-or-0 ) tonws dup if ( c )
+: nextt ( -- tok-or-0 )
+ tonws dup not if ( EOF ) exit then ( c )
+ dup isSym1? if ( c )
+ cc< 2dup isSym2? if ( c1 c2 )
+ 2dup is<<>>? if ( c1 c2 )
+ cc< dup '=' if ( c1 c2 '=' )
+ rot> swap 3 ( '=' c2 c1 len ) _writesym
+ else ( c1 c2 c3 )
+ to putback swap 2 ( c2 c1 len ) _writesym then
+ else swap 2 ( c2 c1 len ) _writesym then
+ else ( c1 c2 ) to putback 1 ( c1 len ) _writesym then
+ else ( c ) \ not a symbol
+ dup identifier? not if _err then
A>r LNSZ scratchallot >A A>r ( R:tok ) 0 Ac!+ ( len placeholder )
- dup identifier? if begin ( c )
- Ac!+ cc< dup identifier? not until to putback
- else \ special characters
- dup special1st 17 [c]? 0< if _err then
- Ac!+ cc<
- dup special2nd 3 [c]? 0< if to putback else Ac!+ then
- then
- r> ( buf ) A> over 1+ - ( tok len ) over c! r>A then ;
+ begin ( c )
+ Ac!+ cc< dup identifier? not until to putback
+ r> ( buf ) A> over 1+ - ( tok len ) over c! r>A
+ then ;
diff --git a/fs/sys/scratch.fs b/fs/sys/scratch.fs
@@ -14,7 +14,7 @@ scratch( value scratch>
: scratchallot ( n -- a )
scratch> over + scratch) >= if scratch( to scratch> then
- to+ scratch> scratch> ( a ) ;
+ scratch> swap to+ scratch> ( a ) ;
\ push a range to the scratchpad as a string
: []>str ( a u -- str )
dup 1+ scratchallot ( src u dst-1 ) >r dup r@ c!+ swap ( src dst u ) move r> ;