duskos

dusk os fork
git clone git://git.alexwennerberg.com/duskos
Log | Files | Refs | README | LICENSE

commit 45cbd139772d47b5c7d128cbc0cd6243b9581a1c
parent 099992839e3d04488bd9815b8f450677432c2dc1
Author: Virgil Dupras <hsoft@hardcoded.net>
Date:   Thu, 16 Jun 2022 13:43:51 -0400

cc/tok: properly tokenize symbols

Previously, it would erronously tokenize an ambiguous stream like "a+++b"
(the correct tokenization being "a ++ + b")

Diffstat:
Mfs/cc/tok.fs | 81++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
Mfs/sys/scratch.fs | 2+-
2 files changed, 67 insertions(+), 16 deletions(-)

diff --git a/fs/cc/tok.fs b/fs/cc/tok.fs @@ -1,5 +1,49 @@ \ C compiler tokenization -\ Throughout the CC code, "tok" means a string representing a token. +\ Throughout the CC code, "tok" means a Token structure, which is: + +\ TODO: actually implement this. As of now, tok is the string that was read. +\ 1b token type id +\ xb string having been read + +0 const TOK_KEYWORD +1 const TOK_IDENTIFIER +2 const TOK_CONSTANT +3 const TOK_STRLIT +4 const TOK_SYMBOL + +create keywords 5 c, ," break" 4 c, ," case" 4 c, ," char" + 5 c, ," const" 8 c, ," continue" 7 c, ," default" + 2 c, ," do" 6 c, ," double" 4 c, ," else" + 4 c, ," enum" 6 c, ," extern" 5 c, ," float" + 3 c, ," for" 4 c, ," goto" 2 c, ," if" + 6 c, ," inline" 3 c, ," int" 8 c, ," register" + 6 c, ," return" 5 c, ," short" 6 c, ," signed" + 6 c, ," sizeof" 6 c, ," static" 6 c, ," struct" + 6 c, ," switch" 7 c, ," typedef" 5 c, ," union" + 8 c, ," unsigned" 4 c, ," void" 8 c, ," volatile" + 5 c, ," while" 0 c, + +: isKeyword? ( s -- f ) keywords sfind 0>= ; + +\ For symbol parsing, we exploit one particularity: all 2 chars symbols start +\ with a symbol that is also a 1 char symbol and all 3 chars symbols begin with +\ 2 chars that are also a 2 chars symbol. +\ list of 1 char symbols +create symbols1 ," +-*/~&<>=[](){}.%^?:;," + +: isSym1? ( c -- f ) symbols1 22 [c]? 0>= ; + +\ list of 2 chars symbols +create symbols2 ," <=>===!=&&||++---><<>>+=-=*=/=%=&=^=|=" + +: isSym2? ( c1 c2 -- f ) + A>r 19 >r symbols2 >A begin ( c1 c2 ) + over Ac@+ = over Ac@+ = and if 2drop r~ r>A 1 exit then + next 2drop 0 r>A ; + +\ are c1/c2 either << or >>? +: is<<>>? ( c1 c2 -- f ) + dup '<' = over '>' = or rot> ( f1 c1 c2 ) = and ( f ) ; alias in< cc< 0 value putback @@ -12,25 +56,32 @@ create _ 6 c, ," AZaz__" create _ 8 c, ," 09AZaz__" : identifier? ( c -- f ) _ rmatch ; -\ list of possible first chars for "special stuff" -create special1st ," (){}!~+-*/<>=&|;," \ 17 -\ list of possible second chars for "special stuff" -create special2nd ," =&|" \ 3 - \ advance to the next non-whitespace and return the char encountered. \ if end of stream is reached, c is 0 : tonws ( -- c ) 0 begin ( c ) drop _cc< dup dup EOF <= swap ws? not or until ( c ) dup EOF <= if drop 0 then ; +: _writesym ( c3? c2? c1 len -- str ) + 4 scratchallot dup >r ( c3? c2? c1 len a ) + over >r c!+ ( c a ) begin c!+ next drop r> ( str ) ; + \ Returns the next token as a string or 0 when there's no more token to consume. -: nextt ( -- tok-or-0 ) tonws dup if ( c ) +: nextt ( -- tok-or-0 ) + tonws dup not if ( EOF ) exit then ( c ) + dup isSym1? if ( c ) + cc< 2dup isSym2? if ( c1 c2 ) + 2dup is<<>>? if ( c1 c2 ) + cc< dup '=' if ( c1 c2 '=' ) + rot> swap 3 ( '=' c2 c1 len ) _writesym + else ( c1 c2 c3 ) + to putback swap 2 ( c2 c1 len ) _writesym then + else swap 2 ( c2 c1 len ) _writesym then + else ( c1 c2 ) to putback 1 ( c1 len ) _writesym then + else ( c ) \ not a symbol + dup identifier? not if _err then A>r LNSZ scratchallot >A A>r ( R:tok ) 0 Ac!+ ( len placeholder ) - dup identifier? if begin ( c ) - Ac!+ cc< dup identifier? not until to putback - else \ special characters - dup special1st 17 [c]? 0< if _err then - Ac!+ cc< - dup special2nd 3 [c]? 0< if to putback else Ac!+ then - then - r> ( buf ) A> over 1+ - ( tok len ) over c! r>A then ; + begin ( c ) + Ac!+ cc< dup identifier? not until to putback + r> ( buf ) A> over 1+ - ( tok len ) over c! r>A + then ; diff --git a/fs/sys/scratch.fs b/fs/sys/scratch.fs @@ -14,7 +14,7 @@ scratch( value scratch> : scratchallot ( n -- a ) scratch> over + scratch) >= if scratch( to scratch> then - to+ scratch> scratch> ( a ) ; + scratch> swap to+ scratch> ( a ) ; \ push a range to the scratchpad as a string : []>str ( a u -- str ) dup 1+ scratchallot ( src u dst-1 ) >r dup r@ c!+ swap ( src dst u ) move r> ;