commit 180bb94f0f1e24cffbb5772abd8b2261a874e804
parent 8875a29ca72770d70422b8723737aad5428afdb1
Author: Virgil Dupras <hsoft@hardcoded.net>
Date: Sat, 5 Nov 2022 11:42:59 -0400
cc: change the way macro works
see doc/cc/usage.
While the "use the power of Forth directly" idea sounds great, when wanting to
add tooling for complex features such as parametrized macro expansion, I
couldn't come up with ways that are more straightforward than a CPP-like syntax.
All ideas I had seemed awkward to me, so I figured it was a winning move to stay
as close as possible to CPP, interface-wise.
The fundamental approach to macros is still the same, but the interface to it
changes slightly and is simplified. The idea is that we have a "#define" that
works mostly like the CPP version (with big caveats for not being text-based,
but AST-based), but we also have #forthdef for direct Forth interpreter access.
Diffstat:
11 files changed, 166 insertions(+), 131 deletions(-)
diff --git a/ROADMAP.md b/ROADMAP.md
@@ -14,6 +14,8 @@ consider complete:
* enum
* goto
* float
+* macro expansion in #define
+* #if/#else/#endif
* a few little ops here and there
* the check phase (anything that is currently understood by the parser is
compiled no matter how nonsensical).
diff --git a/fs/app/cos/cvm.c b/fs/app/cos/cvm.c
@@ -1,18 +1,20 @@
// This unit compiles without error, but it doesn't actually work yet.
-#[
- $10000 const MEMSIZE
- $fffa const SP_ADDR
- $ff00 const RS_ADDR
- $fe00 const SYSVARS
- \ Port for block reads. Each read or write has to be done in 5 IO writes:
- \ 1 - r/w. 1 for read, 2 for write.
- \ 2 - blkid MSB
- \ 3 - blkid LSB
- \ 4 - dest addr MSB
- \ 5 - dest addr LSB
- $03 const BLK_PORT
- 4 const BLKOP_CMD_SZ
-]#
+
+// Size in bytes of the VM memory array (COSVM.mem).
+#define MEMSIZE $10000
+// Values SP and RS are set to by COS_init(), ABORT() and QUIT().
+#define SP_ADDR $fffa
+#define RS_ADDR $ff00
+#define SYSVARS $fe00
+// TODO: allow constant expressions in macros to allow stuff like SYSVARS + $18
+#define SYSVARS_TOPTR $fe18
+/* Port for block reads. Each read or write has to be done in 5 IO writes:
+ * 1 - r/w. 1 for read, 2 for write.
+ * 2 - blkid MSB
+ * 3 - blkid LSB
+ * 4 - dest addr MSB
+ * 5 - dest addr LSB
+ */
+#define BLK_PORT $03
+#define BLKOP_CMD_SZ 4
typedef unsigned char byte;
typedef unsigned short word;
@@ -21,7 +23,7 @@ typedef void (*IOWR) (byte);
typedef void (*VMOP) ();
struct COSVM {
- byte mem[ #[ MEMSIZE ]# ];
+ byte mem[MEMSIZE];
word SP; /* parameter Stack Pointer */
word RS; /* Return Stack pointer */
word IP; /* Interpreter Pointer */
@@ -41,7 +43,7 @@ File *blkfp = NULL;
/* Stores blkop command. Bytes flow from left (byte 0) to right (byte 3)
* We know we have a full command when last byte is nonzero. After
* processing the cmd, we reset blkop to 0. */
-static byte blkop[ #[ BLKOP_CMD_SZ c]# ];
+static byte blkop[BLKOP_CMD_SZ];
/* Read single byte from I/O handler, if set. addr is a word only because of
Forth's cell size, but can't actually address more than a byte-full of ports.
@@ -79,7 +81,7 @@ static void iowr_blk(byte val)
if (rw) {
blkid = (word)blkop[2] << 8 | (word)blkop[1];
dest = (word)blkop[0] << 8 | (word)val;
- memset(blkop, 0, #[ BLKOP_CMD_SZ c]# );
+ memset(blkop, 0, BLKOP_CMD_SZ);
fseek(blkid*1024, blkfp);
if (rw==2) { /* write */
fwrite(&vm.mem[dest], 1024, blkfp);
@@ -135,8 +137,8 @@ static void lblxt() { pushRS(vm.IP); vm.IP = pop(); lblnext(); }
static void lbldoes() { vm.PC = pop(); push(vm.PC+2); vm.PC = gw(vm.PC); }
static void lblval() {
word a;
- if (vm.mem[ #[ SYSVARS $18 + c]# ]) { // TO?
- vm.mem[ #[ SYSVARS $18 + c]# ] = 0;
+ if (vm.mem[SYSVARS_TOPTR]) { // TO?
+ vm.mem[SYSVARS_TOPTR] = 0;
a = pop();
sw(a, pop());
} else {
@@ -197,10 +199,10 @@ static void DIVMOD() {
word b = pop(); word a = pop();
push(a % b); push(a / b);
}
-static void QUIT() { vm.RS = #[ RS_ADDR c]# ; }
-static void ABORT() { vm.SP = #[ SP_ADDR c]# ; }
-static void RCNT() { push((vm.RS - #[ RS_ADDR c]# ) / 2); }
-static void SCNT() { push((#[ SP_ADDR c]# - vm.SP) / 2); }
+// Reset the return stack pointer to its initial address.
+static void QUIT() { vm.RS = RS_ADDR; }
+// Reset the parameter stack pointer to its initial address.
+static void ABORT() { vm.SP = SP_ADDR; }
+// Push the distance between RS and RS_ADDR, counted in 2-byte units.
+static void RCNT() { push((vm.RS - RS_ADDR) / 2); }
+// Push the distance between SP_ADDR and SP, counted in 2-byte units.
+static void SCNT() { push((SP_ADDR - vm.SP) / 2); }
static void BYE() { vm.running = 0; }
static void EXIT() { vm.IP = popRS(); }
static void CDUP() { word a = peek(); if (a) push(a); }
@@ -209,8 +211,8 @@ static void LIT16() { push(gw(vm.IP)); vm.IP+=2; }
static void LT() {
word b = pop(); word a = pop(); push(a<b); }
-#[ 67 const OPCNT ]#
-static VMOP ops[ #[ OPCNT ]# ] = {
+#define OPCNT 67
+static VMOP ops[OPCNT] = {
DUP, DROP, PUSHi, PUSHii, SWAP, OVER, ROT, lblnext,
CBR, NEXT, CALLi, JMPi, lblxt, EXIT, CDUP, LIT8,
LIT16, JMPii, lbldoes, lblval, NULL, EXECUTE, NULL, NULL,
@@ -223,7 +225,7 @@ static VMOP ops[ #[ OPCNT ]# ] = {
};
static void opexec(byte op) {
- if (op < #[ OPCNT c]# ) {
+ if (op < OPCNT) {
ops[op]();
} else {
fprintf(op, vm.PC, "Out of bounds op %w. PC: %w\n", ConsoleOut());
@@ -233,14 +235,14 @@ static void opexec(byte op) {
void COS_init()
{
- memset(blkop, 0, #[ BLKOP_CMD_SZ c]# );
- vm.SP = #[ SP_ADDR c]# ;
- vm.RS = #[ RS_ADDR c]# ;
+ memset(blkop, 0, BLKOP_CMD_SZ);
+ vm.SP = SP_ADDR;
+ vm.RS = RS_ADDR;
vm.minSP = vm.SP ;
vm.maxRS = vm.RS ;
memset(vm.iord, 0, $400);
memset(vm.iowr, 0, $400);
- vm.iowr[ #[ BLK_PORT c]# ] = iowr_blk;
+ vm.iowr[BLK_PORT] = iowr_blk;
vm.PC = 0;
vm.running = 1;
}
diff --git a/fs/cc/ast.fs b/fs/cc/ast.fs
@@ -6,7 +6,7 @@
?f<< lib/wordtbl.fs
?f<< cc/tok.fs
?f<< cc/tree.fs
-?f<< cc/macro.fs
+?f<< cc/macrolo.fs
?f<< cc/type.fs
\ This arena contains AST structures for the unit being currently parsed.
@@ -202,10 +202,6 @@ extends ASTNode struct[ StrLit
: :new AST_STRLIT ASTNode :new ;
]struct
-\ Macro shortcuts
-: c]# Constant :new ]# ;
-: i]# Ident :new ]# ;
-
ASTIDCNT stringlist astidnames
"declare" "unit" "function" "return" "constant" "stmts" "unused" "ident"
"unaryop" "postop" "binop" "list" "if" "str" "call" "for" "push" "pop" "break"
@@ -327,13 +323,13 @@ alias noop parseFactor ( tok -- node ) \ forward declaration
nextt '(' expectChar nextt ')' expectChar
AST_PSPOP ASTNode :new parsePostfixOp
endof
- S" #[" of s= #[1 ( node ) parsePostfixOp endof
S" NULL" of s= 0 Constant :new endof
of uopid ( opid )
UnaryOp :new ( opnode )
nextt parseFactor over Node :add ( opnode ) endof
- of isIdent? ( ) \ lvalue or FunCall
- r@ Ident :new ( inode ) parsePostfixOp
+ of isIdent? ( ) \ lvalue, FunCall or macro
+ r@ findMacro ?dup if Macro ast else r@ Ident :new then ( node )
+ parsePostfixOp
endof
( case else ) \ Constant
r@ parse if Constant :new else _err then
@@ -463,7 +459,9 @@ current to parseStatement
\\ Parse the next element in a Unit node
: parseUnit ( tok -- node-or-0 )
- dup S" #[" s= if drop #[0 0 exit then
+ dup '#' isChar? if
+ drop nextt ['] MacroOps structdict' find ?dup _assert
+ execute 0 exit then
0 to curstatic
dup S" static" s= if drop nextt 1 to curstatic then
parseType _assert ( type )
diff --git a/fs/cc/cc.fs b/fs/cc/cc.fs
@@ -2,6 +2,7 @@
?f<< /cc/vm/vm.fs
?f<< /cc/ttr.fs
?f<< /cc/gen.fs
+?f<< /cc/macro.fs
: _err ( -- ) abort" CC error" ;
: _assert ( f -- ) not if _err then ;
@@ -9,15 +10,15 @@
\ Compiles input coming from the stdin alias and writes the
\ result to here. Aborts on error.
: cc1, ( -- )
- cctypes$ begin ( )
+ cctypes$ ccast$ cmacro$ begin ( )
nextt? ?dup while parseUnit ( node-or-0 ) ?dup if
_ccdebug if dup printast nl> then ( node )
dup trnode _ccdebug if dup printast nl> then ( node )
- gennode ccast$ then repeat ;
+ gennode then repeat ;
: :c
nextt parseUnit ?dup if
_ccdebug if dup printast nl> then
- dup trnode _ccdebug if dup printast nl> then gennode ccast$ then ;
+ dup trnode _ccdebug if dup printast nl> then gennode then ;
: cc<< ( -- ) ['] cc1, word with-stdin-file ;
diff --git a/fs/cc/macro.fs b/fs/cc/macro.fs
@@ -1,17 +1,18 @@
-\ CC macros
-?f<< /cc/tree.fs
+\ CC macros (high part)
+?f<< /cc/macrolo.fs
+?f<< /cc/ast.fs
-\ Macros. See doc/cc
--1 value _pslvl \ PS level at last #[
+\ Local error helpers: _err aborts with "macro error"; _assert aborts through
+\ _err when its flag is zero.
+: _err ( -- ) abort" macro error" ;
+: _assert ( f -- ) not if _err then ;
-: _ begin word runword _pslvl 0< until ;
-: runmacro ['] _ with-stdin< ;
+\ Reopen Macro to supply the real :asnum, which uses AST words (Node, Constant).
+struct+[ Macro
+ \ The macro's AST must be a single constant node; yield that node's value.
+ : _asnum ( self -- n )
+ ast dup Node id AST_CONSTANT = _assert
+ Constant value ;
+ current to :asnum
+\ NOTE(review): no "]struct" is visible closing this "struct+[ Macro" before
+\ "struct+[ MacroOps" opens below -- confirm this is intended.
-: #[0 scnt to _pslvl runmacro ;
-: #[1 scnt 1+ to _pslvl runmacro ;
-
-: ]#
- scnt _pslvl - ?dup if abort" PS imbalance during macros" then
- -1 to _pslvl ;
-
-: +]# over Node :add ]# ;
+struct+[ MacroOps
+ \ "#define NAME expr": read NAME, parse the C expression that follows it and
+ \ register the resulting AST as the macro NAME.
+ : define ( -- )
+ nextt nextt parseExpression ( name exprnode )
+ Macro :new swap addMacro ;
+]struct
diff --git a/fs/cc/macrolo.fs b/fs/cc/macrolo.fs
@@ -0,0 +1,37 @@
+\ CC macros (low part)
+?f<< /lib/arena.fs
+?f<< /cc/tree.fs
+
+\ Holds defined macros
+Arena :new structbind Arena _arena
+
+\ A Macro is what a macro name expands to. Instances are created with
+\ "Macro :new" inside this file's _arena.
+struct[ Macro
+ \ For now, a macro is only a link to an AST, but when parameters are
+ \ introduced, it will be more.
+ sfield ast
+ \ Presumably writes "ast" into a CELLSZ-sized chunk of _arena and yields that
+ \ chunk's address -- confirm against the Arena :[ / :] contract.
+ : :new ( ast -- macro ) CELLSZ _arena :[ , _arena :] ;
+ \ Yield the macro's value as a number. Only a placeholder bound to _err here;
+ \ the real implementation is assigned in cc/macro.fs ("current to :asnum").
+ alias _err :asnum ( self -- n ) \ forward declaration
+]struct
+
+\ Dictionary of the macros defined so far, keyed by macro name.
+create macros 0 , 0 c, \ this is a dict link
+\ Register "macro" under "name": create a dict entry for "name" in "macros" and
+\ write the macro pointer right after it, all inside _arena.
+: addMacro ( macro name -- )
+ dup c@ macros rot> ( macro 'dict name len )
+ \ NOTE(review): the allocation is sized len+ENTRYSZ+8; the extra 8 presumably
+ \ covers the macro cell plus some slack -- confirm.
+ ENTRYSZ + 8 + _arena :[ entry , _arena :] drop ;
+\ Look "name" up in "macros". When found, fetch the cell at the found address
+\ (the macro pointer written by addMacro); otherwise yield 0.
+: findMacro ( name -- macro-or-0 ) macros find dup if @ then ;
+
+\ Forget all macros: reset the arena and zero the dict link.
+: cmacro$ _arena :reset 0 macros ! ;
+
+\ Words in this struct are the "#" directives: parseUnit (cc/ast.fs) looks the
+\ token following "#" up in this struct's dictionary and executes it.
+struct[ MacroOps
+ -1 value pslvl \ PS level at last runmacro
+ \ Read and run Forth words one at a time until pslvl goes negative, which is
+ \ what "]#" does when it closes the macro.
+ : _ begin word runword pslvl 0< until ;
+ \ Record the PS level expected at "]#" (current level plus one, for the
+ \ resulting node), then interpret Forth words through "_".
+ : runmacro scnt 1+ to pslvl ['] _ with-stdin< ;
+
+ \ "#forthdef NAME <forth code> ]#": the Forth code must leave one AST node on
+ \ PS; that node becomes the macro registered under NAME.
+ : forthdef
+ nextt ( name ) runmacro ( name node )
+ Macro :new swap addMacro ;
+]struct
+
+\ Close a Forth macro. Abort if the PS level differs from the one recorded by
+\ runmacro, then set pslvl back to -1, which ends the interpret loop.
+: ]#
+ scnt MacroOps pslvl - ?dup if abort" PS imbalance during macros" then
+ -1 to MacroOps pslvl ;
+
diff --git a/fs/cc/tok.fs b/fs/cc/tok.fs
@@ -29,10 +29,10 @@ create symbols1 ," +-*/~&<>=[](){}.%^?:;,|^#\"!"
: isSym1? ( c -- f ) symbols1 27 [c]? 0>= ;
\ list of 2 chars symbols
-create symbols2 ," <=>===!=&&||++---><<>>+=-=*=/=%=&=^=|=/**///#["
+create symbols2 ," <=>===!=&&||++---><<>>+=-=*=/=%=&=^=|=/**///"
: isSym2? ( c1 c2 -- f )
- 8 lshift or symbols2 >r 23 >r begin ( c1+c2 )
+ 8 lshift or symbols2 >r 22 >r begin ( c1+c2 )
dup 16b to@+ V1 = if drop rdrop rdrop 1 exit then
next drop rdrop 0 ;
@@ -63,8 +63,12 @@ create _ 10 c, ," 09AZaz__$$"
4 _pad :allot dup >r ( c3? c2? c1 len a )
over >r c!+ ( c a ) begin c!+ next drop r> ( str ) ;
+\ if not 0, next nextt call will fetch token from here
+0 value nexttputback
+
\ Returns the next token as a string or 0 when there's no more token to consume.
: nextt? ( -- tok-or-0 )
+ nexttputback ?dup if 0 to nexttputback exit then
tonws dup not if ( EOF ) exit then ( c ) case
of isSym1? ( )
r@ stdin 2dup isSym2? if ( c1 c2 )
@@ -96,14 +100,7 @@ create _ 10 c, ," 09AZaz__$$"
_err
endcase ;
-\ if not 0, next nextt call will fetch token from here
-0 value nexttputback
-
-\ Fetch the next token, aborting if there's none. Also, apply the "putback"
-\ logic.
-: nextt ( -- tok )
- nexttputback ?dup if 0 to nexttputback exit then
- nextt? ?dup not if abort" expecting token!" then ;
+\ Fetch the next token, aborting if there's none. The "putback" logic is
+\ handled by nextt?.
+: nextt ( -- tok ) nextt? dup _assert ;
: expectConst ( tok -- n ) dup parse if nip else _err then ;
: isIdent? ( tok -- f )
diff --git a/fs/cc/type.fs b/fs/cc/type.fs
@@ -1,8 +1,8 @@
\ C compiler types
?f<< /lib/str.fs
-?f<< lib/arena.fs
+?f<< /lib/arena.fs
?f<< /cc/tok.fs
-?f<< /cc/macro.fs
+?f<< /cc/macrolo.fs
\ This arena is for local typedefs for a single unit.
Arena :new structbind Arena _arena
@@ -213,7 +213,9 @@ alias _err parseDeclarator ( type -- ctype ) \ forward declaration
: _post ( ctype -- ctype )
begin ( ctype ) nextt case
'[' of isChar?^
- nextt dup S" #[" s= if drop #[1 else parse _assert then
+ nextt dup isIdent? if
+ findMacro ?dup _assert Macro :asnum
+ else parse _assert then ( ctype nbelem )
nextt ']' expectChar ( ctype nbelem )
over to CType nbelem endof
'(' of isChar?^
diff --git a/fs/doc/cc/usage.txt b/fs/doc/cc/usage.txt
@@ -23,7 +23,8 @@ emptying of local buffers occurs at the beginning of "cc<<".
Writing for DuskCC is the same as writing for another ANSI C compiler, but there
are a few differences:
-* no C preprocessor, the preprocessor is Forth itself, through macros.
+* no C preprocessor. We have "#define" (and more), but it's a macro system that
+ works with the AST directly.
* no 64bit types
* no long, redundant with int
* no double, float is always 32b
@@ -70,55 +71,37 @@ int mymax(int a, int b) {
## Macros
-Macros in Dusk's CC are simply markers inside which arbitrary Forth code is
-interpreted. Those markers are #[ and ]#. Those markers are executed during the
-AST generation phase, which means that you can arbitrarily modify the AST at any
-point during parsing.
-
-A common case with C macros is the definition and reuse of constants. Here's how
-it looks:
-
-#[ 42 const FOOBAR ]#
-int foo() {
- return #[ FOOBAR c]# ; // c]# means "Constant :new ]#"
-}
-
-Because macros can modify the AST, they can only be inserted at certain
-designated places, known as "hash (#) bars". These are:
-
-* In a Unit context (in between functions)
-* Replacing a "factor" AST element, which are quite numerous. Some of them:
- * A constant
- * A Lvalue (AST_IDENT)
- * A function call
- * An expression
-* Inside an array length [] definition.
-
-In any other place, "#[" will be a parse error.
-
-In the first case, the signature of the macro is ( node -- node ). By using PS
-TOS, you can add a node to the active Unit.
-
-The second case has a signature ( -- node ), that is, you are expected to put a
-node that is in the context you're putting it. It will then be added wherever
-the factor was expected. It will even have postfix AST rules applied to it,
-which opens nice doors. For example, if your macro returns a simple AST_IDENT,
-then right after the macro you can add parens to make it into a function call.
-
-The third case has a signature ( -- number ), with "number" being the number of
-elements that the array being defined will have.
-
-When a macro begins, PS level is recorded. If it doesn't end with the correct
-PS size, an error is raised.
-
-Macro opening symbol, "#[", obeys C tokenization rules, but the closing one,
-"]#", obeys Forth tokenization rules, so it has to be followed by a space.
-
-There are "shortcut words" for closing a macro:
-
-c]# --> Constant :new ]#
-i]# --> Ident :new ]#
-+]# --> over Node :add ]#
+Macros are predefined AST fragments that are inserted in the tree when we refer
+to them (parametrizable fragments are a planned feature, but not implemented
+yet). You can define macros with a "#" family of words which are only valid at
+the top level of the unit source code. Example usage:
+
+ #define MYCONST 42
+ int foo() { return MYCONST; }
+
+When "#define" is called, the C expression following it is parsed and the
+resulting AST is saved and linked to the MYCONST name. Then, during subsequent
+expression parsing, whenever MYCONST is encountered, that AST is inserted.
+
+Macros have precedence over other identifiers. A variable named MYCONST would be
+shadowed by this macro.
+
+Macro references can be placed anywhere an expression can be, as well as inside
+"[]" brackets in array declarations. In the latter case, however, you can only
+use expressions that resolve to a constant number.
+
+Another way to define macros is "#forthdef". This word puts the C compiler in
+Forth interpret mode, where you can write arbitrary Forth code. You end this
+mode with "]#", which takes you back to CC mode. During that time, you are
+expected to put an AST node on PS, which is the node to be inserted when the
+macro is invoked. One use of this is to refer to Forth words with names that
+are not valid C identifiers:
+
+ #forthdef INRANGE S" =><=" Ident :new ]#
+ int isinrange(int n, int l, int h) {
+ INRANGE(n, l, h);
+ return pspop();
+ }
## Linkage and persistence
diff --git a/fs/doc/design/simple.txt b/fs/doc/design/simple.txt
@@ -38,13 +38,24 @@ That is why Forth's approach to simplicity is revolutionary, because it removes
a blindfold.
A second simplicity factor is pre-processing. The C pre-processor is a very
-important part of the compiler. Without it, C is much less powerful. Tcc
-dedicates 3900 lines of code to the pre-processor. In Dusk CC, it's 16 lines.
-Why? because the pre-processor is Forth itself. Sure, it's not quite the CPP
-we're used to, but it's in fact much more powerful and extensible. You have the
-whole system available to you in those macros. In this case, Forth is simpler by
-cleverly piggy-backing on existing logic in ways we're not used to with our UNIX
-blindfolds.
+important part of the compiler. Without it, C is much less powerful. DuskCC
+doesn't have a pre-processor, but it has a macro system that provides #define
+and friends. This macro system isn't complete yet, but at this moment, it's
+about 60 lines of code. When it manages to be powerful enough to be considered
+an adequate replacement for the C pre-processor, I don't think it will be over
+300 lines of code.
+
+Why this difference? The two approaches are not functionally equivalent (one is
+a textual macro processor while the other manipulates the C AST directly and
+works within the regular C tokenizer and parser). However, the C
+pre-processor forces us to reimplement a big part of tokenizing and parsing
+rules, but in a slightly different manner. When you look at tccpp.c, you can see
+the code is mostly about tokenizing and parsing. It's also striking to see the
+amount of boilerplate that you need when you're processing text and spitting
+text.
+
+DuskCC sidesteps that complexity by piggy-backing on its existing tokenizer and
+parser and manipulating the AST directly.
A third simplicity factor is parsing boilerplate. Tcc's assembler's input is
text formatted in GNU assembler format. This parsing boilerplate is a
diff --git a/fs/tests/cc/test.c b/fs/tests/cc/test.c
@@ -1,10 +1,10 @@
/* test a few simple C constructs */
-#[ 42 const MYCONST ]#
+#define MYCONST 42
// just return a constant
short retconst() {
- return #[ MYCONST c]# ;
+ return MYCONST;
}
short variables() {
short foo = 40, _bar = 2;
@@ -127,8 +127,8 @@ int array() {
}
static int global1 = 1234;
-#[ 3 const GLOB2SZ ]#
-static int global2[ #[ GLOB2SZ ]# ] = {4, 5, 6};
+#define GLOB2SZ 3
+static int global2[GLOB2SZ] = {4, 5, 6};
int global() {
return global1;
@@ -156,8 +156,9 @@ void helloworld() {
// Here, we see the power of macros in action. Let's say we want to call the
// system word "=><=". It's not a valid C identifier, right? ok, but what about
// using macros to trick the parser into accepting it?
+#forthdef INRANGE S" =><=" Ident :new ]#
int isinrange(int n, int l, int h) {
- #[ S" =><=" i]# (n, l, h);
+ INRANGE(n, l, h);
return pspop();
}
int forsum(int n) {