cleaned up code, especially in the tokenizer - Aria - A low-level systems programming language

commit 283ecfcc4c23b0196e824239c103fae99b67ef29
parent 049ae43efd3e365198d94cbc8e92688372e468c0
Author: m21c  <ho*******@gmail.com>
Date:   Sat, 15 Jan 2022 22:33:34 +0100

cleaned up code, especially in the tokenizer

Diffstat:
M compiler.c  | 233 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------

1 file changed, 139 insertions(+), 94 deletions(-)
diff --git a/compiler.c b/compiler.c
@@ -951,13 +951,22 @@ error(SrcLoc *loc, const char *fmt, ...)
 #define nextindent(source, indent) \
 	((indent) + (source)->tabwidth - ((indent) % (source)->tabwidth))
 
+#define peekchar(source) \
+	((source)->line[(source)->currloc.column])
+
+#define peeknextchar(source) \
+	((source)->line[(source)->currloc.column + 1])
+
+#define nextchar(source) \
+	((source)->line[++(source)->currloc.column])
+
 static int
-tokenizealphanumeric(Source *source, register int c0)
+tokenizealphanumeric(Source *source, register int ch)
 {
 	int keyword;
 
-	while (isalnum(c0) || c0 == '_')
-		c0 = source->line[++source->currloc.column];
+	while (isalnum(ch) || ch == '_')
+		ch = nextchar(source);
 
 	keyword = getkeyword(
 		source->line + source->tok.loc.column,
@@ -1009,6 +1018,7 @@ suffixfloattype(Source *source, const char *end)
 
 		if (end[1])
 			goto errorfloat;
+
 	} else if (!mystrcasecmp(end, "f32") || !mystrcasecmp(end, "r32")) {
 		ty = prim + TF32;
 
@@ -1035,10 +1045,12 @@ suffixinttype(Source *source, const char *end)
 	case 's': case 'S': case 'i': case 'I':
 		typeid = 0;
 
+		/* fallthrough */
 	case 'u': case 'U':
 		++end;
 		if (*end == 0) {
 			return prim + (typeid + TINFER);
+
 		} else if (*end == '8') {
 			typeid += TS8;
 
@@ -1046,19 +1058,25 @@ suffixinttype(Source *source, const char *end)
 				goto errorint;
 
 			return prim + typeid;
+
 		} else if (!strcmp(end, "16")) {
 			return prim + (typeid + TS16);
+
 		} else if (!strcmp(end, "32")) {
 			return prim + (typeid + TS32);
+
 		} else if (!strcmp(end, "64")) {
 			return prim + (typeid + TS64);
+
 		} else if (!mystrcasecmp(end, "sz")) {
 			return prim + (typeid + TSSIZE);
 		}
 
+		/* fallthrough */
 	default:
 		if (!mystrcasecmp(end, "ll")) {
 			return prim + (typeid + TLLONG);
+
 		} else if (*end == 'l' || *end == 'L') {
 			typeid += TLONG;
 
@@ -1075,30 +1093,30 @@ errorint:
 }
 
 static int
-tokenizenumber(Source *source, register int c0)
+tokenizenumber(Source *source, register int ch)
 {
-	int l = c0, t = source->line[source->currloc.column+1], i, j;
+	int l = ch, t = peeknextchar(source), i, j;
 	bool hasdec = false, hasexp = false;
 	char *end;
 
 advancenum:
-	while (isalnum(c0) || c0 == '_' || (c0 == '.' &&
-	       source->line[source->currloc.column+1] != '.' && !hasdec))
+	while (isalnum(ch) || ch == '_' ||
+	       (ch == '.' && peeknextchar(source) != '.' && !hasdec))
 	{
-		if (c0 != '_')
-			l = c0;
-		if (c0 == '.')
+		if (ch != '_')
+			l = ch;
+		if (ch == '.')
 			hasdec = true;
 
-		c0 = source->line[++source->currloc.column];
+		ch = nextchar(source);
 	}
 
-	if (hasdec && !hasexp && (c0 == '+' || c0 == '-')) {
+	if (hasdec && !hasexp && (ch == '+' || ch == '-')) {
 		t = tolower(t);
 		l = tolower(l);
 
 		if ((l == 'e' && t != 'x') || (l == 'p' && t == 'x')) {
-			c0 = source->line[++source->currloc.column];
+			ch = nextchar(source);
 			hasexp = true;
 
 			goto advancenum;
@@ -1134,6 +1152,7 @@ advancenum:
 	{
 		source->tok.u.d = strtod(source->stringbuf, &end);
 		source->tok.type = suffixfloattype(source, end);
+
 	} else {
 		if (mystrncasecmp(source->stringbuf, "0b", 2) == 0) {
 			source->tok.u.u = strtoull(
@@ -1156,41 +1175,41 @@ advancenum:
 }
 
 static int
-tokenizestring(Source *source, register int c0)
+tokenizestring(Source *source, register int ch)
 {
-	int delim = c0, j;
+	int delim = ch, j;
 
-	c0 = source->line[++source->currloc.column];
+	ch = nextchar(source);
 	source->tok.loc.column = source->currloc.column;
 
 	j = source->currloc.column;
-	while (c0 != delim && c0 != 0) {
-		if (c0 == '\\') {
-			c0 = source->line[++source->currloc.column];
+	while (ch != delim && ch != 0) {
+		if (ch == '\\') {
+			ch = nextchar(source);
 
-			switch (c0) {
+			switch (ch) {
 			case '\\':
-				c0 = '\\';
+				ch = '\\';
 				break;
 
 			case 'n':
-				c0 = '\n';
+				ch = '\n';
 				break;
 
 			case 'r':
-				c0 = '\r';
+				ch = '\r';
 				break;
 
 			case 't':
-				c0 = '\t';
+				ch = '\t';
 				break;
 
 			case '\'':
-				c0 = '\'';
+				ch = '\'';
 				break;
 
 			case '"':
-				c0 = '"';
+				ch = '"';
 				break;
 
 			/* TODO(m21c): read more escape sequences */
@@ -1199,18 +1218,18 @@ tokenizestring(Source *source, register int c0)
 
 			default:
 				error(&source->currloc,
-					"invalid escape sequence '\\%c'", c0);
+					"invalid escape sequence '\\%c'", ch);
 			}
 		}
 
-		source->line[j++] = c0;
-		c0 = source->line[++source->currloc.column];
+		source->line[j++] = ch;
+		ch = nextchar(source);
 	}
 
 	++source->currloc.column;
 	source->line[j++] = 0;
 
-	if (c0 == 0) {
+	if (ch == 0) {
 	stringeol:
 		error(&source->currloc, "unexpected end-of-line");
 
@@ -1231,7 +1250,7 @@ tokenizestring(Source *source, register int c0)
 static int
 gettok(Source *source)
 {
-	register int c0 = (uchar) source->line[source->currloc.column];
+	register int ch = (uchar) peekchar(source);
 	static bool hasnewline = false;
 
 	source->lastkind = source->tok.kind;
@@ -1243,17 +1262,18 @@ skipwhite:
 			return source->tok.kind = 0;
 		}
 
-		c0 = source->line[(source->currloc.column = 0)];
+		source->currloc.column = 0;
+		ch = peekchar(source);
 	}
 
 	if (source->currloc.column) {
-		while (isspace(c0))
-			c0 = source->line[++source->currloc.column];
+		while (isspace(ch))
+			ch = nextchar(source);
 
 	} else {
 		source->lastindent = 0;
-		while (isspace(c0)) {
-			if (c0 == '\t') {
+		while (isspace(ch)) {
+			if (ch == '\t') {
 				source->lastindent = nextindent(
 					source,
 					source->lastindent
@@ -1262,7 +1282,7 @@ skipwhite:
 				++source->lastindent;
 			}
 
-			c0 = source->line[++source->currloc.column];
+			ch = nextchar(source);
 		}
 	}
 
@@ -1273,7 +1293,7 @@ skipwhite:
 	source->tok.loc.column = source->currloc.column;
 
 	/* get line */
-	if (!c0 || c0 == '#') {
+	if (!ch || ch == '#') {
 		if (hasnewline) {
 			goto skipwhite;
 		} else {
@@ -1285,109 +1305,107 @@ skipwhite:
 	hasnewline = false;
 
 	/* identifier or keyword */
-	if (isalpha(c0) || c0 == '_') {
-		return tokenizealphanumeric(source, c0);
-	}
+	if (isalpha(ch) || ch == '_')
+		return tokenizealphanumeric(source, ch);
 
 	/* number literal */
-	if (isdigit(c0) || (c0 == '.' &&
-	    isdigit(source->line[source->currloc.column+1])))
-	{
-		return tokenizenumber(source, c0);
-	}
+	if (isdigit(ch) || (ch == '.' && isdigit(peeknextchar(source))))
+		return tokenizenumber(source, ch);
 
 	/* string & character-literal */
-	if (c0 == '"' || c0 == '\'') {
-		return tokenizestring(source, c0);
-	}
+	if (ch == '"' || ch == '\'')
+		return tokenizestring(source, ch);
 
 	/* delimiters */
-	switch (c0) {
+	switch (ch) {
 	case ',': case ';': case '@': case ':':
 	case '{': case '}':
 	case ']': case '[':
 	case '(': case ')':
 		++source->currloc.column;
-		return source->tok.kind = c0;
+		return source->tok.kind = ch;
 	}
 
 	/* operators */
 #define select(ch, then, otherwise) ( \
-		source->line[source->currloc.column] == (ch) ? \
+		peekchar(source) == (ch) ? \
 		++source->currloc.column, (then) : \
 		(otherwise) \
 	)
-	switch (source->line[source->currloc.column++]) {
+
+	++source->currloc.column;
+	switch (ch) {
 	case '.':
 		/* tok.kind = select('.', ORANGE, ODISP); */
-		c0 = ODISP;
-		goto joinop;
+		ch = ODISP;
+		break;
 
 	case '*':
-		c0 = select('=', OMULA, OMUL);
-		goto joinop;
+		ch = select('=', OMULA, OMUL);
+		break;
 
 	case '/':
-		c0 = select('=', ODIVA, ODIV);
-		goto joinop;
+		ch = select('=', ODIVA, ODIV);
+		break;
 
 	case '%':
-		c0 = select('=', OMODA, OMOD);
-		goto joinop;
+		ch = select('=', OMODA, OMOD);
+		break;
 
 	case '<':
-		c0 = select('=', OLEQ,
+		ch = select('=', OLEQ,
 			select('<',
 				select('=', OLSHA, OLSH),
 			OLET));
-		goto joinop;
+		break;
 
 	case '>':
-		c0 = select('=', OGEQ,
+		ch = select('=', OGEQ,
 			select('>',
 				select('>',
 					select('=', OARSHA, OARSH),
 					select('=', ORSHA, ORSH)),
 				OGRT));
-		goto joinop;
+		break;
 
 	case '&':
-		c0 = select('=', OANDA, select('&', OLAND, OBAND));
-		goto joinop;
+		ch = select('=', OANDA, select('&', OLAND, OBAND));
+		break;
 
 	case '+':
-		c0 = select('=', OADDA, select('+', OSUFINC, OADD));
-		goto joinop;
+		ch = select('=', OADDA, select('+', OSUFINC, OADD));
+		break;
 
 	case '-':
-		c0 = select('=', OSUBA, select('-', OSUFDEC, OSUB));
-		goto joinop;
+		ch = select('=', OSUBA, select('-', OSUFDEC, OSUB));
+		break;
 
 	case '|':
-		c0 = select('=', OORA, select('|', OLOR, OBOR));
-		goto joinop;
+		ch = select('=', OORA, select('|', OLOR, OBOR));
+		break;
 
 	case '^':
-		c0 = select('=', OXORA, OXOR);
-		goto joinop;
+		ch = select('=', OXORA, OXOR);
+		break;
 
 	case '!':
-		c0 = select('=', ONEQ, OLNOT);
-		goto joinop;
+		ch = select('=', ONEQ, OLNOT);
+		break;
 
 	case '~':
-		c0 = select('=', OFLIP, OBNOT);
-		goto joinop;
+		ch = select('=', OFLIP, OBNOT);
+		break;
 
 	case '=':
-		c0 = select('=', select('=', OIDENT, OEQU), OASS);
-	joinop:
-		return source->tok.kind = c0;
+		ch = select('=', select('=', OIDENT, OEQU), OASS);
+		break;
 
 	default:
-		error(&source->currloc, "invalid input character '%c'", c0);
+		error(&source->currloc, "invalid input character '%c'", ch);
 		return 'Z';
 	}
+
+	return source->tok.kind = ch;
 #undef select
 }
 
@@ -1423,7 +1441,8 @@ getunary(Kind kind)
 	case OSUB: return OMINUS;
 	case OSUFINC: return OINC;
 	case OSUFDEC: return ODEC;
-	default: return 0;
+	default:
+		return 0;
 	}
 }
 
@@ -1458,7 +1477,8 @@ getunarysuffix(Source *source)
 	switch (kind) {
 	case '(': return OCALL;
 	case '[': return OARRAY;
-	default: return 0;
+	default:
+		return 0;
 	}
 }
 
@@ -1521,6 +1541,7 @@ deletenode(Node *node)
 	} else if (node->kind == ASTMT) {
 		if (node->lhs)
 			deletenode(node->lhs);
+
 	} else {
 		if (node->rhs)
 			deletenode(node->rhs);
@@ -1712,6 +1733,7 @@ deferfuncenv(Source *source, int keydeclinfunc)
 	if (funcenv) {
 		if (!funcenv->pending) {
 			funcenv->pending = true;
+
 			if (!source->pendingenvhead) {
 				source->pendingenvtail = funcenv;
 				source->pendingenvhead = funcenv;
@@ -1965,6 +1987,7 @@ checkend(Source *source, bool hastail, int needindent,
 {
 	if (getkind(source) == '\n') {
 		gettok(source);
+
 		if (getkind(source) == ';') {
 			error(getloc(source), expecterrmsg);
 			gettok(source);
@@ -2017,12 +2040,10 @@ stmtlist(Source *source, int indent, EnvKind envkind,
 	for (;;) {
 		Node *stmt;
 
-		if (checkend(source, !!tail, needindent,
-				"expected expression"))
+		if (checkend(source, !!tail, needindent, "expected expression"))
 			break;
 
 		stmt = exprlist(source, false, NULL);
-
 		stmt = tokennode(source, stmt);
 		stmt->kind = ASTMT;
 
@@ -2139,6 +2160,7 @@ redodeclaration:
 		gettok(source);
 
 		if (tryreadtype && (envkind == SSTRUCT || envkind == SUNION)) {
+
 			if (!isbasicdelimiter(getkind(source)) &&
 			    getkind(source) != '(')
 			{
@@ -2202,6 +2224,7 @@ redodeclaration:
 		result = tokennode(source, NULL);
 		result->kind = 'T';
 		result->type = ty;
+
 		return result;
 	}
 
@@ -2520,6 +2543,7 @@ readatom(Source *source, int flags)
 			gettok(source);
 			lhs = declaration(source, gettype(source, type), false);
 		} while (0);
+
 		break;
 
 	case 'N':
@@ -2531,6 +2555,7 @@ readatom(Source *source, int flags)
 		if (flags & QCONST) {
 			/* TODO(m21c): const - conversion */
 		}
+
 		break;
 
 	case KVAR:
@@ -2579,6 +2604,7 @@ readatom(Source *source, int flags)
 		} else {
 			lhs->lhs = readatom(source, 0);
 		}
+
 		break;
 
 	case KBITCAST:
@@ -2627,6 +2653,7 @@ readatom(Source *source, int flags)
 		/* if is atom */
 		if (!isdelimiter(source->tok.kind))
 			lhs->rhs = exprlist(source, false, NULL);
+
 		break;
 
 	case KDO:
@@ -2659,7 +2686,6 @@ readatom(Source *source, int flags)
 		gettok(source);
 		lhs->u.payload = readexpr(source, POR);
 		lhs->lhs = stmtlist(source, indent, SWHILE, NULL, false);
-
 		goto joinelse;
 
 	case KIF:
@@ -2718,6 +2744,7 @@ readatom(Source *source, int flags)
 				error(getloc(source), "expected identifier");
 
 			lhs->rhs = tokennode(source, NULL);
+
 		} else if (getkind(source) == '(') {
 			gettok(source);
 
@@ -2728,6 +2755,7 @@ readatom(Source *source, int flags)
 
 			expect(source, ')', "expected ')'");
 			continue;
+
 		} else if (getkind(source) == '[') {
 			gettok(source);
 
@@ -3097,6 +3125,7 @@ conv(Node *node)
 
 	if (ty->kind == TINFER)
 		return wrap(prim + TINT, node);
+
 	if (ty->kind == TUINFER)
 		return wrap(prim + TUINT, node);
 
@@ -3179,12 +3208,14 @@ resolvepending(Env *env, Node *expr)
 	if (!decl) {
 		error(&expr->loc, "'%s' undeclared",
 			getstring(idents, expr->u.key));
+
 		return expr;
 	}
 
 	if (decl->kind != DVAR && decl->kind != DFUNCTION) {
 		error(&expr->loc, "'%s' is not a variable nor a function",
 			getstring(idents, expr->u.key));
+
 		return expr;
 	}
 
@@ -3462,11 +3493,13 @@ typecheck(Env *env, Node *expr)
 	advancestmt:
 		lhs = typecheck(env, lhs);
 		rhs->lhs = lhs;
+
 		if (rhs->rhs) {
 			assert(rhs->rhs->kind == ASTMT);
 			rhs = rhs->rhs, lhs = rhs->lhs;
 			goto advancestmt;
 		}
+
 		return expr;
 
 	case ADECL:
@@ -3721,7 +3754,6 @@ foldexpr(Env *env, Node *expr)
 		assert(expr->u.env);
 
 		expr->lhs = foldexpr(expr->u.env, expr->lhs);
-
 		return expr;
 
 	case ASTMT:
@@ -3729,11 +3761,13 @@ foldexpr(Env *env, Node *expr)
 	advancestmt:
 		lhs = foldexpr(env, lhs);
 		rhs->lhs = lhs;
+
 		if (rhs->rhs) {
 			assert(rhs->rhs->kind == ASTMT);
 			rhs = rhs->rhs, lhs = rhs->lhs;
 			goto advancestmt;
 		}
+
 		return expr;
 
 	case ACOMMA:
@@ -3767,7 +3801,6 @@ foldexpr(Env *env, Node *expr)
 
 		deletenode(lhs);
 		expr->kind = 'N';
-
 		return expr;
 
 	case ACONV:
@@ -3836,7 +3869,7 @@ promptenvpath(Env* currenv)
 			envstring = getstring(idents, key);
 		}
 
-		fprintf(stdout, "%s/", envstring);
+		fprintf(stdout, "# scope: %s/", envstring);
 	}
 }
 
@@ -3845,13 +3878,15 @@ tryprompt(Source *source, const char ch)
 {
 	if (source->handlereplprompt) {
 		Env *currenv = source->currenv;
+
 		if (ch == '.' && currenv && currenv->kind != STOPLEVEL) {
-			fputs("\e[34m", stdout);
+			fputs("\e[1;30m", stdout);
 			promptenvpath(currenv);
 			fprintf(stdout, "\n\e[35m%c \e[0m", ch);
 		} else {
 			fprintf(stdout, "\e[35m%c \e[0m", ch);
 		}
+
 	} else if (source->filein == stdin) {
 		source->handlereplprompt = true;
 	}
@@ -4025,6 +4060,7 @@ printtypetail(FILE *out, Type *type, int indent)
 		} else {
 			n += printtypetail(out, type->u.rtarget, indent);
 		}
+
 		break;
 
 	#define typecase(type, str) \
@@ -4341,6 +4377,7 @@ printexpr(FILE *out, Node *expr, int indent)
 					putc(' ', out), ++n;
 				}
 			}
+
 			n += printoperant(out, expr->lhs, PUNARY, false, indent);
 		}
 	} else {
@@ -4381,6 +4418,7 @@ printexpr(FILE *out, Node *expr, int indent)
 					n += fprintf(out, "true");
 				else
 					n += fprintf(out, "0x%016lx", expr->u.u);
+
 				break;
 
 			case TPTR:
@@ -4396,6 +4434,7 @@ printexpr(FILE *out, Node *expr, int indent)
 				break;
 
 			}
+
 			break;
 
 		case 'S':
@@ -4499,6 +4538,7 @@ printexpr(FILE *out, Node *expr, int indent)
 
 			if (expr->rhs)
 				n += printclause(out, expr->rhs, indent);
+
 			break;
 
 		case ASTMT:
@@ -4518,6 +4558,7 @@ printexpr(FILE *out, Node *expr, int indent)
 				expr = expr->rhs;
 				goto advancestmt;
 			}
+
 			break;
 
 		case ASCOPE:
@@ -4553,6 +4594,7 @@ printexpr(FILE *out, Node *expr, int indent)
 		default:
 			n += highlight(out, HLINFO);
 			n += fprintf(out, "node(%u)", expr->kind);
+
 			if (expr->lhs) {
 				n += fprintf(out, " -> ");
 				n += printsubexpr(out, expr->lhs, true, indent);
@@ -4764,8 +4806,11 @@ main(int argc, char **argv)
 
 		if (source->lastkind != ';' && source->lastkind != '\n') {
 			error(getloc(source), "expected new line");
-			while (getkind(source) != ';' && getkind(source) != '\n' && getkind(source) != 0)
+			while (getkind(source) != ';' &&
+			       getkind(source) != '\n' && getkind(source) != 0)
+			{
 				gettok(source);
+			}
 
 			if (source->filein == stdin) {
 				highlight(stdout, HLPROMPT);

	Aria A low-level systems programming language
	git clone git://git.m21c.me/Aria.git
	Log | Files | Refs | LICENSE