extracted tokenize functions from gettok() for better code organization - Aria

commit 8480c7bcaea20aae2597b6a20387405ceba7d4b3
parent 0b7f291572407fee68bac011a6f4a2c509a55f21
Author: m21c  <ho*******@gmail.com>
Date:   Sat,  2 Oct 2021 14:16:41 +0200

extracted tokenize functions from gettok() for better code organization

Diffstat:
M compiler.c  | 372 +++++++++++++++++++++++++++++++++++++++----------------------------------------

1 file changed, 185 insertions(+), 187 deletions(-)
diff --git a/compiler.c b/compiler.c
@@ -943,63 +943,8 @@ error(SrcLoc *loc, const char *fmt, ...)
 	((indent) + (source)->tabwidth - ((indent) % (source)->tabwidth))
 
 static int
-gettok(Source *source)
+tokenizealphanumeric(Source *source, register int c0)
 {
-	register int c0 = (uchar) source->line[source->currloc.column];
-	static bool hasnewline = false;
-
-	source->lastkind = source->tok.kind;
-
-skipwhite:
-	if (hasnewline) {
-		if (!mygetline(source)) {
-			source->lastindent = 0;
-			return source->tok.kind = 0;
-		}
-
-		c0 = source->line[(source->currloc.column = 0)];
-	}
-
-	if (source->currloc.column) {
-		while (isspace(c0))
-			c0 = source->line[++source->currloc.column];
-
-	} else {
-		source->lastindent = 0;
-		while (isspace(c0)) {
-			if (c0 == '\t') {
-				source->lastindent = nextindent(
-					source,
-					source->lastindent
-				);
-			} else {
-				++source->lastindent;
-			}
-
-			c0 = source->line[++source->currloc.column];
-		}
-	}
-
-	source->tok.type = prim + TUNDEFINED;
-	source->tok.u.u = 0;
-	source->tok.lhs = NULL;
-	source->tok.rhs = NULL;
-	source->tok.loc.column = source->currloc.column;
-
-	/* get line */
-	if (!c0 || c0 == '#') {
-		if (hasnewline) {
-			goto skipwhite;
-		} else {
-			hasnewline = true;
-			return source->tok.kind = '\n';
-		}
-	}
-
-	hasnewline = false;
-
-	/* identifier or keyword */
-	if (isalpha(c0) || c0 == '_') {
 		int keyword;
 
 		while (isalnum(c0) || c0 == '_')
@@ -1039,9 +984,90 @@ skipwhite:
 		return source->tok.kind = 'I';
 	}
 
-	/* number literal */
-	if (isdigit(c0) || (c0 == '.' &&
-	    isdigit(source->line[source->currloc.column+1])))
+static Type *
+suffixfloattype(Source *source, const char *end)
+{
+	Type *ty = prim + TDOUBLE;
+
+	if (*end == 0)
+		return ty;
+
+	/* FIXME(m21c): r-suffix might conflict with radix */
+	if ((*end == 'f' || *end == 'F') && !end[1]) {
+		ty = prim + TFLOAT;
+
+	} else if (*end == 'l' || *end == 'L') {
+		ty = prim + TDOUBLE;
+
+		if (end[1])
+			goto errorfloat;
+	} else if (!mystrcasecmp(end, "f32") || !mystrcasecmp(end, "r32")) {
+		ty = prim + TF32;
+
+	} else if (!mystrcasecmp(end, "f64") || !mystrcasecmp(end, "r64")) {
+		ty = prim + TF64;
+
+	} else {
+	errorfloat:
+		error(&source->currloc, "invalid floating-point format");
+	}
+
+	return ty;
+}
+
+static Type *
+suffixinttype(Source *source, const char *end)
+{
+	int typeid = TUINT - TINT;
+
+	switch (*end) {
+	case 0:
+		return prim + TINFER;
+
+	case 's': case 'S': case 'i': case 'I':
+		typeid = 0;
+
+	case 'u': case 'U':
+		++end;
+		if (*end == 0) {
+			return prim + (typeid + TINFER);
+		} else if (*end == '8') {
+			typeid += TS8;
+
+			if (end[1])
+				goto errorint;
+
+			return prim + typeid;
+		} else if (!strcmp(end, "16")) {
+			return prim + (typeid + TS16);
+		} else if (!strcmp(end, "32")) {
+			return prim + (typeid + TS32);
+		} else if (!strcmp(end, "64")) {
+			return prim + (typeid + TS64);
+		} else if (!mystrcasecmp(end, "sz")) {
+			return prim + (typeid + TSSIZE);
+		}
+
+	default:
+		if (!mystrcasecmp(end, "ll")) {
+			return prim + (typeid + TLLONG);
+		} else if (*end == 'l' || *end == 'L') {
+			typeid += TLONG;
+
+			if (end[1])
+				goto errorint;
+
+			return prim + typeid;
+		}
+	}
+
+errorint:
+	error(&source->currloc, "invalid integer format");
+	return prim + TINT;
+}
+
+static int
+tokenizenumber(Source *source, register int c0)
 	{
 		int l = c0, t = source->line[source->currloc.column+1], i, j;
 		bool hasdec = false, hasexp = false;
@@ -1100,39 +1126,8 @@ skipwhite:
 		    strpbrk(source->stringbuf, "eEfF")))
 		{
 			source->tok.u.d = strtod(source->stringbuf, &end);
-			source->tok.type = prim + TDOUBLE;
-
-			if (*end != 0) {
-				/* FIXME(m21c): r-suffix might conflict with radix */
-				if ((*end == 'f' || *end == 'F') && !end[1]) {
-					source->tok.type = prim + TFLOAT;
-
-				} else if (*end == 'l' || *end == 'L') {
-					source->tok.type = prim + TDOUBLE;
-
-					if (end[1])
-						goto errorfloat;
-				} else if (!mystrcasecmp(end, "f32") ||
-				           !mystrcasecmp(end, "r32"))
-				{
-					source->tok.type = prim + TF32;
-
-				} else if (!mystrcasecmp(end, "f64") ||
-				           !mystrcasecmp(end, "r64"))
-				{
-					source->tok.type = prim + TF64;
-
+		source->tok.type = suffixfloattype(source, end);
 				} else {
-				errorfloat:
-					error(
-						&source->currloc,
-						"invalid floating-point format"
-					);
-				}
-			}
-		} else {
-			int typeid = TUINT - TINT;
-
 			if (mystrncasecmp(source->stringbuf, "0b", 2) == 0) {
 				source->tok.u.u = strtoull(
 					source->stringbuf + 2,
@@ -1147,73 +1142,15 @@ skipwhite:
 
 			}
 
-			switch (*end) {
-			case 0:
-				typeid = TINFER;
-				break;
-
-			case 's': case 'S':
-			case 'i': case 'I':
-				typeid = 0;
-
-			case 'u': case 'U':
-				++end;
-				if (*end == 0) {
-					typeid += TINFER;
-
-					break;
-				} else if (*end == '8') {
-					typeid += TS8;
-
-					if (end[1])
-						goto errorint;
-					break;
-				} else if (!strcmp(end, "16")) {
-					typeid += TS16;
-
-					break;
-				} else if (!strcmp(end, "32")) {
-					typeid += TS32;
-
-					break;
-				} else if (!strcmp(end, "64")) {
-					typeid += TS64;
-
-					break;
-				} else if (!mystrcasecmp(end, "sz")) {
-					typeid += TSSIZE;
-
-					break;
-				}
-
-			default:
-				if (!mystrcasecmp(end, "ll")) {
-					typeid += TLLONG;
-
-				} else if (*end == 'l' || *end == 'L') {
-					typeid += TLONG;
-
-					if (end[1])
-						goto errorint;
-				} else {
-				errorint:
-					error(
-						&source->currloc,
-						"invalid integer format"
-					);
-
-					typeid = TINT;
-				}
-			}
-
-			source->tok.type = prim + typeid;
+		source->tok.type = suffixinttype(source, end);
 		}
 
 		return source->tok.kind = 'N';
 	}
 
-	/* string & character-literal */
-	if (c0 == '"' || c0 == '\'') {
+static int
+tokenizestring(Source *source, register int c0)
+{
 		int delim = c0, j;
 
 		c0 = source->line[++source->currloc.column];
@@ -1254,11 +1191,8 @@ skipwhite:
 					goto stringeol;
 
 				default:
-					error(
-						&source->currloc,
-						"invalid escape sequence '\\%c'",
-						c0
-					);
+				error(&source->currloc,
+					"invalid escape sequence '\\%c'", c0);
 				}
 			}
 
@@ -1271,10 +1205,7 @@ skipwhite:
 
 		if (c0 == 0) {
 		stringeol:
-			error(
-				&source->currloc,
-				"unexpected end-of-line"
-			);
+		error(&source->currloc, "unexpected end-of-line");
 
 			return source->tok.kind = '\n';
 		}
@@ -1288,20 +1219,87 @@ skipwhite:
 		);
 
 		return source->tok.kind = 'S';
+}
+
+static int
+gettok(Source *source)
+{
+	register int c0 = (uchar) source->line[source->currloc.column];
+	static bool hasnewline = false;
+
+	source->lastkind = source->tok.kind;
+
+skipwhite:
+	if (hasnewline) {
+		if (!mygetline(source)) {
+			source->lastindent = 0;
+			return source->tok.kind = 0;
+		}
+
+		c0 = source->line[(source->currloc.column = 0)];
+	}
+
+	if (source->currloc.column) {
+		while (isspace(c0))
+			c0 = source->line[++source->currloc.column];
+
+	} else {
+		source->lastindent = 0;
+		while (isspace(c0)) {
+			if (c0 == '\t') {
+				source->lastindent = nextindent(
+					source,
+					source->lastindent
+				);
+			} else {
+				++source->lastindent;
+			}
+
+			c0 = source->line[++source->currloc.column];
+		}
+	}
+
+	source->tok.type = prim + TUNDEFINED;
+	source->tok.u.u = 0;
+	source->tok.lhs = NULL;
+	source->tok.rhs = NULL;
+	source->tok.loc.column = source->currloc.column;
+
+	/* get line */
+	if (!c0 || c0 == '#') {
+		if (hasnewline) {
+			goto skipwhite;
+		} else {
+			hasnewline = true;
+			return source->tok.kind = '\n';
+		}
+	}
+
+	hasnewline = false;
+
+	/* identifier or keyword */
+	if (isalpha(c0) || c0 == '_') {
+		return tokenizealphanumeric(source, c0);
+	}
+
+	/* number literal */
+	if (isdigit(c0) || (c0 == '.' &&
+	    isdigit(source->line[source->currloc.column+1])))
+	{
+		return tokenizenumber(source, c0);
+	}
+
+	/* string & character-literal */
+	if (c0 == '"' || c0 == '\'') {
+		return tokenizestring(source, c0);
 	}
 
 	/* delimiters */
 	switch (c0) {
-	case ',':
-	case ';':
-	case '@':
-	case ':':
-	case '{':
-	case '}':
-	case ']':
-	case ')':
-	case '[':
-	case '(':
+	case ',': case ';': case '@': case ':':
+	case '{': case '}':
+	case ']': case '[':
+	case '(': case ')':
 		++source->currloc.column;
 		return source->tok.kind = c0;
 	}
@@ -1315,30 +1313,30 @@ skipwhite:
 	switch (source->line[source->currloc.column++]) {
 	case '.':
 		/* tok.kind = select('.', ORANGE, ODISP); */
-		source->tok.kind = ODISP;
+		c0 = ODISP;
 		goto joinop;
 
 	case '*':
-		source->tok.kind = select('=', OMULA, OMUL);
+		c0 = select('=', OMULA, OMUL);
 		goto joinop;
 
 	case '/':
-		source->tok.kind = select('=', ODIVA, ODIV);
+		c0 = select('=', ODIVA, ODIV);
 		goto joinop;
 
 	case '%':
-		source->tok.kind = select('=', OMODA, OMOD);
+		c0 = select('=', OMODA, OMOD);
 		goto joinop;
 
 	case '<':
-		source->tok.kind = select('=', OLEQ,
+		c0 = select('=', OLEQ,
 				select('<',
 					select('=', OLSHA, OLSH),
 				OLET));
 		goto joinop;
 
 	case '>':
-		source->tok.kind = select('=', OGEQ,
+		c0 = select('=', OGEQ,
 				select('>',
 					select('>',
 						select('=', OARSHA, OARSH),
@@ -1347,37 +1345,37 @@ skipwhite:
 		goto joinop;
 
 	case '&':
-		source->tok.kind = select('=', OANDA, select('&', OLAND, OBAND));
+		c0 = select('=', OANDA, select('&', OLAND, OBAND));
 		goto joinop;
 
 	case '+':
-		source->tok.kind = select('=', OADDA, select('+', OSUFINC, OADD));
+		c0 = select('=', OADDA, select('+', OSUFINC, OADD));
 		goto joinop;
 
 	case '-':
-		source->tok.kind = select('=', OSUBA, select('-', OSUFDEC, OSUB));
+		c0 = select('=', OSUBA, select('-', OSUFDEC, OSUB));
 		goto joinop;
 
 	case '|':
-		source->tok.kind = select('=', OORA, select('|', OLOR, OBOR));
+		c0 = select('=', OORA, select('|', OLOR, OBOR));
 		goto joinop;
 
 	case '^':
-		source->tok.kind = select('=', OXORA, OXOR);
+		c0 = select('=', OXORA, OXOR);
 		goto joinop;
 
 	case '!':
-		source->tok.kind = select('=', ONEQ, OLNOT);
+		c0 = select('=', ONEQ, OLNOT);
 		goto joinop;
 
 	case '~':
-		source->tok.kind = select('=', OFLIP, OBNOT);
+		c0 = select('=', OFLIP, OBNOT);
 		goto joinop;
 
 	case '=':
-		source->tok.kind = select('=', select('=', OIDENT, OEQU), OASS);
+		c0 = select('=', select('=', OIDENT, OEQU), OASS);
 	joinop:
-		return source->tok.kind;
+		return source->tok.kind = c0;
 
 	default:
 		error(&source->currloc, "invalid input character '%c'", c0);

	Aria — A low-level systems programming language
	git clone git://git.m21c.me/Aria.git
	Log | Files | Refs | LICENSE