fully lex test

author: kdx <kikoodx@paranoici.org> 2023-01-19 04:16:55 +0100
committer: kdx <kikoodx@paranoici.org> 2023-01-19 04:18:04 +0100
commit: 8150144a228692272fd181d2ebc1041b45fde032 (patch)
tree: 0226a0600864c9ed8515a5d84aece35852e80c4b
parent: b600369367193c867013d6ac56aa3e750b66f6be (diff)
download: golem-8150144a228692272fd181d2ebc1041b45fde032.tar.gz
7 files changed, 239 insertions, 2 deletions
diff --git a/Makefile b/Makefile
index c4f2d14..3ce0d8c 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ LD      := $(CC)
 SRC     := $(wildcard *.c)
 OBJ     := $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(SRC)))
 NAME    := restruct
-CFLAGS  := -Wall -Wextra -std=c99 -pedantic
+CFLAGS  := -g -O0 -Wall -Wextra -std=c99 -pedantic
 LDFLAGS :=
 
 all: $(NAME)
diff --git a/Token.c b/Token.c
new file mode 100644
index 0000000..0f259a1
--- /dev/null
+++ b/Token.c
@@ -0,0 +1,37 @@
+#include "Token.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/* Release the string owned by a keyword/string token; NULL tok is a no-op. */
+void token_free(Token *tok)
+{
+	if (tok == NULL)
+		return;
+	if (tok->type == TOK_KEYWORD || tok->type == TOK_STRING) {
+		free(tok->v.s);
+		tok->v.s = NULL; /* guards against a double free */
+	}
+}
+
+void token_print(const Token *tok)
+{ /* Print one token on stdout as "<line> <TYPE> [value]" plus a newline. */
+	printf("%u ", tok->line); /* line and type are unsigned int: %u, not %d */
+	switch (tok->type) {
+	case TOK_KEYWORD: printf("TOK_KEYWORD     %s\n", tok->v.s); break;
+	case TOK_STRING: printf("TOK_STRING      \"%s\"\n", tok->v.s); break;
+	case TOK_INTEGER: printf("TOK_INTEGER     %d\n", tok->v.i); break;
+	case TOK_PAREN_OPEN: printf("TOK_PAREN_OPEN\n"); break;
+	case TOK_PAREN_CLOS: printf("TOK_PAREN_CLOS\n"); break;
+	case TOK_CURL_OPEN: printf("TOK_CURL_OPEN\n"); break;
+	case TOK_CURL_CLOS: printf("TOK_CURL_CLOS\n"); break;
+	case TOK_SQUAR_OPEN: printf("TOK_SQUAR_OPEN\n"); break;
+	case TOK_SQUAR_CLOS: printf("TOK_SQUAR_CLOS\n"); break;
+	case TOK_SEMICOLON: printf("TOK_SEMICOLON\n"); break;
+	case TOK_ASSIGN: printf("TOK_ASSIGN\n"); break;
+	case TOK_COMMA: printf("TOK_COMMA\n"); break;
+	case TOK_COMP_LESS: printf("TOK_COMP_LESS\n"); break;
+	case TOK_MODULO: printf("TOK_MODULO\n"); break;
+	case TOK_INCREMENT: printf("TOK_INCREMENT\n"); break;
+	default: printf("token_print TODO: type %u\n", tok->type); break;
+	}
+}
diff --git a/Token.h b/Token.h
new file mode 100644
index 0000000..f11753f
--- /dev/null
+++ b/Token.h
@@ -0,0 +1,40 @@
+#pragma once
+#include <stdbool.h>
+
+enum { /* Token type tags stored in Token.type. */
+	TOK_NONE, /* 0; also the end-of-array marker tested by lexer_free/lexer_print */
+	TOK_KEYWORD, /* word of letters, digits and '_'; v.s is a heap copy */
+	TOK_STRING, /* double-quoted text; v.s is a heap copy without the quotes */
+	TOK_INTEGER, /* run of decimal digits; v.i is its value */
+	/* single char toks */
+	TOK_PAREN_OPEN, /* ( */
+	TOK_PAREN_CLOS, /* ) */
+	TOK_CURL_OPEN, /* { */
+	TOK_CURL_CLOS, /* } */
+	TOK_SQUAR_OPEN, /* [ */
+	TOK_SQUAR_CLOS, /* ] */
+	TOK_SEMICOLON, /* ; */
+	TOK_ASSIGN, /* = */
+	TOK_COMMA, /* , */
+	TOK_COMP_LESS, /* < */
+	TOK_MODULO, /* % */
+	/* double char toks */
+	TOK_INCREMENT, /* ++ */
+};
+
+union TokenValue { /* Payload of a Token; which member is valid depends on Token.type. */
+	char *s; /* heap string set by the lexer for TOK_KEYWORD and TOK_STRING */
+	char c; /* first character of a one- or two-character punctuation token */
+	int i; /* value of a TOK_INTEGER */
+	double d; /* not set by the lexer in this patch */
+	bool b; /* not set by the lexer in this patch */
+};
+
+typedef struct Token { /* One lexed token. */
+	unsigned int type; /* one of the TOK_* constants */
+	unsigned int line; /* 1-based source line on which the token starts */
+	union TokenValue v; /* payload selected by type */
+} Token;
+
+void token_free(Token *tok);
+void token_print(const Token *tok);
diff --git a/lexer.c b/lexer.c
new file mode 100644
index 0000000..a68b93e
--- /dev/null
+++ b/lexer.c
@@ -0,0 +1,142 @@
+#include "lexer.h"
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/* Grow toks by 128 slots (new slots are NOT zeroed); frees toks on failure. */
+static Token *resize_toks(Token *toks, size_t *size)
+{
+	*size += 128;
+	Token *const grown = realloc(toks, (*size + 1) * sizeof(Token));
+	if (grown != NULL)
+		return grown;
+	perror("resize_toks");
+	lexer_free(toks);
+	return NULL;
+}
+
+/* Map the punctuation character at *s to its token type, else TOK_NONE. */
+static unsigned int one_wide_tok(const char *s)
+{
+	static const struct { char ch; unsigned int type; } table[] = {
+		{'(', TOK_PAREN_OPEN}, {')', TOK_PAREN_CLOS},
+		{'{', TOK_CURL_OPEN},  {'}', TOK_CURL_CLOS},
+		{'[', TOK_SQUAR_OPEN}, {']', TOK_SQUAR_CLOS},
+		{';', TOK_SEMICOLON},  {'=', TOK_ASSIGN},
+		{',', TOK_COMMA},      {'<', TOK_COMP_LESS},
+		{'%', TOK_MODULO},
+	};
+	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i += 1) {
+		if (table[i].ch == *s)
+			return table[i].type;
+	}
+	return TOK_NONE;
+}
+
+#define PAIR(a, b) ((int)(a) + (int)(b) * 256)
+/* Token type of the two-character operator at s[0..1], else TOK_NONE. */
+static unsigned int two_wide_tok(const char *s)
+{
+	if (PAIR(s[0], s[1]) == PAIR('+', '+'))
+		return TOK_INCREMENT;
+	return TOK_NONE;
+}
+
+/*
+ * Tokenize s into a heap array ended by a TOK_NONE entry (free it with
+ * lexer_free). Returns NULL on allocation failure or an unclosed string.
+ */
+Token *lexer(const char *s)
+{
+	size_t size = 128;
+	Token *toks = calloc(size + 1, sizeof(Token));
+	size_t tok_i = 0;
+	size_t line = 1;
+	if (toks == NULL) {
+		perror("lexer");
+		return NULL;
+	}
+	while (*s != '\0') {
+		/* Skip whitespace; <ctype.h> calls need unsigned char values. */
+		while (isspace((unsigned char)*s)) {
+			if (*s == '\n')
+				line += 1;
+			s += 1;
+		}
+		if (*s == '\0')
+			break;
+		toks[tok_i].line = line;
+		if (*s == '"') {
+			const char *end = strchr(s + 1, '"');
+			if (end == NULL) {
+				printf("unclosed string\n");
+				lexer_free(toks);
+				return NULL;
+			}
+			size_t len = end - s - 1;
+			toks[tok_i].v.s = calloc(1, len + 1);
+			if (toks[tok_i].v.s == NULL) {
+				perror("lexer");
+				lexer_free(toks);
+				return NULL;
+			}
+			toks[tok_i].type = TOK_STRING;
+			strncpy(toks[tok_i].v.s, s + 1, len);
+			s = end + 1;
+		} else if (isalpha((unsigned char)*s) || *s == '_') {
+			size_t len = 0;
+			while (isalnum((unsigned char)s[len]) || s[len] == '_')
+				len += 1;
+			toks[tok_i].v.s = calloc(1, len + 1);
+			if (toks[tok_i].v.s == NULL) {
+				perror("lexer");
+				lexer_free(toks);
+				return NULL;
+			}
+			strncpy(toks[tok_i].v.s, s, len);
+			toks[tok_i].type = TOK_KEYWORD;
+			s += len;
+		} else if (isdigit((unsigned char)*s)) {
+			char *end; /* strtol: no UB on overflow, unlike atoi */
+			toks[tok_i].v.i = (int)strtol(s, &end, 10);
+			toks[tok_i].type = TOK_INTEGER;
+			s = end;
+		} else if (two_wide_tok(s) != TOK_NONE) {
+			toks[tok_i].type = two_wide_tok(s);
+			toks[tok_i].v.c = *s;
+			s += 2;
+		} else if (one_wide_tok(s) != TOK_NONE) {
+			toks[tok_i].type = one_wide_tok(s);
+			toks[tok_i].v.c = *s;
+			s += 1;
+		} else {
+			printf("wtf is this shit? %c\n", *s);
+			s += 1;
+			continue; /* no token was produced for this character */
+		}
+		tok_i += 1;
+		toks[tok_i].type = TOK_NONE; /* realloc'd slots are not zeroed */
+		if (tok_i == size) {
+			toks = resize_toks(toks, &size);
+			if (toks == NULL)
+				return NULL;
+		}
+	}
+	return toks;
+}
+
+/* Free every token before the TOK_NONE terminator, then the array itself.
+ * A NULL toks is accepted (free(NULL) is a no-op). */
+void lexer_free(Token *toks)
+{
+	for (size_t i = 0; toks != NULL && toks[i].type != TOK_NONE; i += 1)
+		token_free(&toks[i]);
+	free(toks);
+}
+
+void lexer_print(const Token *toks)
+{ /* Print each token with token_print until the TOK_NONE terminator. */
+	for (size_t i = 0; toks[i].type != TOK_NONE; i += 1)
+		token_print(&toks[i]);
+}
diff --git a/lexer.h b/lexer.h
new file mode 100644
index 0000000..150a7ad
--- /dev/null
+++ b/lexer.h
@@ -0,0 +1,6 @@
+#pragma once
+#include "Token.h"
+
+Token *lexer(const char *s); /* tokenize s; NULL on failure; free with lexer_free */
+void lexer_free(Token *toks); /* free an array returned by lexer; NULL is accepted */
+void lexer_print(const Token *toks); /* print each token via token_print */
diff --git a/main.c b/main.c
index 2f334e9..a1f9439 100644
--- a/main.c
+++ b/main.c
@@ -1,4 +1,5 @@
 #include "drain.h"
+#include "lexer.h"
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -19,7 +20,13 @@ int main(int argc, char **argv)
 		fprintf(stderr, "failed to drain '%s'\n", argv[1]);
 		return 1;
 	}
-	printf("%s", data);
+	Token *toks = lexer(data);
 	free(data);
+	if (toks == NULL) {
+		fprintf(stderr, "lexer failed\n");
+		return 1;
+	}
+	lexer_print(toks);
+	lexer_free(toks);
 	return 0;
 }
diff --git a/test b/test
new file mode 100644
index 0000000..f49dbe5
--- /dev/null
+++ b/test
@@ -0,0 +1,5 @@
+int main(void) {
+	for (int i = 0; i < 10; i++) {
+		printf("%d\n", i);
+	}
+}
author	kdx <kikoodx@paranoici.org>	2023-01-19 04:16:55 +0100
committer	kdx <kikoodx@paranoici.org>	2023-01-19 04:18:04 +0100
commit	8150144a228692272fd181d2ebc1041b45fde032 (patch)
tree	0226a0600864c9ed8515a5d84aece35852e80c4b
parent	b600369367193c867013d6ac56aa3e750b66f6be (diff)
download	golem-8150144a228692272fd181d2ebc1041b45fde032.tar.gz