From 8150144a228692272fd181d2ebc1041b45fde032 Mon Sep 17 00:00:00 2001 From: kdx Date: Thu, 19 Jan 2023 04:16:55 +0100 Subject: fully lex test --- Makefile | 2 +- Token.c | 37 +++++++++++++++++ Token.h | 40 ++++++++++++++++++ lexer.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ lexer.h | 6 +++ main.c | 9 +++- test | 5 +++ 7 files changed, 239 insertions(+), 2 deletions(-) create mode 100644 Token.c create mode 100644 Token.h create mode 100644 lexer.c create mode 100644 lexer.h create mode 100644 test diff --git a/Makefile b/Makefile index c4f2d14..3ce0d8c 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ LD := $(CC) SRC := $(wildcard *.c) OBJ := $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(SRC))) NAME := restruct -CFLAGS := -Wall -Wextra -std=c99 -pedantic +CFLAGS := -g -O0 -Wall -Wextra -std=c99 -pedantic LDFLAGS := all: $(NAME) diff --git a/Token.c b/Token.c new file mode 100644 index 0000000..0f259a1 --- /dev/null +++ b/Token.c @@ -0,0 +1,37 @@ +#include "Token.h" +#include +#include + +void token_free(Token *tok) +{ + switch (tok->type) { + case TOK_KEYWORD: + if (tok != NULL && tok->v.s != NULL) + free(tok->v.s); + default: + break; + } +} + +void token_print(const Token *tok) +{ + printf("%d ", tok->line); + switch (tok->type) { + case TOK_KEYWORD: printf("TOK_KEYWORD %s\n", tok->v.s); break; + case TOK_STRING: printf("TOK_STRING \"%s\"\n", tok->v.s); break; + case TOK_INTEGER: printf("TOK_INTEGER %d\n", tok->v.i); break; + case TOK_PAREN_OPEN: printf("TOK_PAREN_OPEN\n"); break; + case TOK_PAREN_CLOS: printf("TOK_PAREN_CLOS\n"); break; + case TOK_CURL_OPEN: printf("TOK_CURL_OPEN\n"); break; + case TOK_CURL_CLOS: printf("TOK_CURL_CLOS\n"); break; + case TOK_SQUAR_OPEN: printf("TOK_SQUAR_OPEN\n"); break; + case TOK_SQUAR_CLOS: printf("TOK_SQUAR_CLOS\n"); break; + case TOK_SEMICOLON: printf("TOK_SEMICOLON\n"); break; + case TOK_ASSIGN: printf("TOK_ASSIGN\n"); break; + case TOK_COMMA: printf("TOK_COMMA\n"); break; + case TOK_COMP_LESS: printf("TOK_COMP_LESS\n"); break; + case TOK_MODULO: printf("TOK_MODULO\n"); break; + case TOK_INCREMENT: printf("TOK_INCREMENT\n"); break; + default: printf("token_print TODO: type %d\n", tok->type); break; + } +} diff --git a/Token.h b/Token.h new file mode 100644 index 0000000..f11753f --- /dev/null +++ b/Token.h @@ -0,0 +1,40 @@ +#pragma once +#include + +enum { + TOK_NONE, + TOK_KEYWORD, + TOK_STRING, + TOK_INTEGER, + /* single char toks */ + TOK_PAREN_OPEN, + TOK_PAREN_CLOS, + TOK_CURL_OPEN, + TOK_CURL_CLOS, + TOK_SQUAR_OPEN, + TOK_SQUAR_CLOS, + TOK_SEMICOLON, + TOK_ASSIGN, + TOK_COMMA, + TOK_COMP_LESS, + TOK_MODULO, + /* double char toks */ + TOK_INCREMENT, +}; + +union TokenValue { + char *s; + char c; + int i; + double d; + bool b; +}; + +typedef struct Token { + unsigned int type; + unsigned int line; + union TokenValue v; +} Token; + +void token_free(Token *tok); +void token_print(const Token *tok); diff --git a/lexer.c b/lexer.c new file mode 100644 index 0000000..a68b93e --- /dev/null +++ b/lexer.c @@ -0,0 +1,142 @@ +#include "lexer.h" +#include +#include +#include +#include + +static Token *resize_toks(Token *toks, size_t *size) +{ + *size += 128; + Token *const new_toks = realloc(toks, sizeof(Token) * (*size + 1)); + if (new_toks == NULL) { + perror("resize_toks"); + lexer_free(toks); + return NULL; + } + return new_toks; +} + +static unsigned int one_wide_tok(const char *s) +{ + switch (*s) { + case '(': return TOK_PAREN_OPEN; + case ')': return TOK_PAREN_CLOS; + case '{': return TOK_CURL_OPEN; + case '}': return TOK_CURL_CLOS; + case '[': return TOK_SQUAR_OPEN; + case ']': return TOK_SQUAR_CLOS; + case ';': return TOK_SEMICOLON; + case '=': return TOK_ASSIGN; + case ',': return TOK_COMMA; + case '<': return TOK_COMP_LESS; + case '%': return TOK_MODULO; + default: return TOK_NONE; + } +} + +#define PAIR(a, b) ((int)(a) + (int)(b) * 256) +static unsigned int two_wide_tok(const char *s) +{ + switch (PAIR(s[0], s[1])) { + case PAIR('+', '+'): return TOK_INCREMENT; + default: return TOK_NONE; + } +} + +Token *lexer(const char *s) +{ + size_t size = 128; + Token *toks = calloc(size + 1, sizeof(Token)); + size_t tok_i = 0; + size_t line = 1; + if (toks == NULL) { + perror("lexer"); + return NULL; + } + while (*s != '\0') { + /* Skip whitespaces. */ + while (isspace(*s)) { + if (*s == '\n') + line += 1; + s += 1; + } + if (*s == '\0') + break; + toks[tok_i].line = line; + if (*s == '"') { + const char *end = strchr(s + 1, '"'); + if (end == NULL) { + printf("unclosed string\n"); + lexer_free(toks); + return NULL; + } + size_t len = end - s - 1; + toks[tok_i].v.s = calloc(1, len + 1); + if (toks[tok_i].v.s == NULL) { + perror("lexer"); + lexer_free(toks); + return NULL; + } + toks[tok_i].type = TOK_STRING; + strncpy(toks[tok_i].v.s, s + 1, len); + tok_i += 1; + s = end + 1; + } else if (isalpha(*s) || *s == '_') { + size_t len = 0; + while (isalnum(s[len]) || s[len] == '_') + len += 1; + toks[tok_i].v.s = calloc(1, len + 1); + if (toks[tok_i].v.s == NULL) { + perror("lexer"); + lexer_free(toks); + return NULL; + } + strncpy(toks[tok_i].v.s, s, len); + toks[tok_i].type = TOK_KEYWORD; + tok_i += 1; + s += len; + } else if (isdigit(*s)) { + size_t len = 0; + while (isdigit(s[len])) + len += 1; + toks[tok_i].v.i = atoi(s); + toks[tok_i].type = TOK_INTEGER; + tok_i += 1; + s += len; + } else if (two_wide_tok(s) != TOK_NONE) { + toks[tok_i].type = two_wide_tok(s); + toks[tok_i].v.c = *s; + tok_i += 1; + s += 2; + } else if (one_wide_tok(s) != TOK_NONE) { + toks[tok_i].type = one_wide_tok(s); + toks[tok_i].v.c = *s; + tok_i += 1; + s += 1; + } else { + printf("wtf is this shit? %c\n", *s); + s += 1; + } + if (tok_i == size) { + toks = resize_toks(toks, &size); + if (toks == NULL) + return NULL; + } + } + return toks; +} + +void lexer_free(Token *toks) +{ + if (toks != NULL) { + for (Token *tok = toks; tok->type != TOK_NONE; tok += 1) + token_free(tok); + free(toks); + } +} + +void lexer_print(const Token *toks) +{ + for (const Token *tok = toks; tok->type != TOK_NONE; tok += 1) + token_print(tok); +} diff --git a/lexer.h b/lexer.h new file mode 100644 index 0000000..150a7ad --- /dev/null +++ b/lexer.h @@ -0,0 +1,6 @@ +#pragma once +#include "Token.h" + +Token *lexer(const char *s); +void lexer_free(Token *toks); +void lexer_print(const Token *toks); diff --git a/main.c b/main.c index 2f334e9..a1f9439 100644 --- a/main.c +++ b/main.c @@ -1,4 +1,5 @@ #include "drain.h" +#include "lexer.h" #include #include @@ -19,7 +20,13 @@ int main(int argc, char **argv) fprintf(stderr, "failed to drain '%s'\n", argv[1]); return 1; } - printf("%s", data); + Token *toks = lexer(data); free(data); + if (toks == NULL) { + fprintf(stderr, "lexer failed\n"); + return 1; + } + lexer_print(toks); + lexer_free(toks); return 0; } diff --git a/test b/test new file mode 100644 index 0000000..f49dbe5 --- /dev/null +++ b/test @@ -0,0 +1,5 @@ +int main(void) { + for (int i = 0; i < 10; i++) { + printf("%d\n", i); + } +} -- cgit v1.2.3