core: Add lexer + parser groundwork

Signed-off-by: Ian Moffett <ian@mirocom.org>
2026-05-23 02:21:09 -04:00
parent 659dd38932
commit 74e2e8c772
6 changed files with 232 additions and 0 deletions
@@ -6,6 +6,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include "cescal/state.h"
+#include "cescal/parser.h"
 #include "cescal/log.h"

 static void
@@ -28,6 +29,10 @@ compile(const char *pathname)
        return -1;
    }

+    if (parser_parse(&st) < 0) {
+        return -1;
+    }
+
    state_close(&st);
    return 0;
 }
@@ -0,0 +1,84 @@
+#include <errno.h>
+#include <stdbool.h>
+#include "cescal/lexer.h"
+#include "cescal/log.h"
+
+/*
+ * Tells whether a character counts as whitespace for the lexer.
+ *
+ * The recognized set is: space, horizontal tab, newline, form feed
+ * and carriage return.
+ *
+ * @c: Character to check
+ *
+ * Returns true if @c is one of the characters above, false otherwise.
+ */
+static inline bool
+lexer_is_ws(char c)
+{
+    return c == '\t' || c == '\n' || c == ' ' || c == '\f' || c == '\r';
+}
+
+/*
+ * Consume a single character from the input source file and
+ * optionally skip whitespace
+ *
+ * @state: Compiler state
+ * @skip_ws: If true, whitespace characters are discarded and the
+ *           next non-whitespace character is returned; if false,
+ *           the very next character is returned as-is
+ *
+ * Returns the character read, or '\0' when @state is NULL or when
+ * readbuf_read() yields '\0'.
+ */
+static char
+lexer_consume_single(struct cescal_state *state, bool skip_ws)
+{
+    char c;
+
+    if (state == NULL) {
+        return '\0';
+    }
+
+    while ((c = readbuf_read(&state->rb, state->in_fd)) != '\0') {
+        /* Only drop whitespace when the caller asked for it */
+        if (skip_ws && lexer_is_ws(c)) {
+            continue;
+        }
+
+        return c;
+    }
+
+    return '\0';
+}
+
+/*
+ * Scan the next token from the input and store it in @res
+ *
+ * @state: Compiler state
+ * @res: Receives the token type and its character on success
+ *
+ * Returns 0 when a token was recognized. Returns -1 with errno set
+ * to EINVAL if either argument is NULL, -1 if no character could be
+ * consumed, and -1 after reporting an error for a character that
+ * does not start a known token. @res is left untouched on failure.
+ */
+int
+lexer_nom(struct cescal_state *state, struct token *res)
+{
+    char ch;
+
+    if (state == NULL || res == NULL) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    ch = lexer_consume_single(state, true);
+    if (ch == '\0') {
+        return -1;
+    }
+
+    /* Every token recognized so far is a single character */
+    switch (ch) {
+    case '(':
+        res->type = TT_LPAREN;
+        break;
+    case ')':
+        res->type = TT_RPAREN;
+        break;
+    case ',':
+        res->type = TT_COMMA;
+        break;
+    default:
+        cc_error("got unknown token '%c'\n", ch);
+        return -1;
+    }
+
+    res->c = ch;
+    return 0;
+}
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2026, Chloe M.
+ * Provided under the BSD-3 clause
+ */
+
+#include <stdint.h>
+#include <errno.h>
+#include "cescal/log.h"
+#include "cescal/parser.h"
+#include "cescal/state.h"
+#include "cescal/lexer.h"
+
+/* Symbolic token: wraps a string literal in brackets, e.g. "[ident]" */
+#define symtok(tok) \
+    "[" tok "]"
+
+/* Quoted token: wraps a string literal in single quotes, e.g. "'('" */
+#define qtok(tok) \
+    "'" tok "'"
+
+/* Convert a token type value to its string via toktab */
+#define tokstr1(tt) \
+    (toktab[(tt)])
+
+/* Convert a token (pointer to struct token) to its string via toktab */
+#define tokstr(tok) \
+    (toktab[(tok)->type])
+
+/*
+ * Converts numeric tokens into human readable strings
+ *
+ * Indexed by token type. Both the strings and the table slots are
+ * const so the table cannot be modified at runtime. No bounds
+ * checking is performed by tokstr()/tokstr1().
+ */
+static const char *const toktab[] = {
+    [TT_NONE]   = symtok("none"),
+    [TT_IDENT]  = symtok("ident"),
+    [TT_INTLIT] = symtok("number"),
+    [TT_LPAREN] = qtok("("),
+    [TT_RPAREN] = qtok(")"),
+    [TT_COMMA]  = qtok(","),
+    [TT_RETURN] = qtok("return"),
+    [TT_PUB]    = qtok("pub"),
+    [TT_PROC]   = qtok("proc"),
+    [TT_BEGIN]  = qtok("begin"),
+    [TT_END]    = qtok("end")
+};
+
+/*
+ * Pull tokens from the lexer one at a time and trace each of them
+ * until lexer_nom() stops returning 0
+ *
+ * @state: Compiler state
+ *
+ * Returns 0 once the loop ends, or -1 with errno set to EINVAL if
+ * @state is NULL.
+ *
+ * NOTE(review): any non-zero result from lexer_nom() ends the loop
+ * and still yields 0 here; confirm whether lexer errors are meant
+ * to be reported as success.
+ */
+int
+parser_parse(struct cescal_state *state)
+{
+    struct token tok;
+
+    if (state == NULL) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    for (;;) {
+        if (lexer_nom(state, &tok) != 0) {
+            break;
+        }
+
+        cc_trace("got token %s\n", tokstr(&tok));
+    }
+
+    return 0;
+}