/* Cescal/core/lexer.c */

/*
 * Copyright (c) 2026, Chloe M.
 * Provided under the BSD-3 clause
 */

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "cescal/lexer.h"
#include "cescal/log.h"
#include "cescal/ptrbox.h"

/*
 * Store a character in the putback slot of the lexer state.
 * Does nothing when no state is given.
 *
 * @state: Compiler state
 * @c:     Character to store
 */
static inline void
lexer_putback(struct cescal_state *state, char c)
{
    if (state != NULL) {
        state->lex_putback = c;
    }
}

/*
 * Take the character held in the putback slot and reset the
 * slot to '\0'.
 *
 * @state: Compiler state
 *
 * Returns the stored character, or '\0' when no state is given
 */
static inline char
lexer_putback_pop(struct cescal_state *state)
{
    char pending = '\0';

    if (state != NULL) {
        pending = state->lex_putback;
        state->lex_putback = '\0';
    }

    return pending;
}

/*
 * Tells whether a character counts as whitespace for the lexer,
 * i.e. is one of: space, tab, newline, form feed, carriage return
 *
 * @c: Character to check
 */
static inline bool
lexer_is_ws(char c)
{
    return c == ' '  || c == '\t' || c == '\n' ||
           c == '\f' || c == '\r';
}

/*
 * Consume a single character from the input source file and
 * optionally skip whitespace
 *
 * A character pending in the putback slot is considered before the
 * input is read. It is returned to the caller unless it is whitespace
 * and the caller asked for whitespace to be skipped, in which case it
 * is dropped and reading continues with the input.
 *
 * @state: Compiler state
 * @skip_ws: If true skip whitespace
 *
 * Returns the character, or '\0' when no state is given or when
 * readbuf_read() yields '\0'
 */
static char
lexer_consume_single(struct cescal_state *state, bool skip_ws)
{
    char c;

    if (state == NULL) {
        return '\0';
    }

    c = lexer_putback_pop(state);
    if (c != '\0') {
        /*
         * Only whitespace under skip_ws may be thrown away. Every
         * other pending character has to reach the caller, otherwise
         * a non-whitespace putback is lost when skip_ws is false.
         */
        if (!skip_ws || !lexer_is_ws(c)) {
            return c;
        }
    }

    while ((c = readbuf_read(&state->rb, state->in_fd)) != '\0') {
        if (skip_ws && lexer_is_ws(c)) {
            continue;
        }

        return c;
    }

    return '\0';
}

/*
 * Scan an identifier whose first character was already consumed
 *
 * An identifier starts with an underscore or an isalpha() character
 * and continues with underscores or isalnum() characters. The first
 * character that does not belong to it is handed to lexer_putback().
 *
 * @state: Compiler state
 * @lc:    First character of the identifier
 * @res:   Token result is written here; on success the type is
 *         TT_IDENT and `s' is the copy made by ptrbox_strdup()
 *
 * Returns zero on success and -1 on failure (errno is set to EINVAL
 * when an argument is NULL)
 */
static int
lexer_scan_ident(struct cescal_state *state, char lc, struct token *res)
{
    char *buf, *grown, c;
    size_t bufsz, bufcap;

    if (state == NULL || res == NULL) {
        errno = EINVAL;
        return -1;
    }

    /*
     * Validate before allocating so this error path has nothing
     * to release. The <ctype.h> classifiers need an unsigned char
     * value; a negative plain char is undefined behaviour.
     */
    if (lc != '_' && !isalpha((unsigned char)lc)) {
        cc_error("bad identifier\n");
        return -1;
    }

    bufsz = 0;
    bufcap = 8;
    if ((buf = malloc(bufcap)) == NULL) {
        return -1;
    }

    buf[bufsz++] = lc;
    for (;;) {
        c = lexer_consume_single(state, false);
        if (c != '_' && !isalnum((unsigned char)c)) {
            lexer_putback(state, c);
            break;
        }

        buf[bufsz++] = c;

        /* Grow early so one byte is always left for the terminator */
        if (bufsz >= bufcap) {
            bufcap += 8;
            grown = realloc(buf, bufcap);
            if (grown == NULL) {
                /* realloc() leaves the old block alive on failure */
                free(buf);
                return -1;
            }

            buf = grown;
        }
    }

    buf[bufsz] = '\0';
    res->s = ptrbox_strdup(&state->ptrbox, buf);
    free(buf);

    /* NOTE(review): assumes ptrbox_strdup() returns NULL on failure */
    if (res->s == NULL) {
        return -1;
    }

    res->type = TT_IDENT;
    return 0;
}

/*
 * Promote an identifier token to a keyword token when its text is
 * one of the reserved words (pub, proc, begin, end, return, u8, u16,
 * u32, u64). Any other token is left untouched.
 *
 * @state:   Compiler state
 * @res:     Token result
 */
static void
lexer_check_kw(struct cescal_state *state, struct token *res)
{
    static const struct {
        const char *word;
        int type;
    } keywords[] = {
        { "pub",    TT_PUB    },
        { "proc",   TT_PROC   },
        { "begin",  TT_BEGIN  },
        { "end",    TT_END    },
        { "return", TT_RETURN },
        { "u8",     TT_U8     },
        { "u16",    TT_U16    },
        { "u32",    TT_U32    },
        { "u64",    TT_U64    }
    };
    size_t i;

    if (state == NULL || res == NULL) {
        return;
    }

    for (i = 0; i < sizeof(keywords) / sizeof(keywords[0]); ++i) {
        if (strcmp(res->s, keywords[i].word) == 0) {
            res->type = keywords[i].type;
            return;
        }
    }
}

/*
 * Turn an identifier token into a directive token when its text
 * names a known directive (define, ifndef, ifdef)
 *
 * @state:   Compiler state
 * @res:     Token result is written here
 *
 * Returns zero when the token was recognised. Returns -1 when it is
 * not a directive, or when an argument is NULL (errno is then set
 * to EINVAL).
 */
static int
lexer_check_direc(struct cescal_state *state, struct token *res)
{
    const char *name;

    if (state == NULL || res == NULL) {
        errno = EINVAL;
        return -1;
    }

    name = res->s;
    if (strcmp(name, "define") == 0) {
        res->type = TT_DEFINE;
    } else if (strcmp(name, "ifndef") == 0) {
        res->type = TT_IFNDEF;
    } else if (strcmp(name, "ifdef") == 0) {
        res->type = TT_IFDEF;
    } else {
        return -1;
    }

    return 0;
}

/*
 * Discard the remainder of a comment: characters are consumed up to
 * and including the next newline, or until '\0' is read
 *
 * @state: Compiler state
 */
static void
lexer_skip_comment(struct cescal_state *state)
{
    char c;

    if (state == NULL) {
        return;
    }

    do {
        c = lexer_consume_single(state, false);
    } while (c != '\n' && c != '\0');
}

/*
 * Scan the next token from the input source
 *
 * Leading whitespace is skipped. Punctuation yields its own token
 * type with the character stored in `c'. A '#' introduces a
 * directive name, "//" yields TT_COMMENT after the rest of the line
 * has been discarded, "->" yields TT_ARROW, and anything else is
 * scanned as an identifier and then checked against the keywords.
 *
 * @state: Compiler state
 * @res:   Token result is written here
 *
 * Returns zero on success. Returns -1 when no further character can
 * be read or when the input does not form a token (errno is set to
 * EINVAL when an argument is NULL).
 */
int
lexer_nom(struct cescal_state *state, struct token *res)
{
    char c;

    if (state == NULL || res == NULL) {
        errno = EINVAL;
        return -1;
    }

    if ((c = lexer_consume_single(state, true)) == '\0') {
        return -1;
    }

    switch (c) {
    case '(':
        res->type = TT_LPAREN;
        res->c = c;
        return 0;
    case ')':
        res->type = TT_RPAREN;
        res->c = c;
        return 0;
    case ',':
        res->type = TT_COMMA;
        res->c = c;
        return 0;
    case ':':
        res->type = TT_COLON;
        res->c = c;
        return 0;
    case '+':
        res->type = TT_PLUS;
        res->c = c;
        return 0;
    case '*':
        res->type = TT_STAR;
        res->c = c;
        return 0;
    case '=':
        res->type = TT_EQUALS;
        res->c = c;
        return 0;
    case '#':
        if ((c = lexer_consume_single(state, true)) == '\0') {
            return -1;
        }

        /* A failed scan leaves `res' unfilled: never report success */
        if (lexer_scan_ident(state, c, res) != 0) {
            return -1;
        }

        if (lexer_check_direc(state, res) != 0) {
            cc_error("bad directive '%s'\n", res->s);
            return -1;
        }

        return 0;
    case '/':
        res->c = c;

        /*
         * The second character of a two-character token must follow
         * immediately, so whitespace is not skipped here ("/ /" is
         * not a comment). A whitespace look-ahead that gets put back
         * is dropped by the next whitespace-skipping read.
         */
        c = lexer_consume_single(state, false);
        if (c == '/') {
            res->type = TT_COMMENT;
            lexer_skip_comment(state);
            return 0;
        }

        lexer_putback(state, c);
        res->type = TT_SLASH;
        return 0;
    case '-':
        res->c = c;

        /* Same adjacency rule as for "//": "- >" is not an arrow */
        c = lexer_consume_single(state, false);
        if (c == '>') {
            res->type = TT_ARROW;
            return 0;
        }

        lexer_putback(state, c);
        res->type = TT_MINUS;
        return 0;
    default:
        if (lexer_scan_ident(state, c, res) == 0) {
            lexer_check_kw(state, res);
            return 0;
        }

        break;
    }

    cc_error("got unknown token '%c'\n", c);
    return -1;
}