Files
2026-05-23 09:01:18 -04:00

363 lines
6.8 KiB
C

/*
* Copyright (c) 2026, Chloe M.
* Provided under the BSD-3 clause
*/
#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "cescal/lexer.h"
#include "cescal/log.h"
#include "cescal/ptrbox.h"
/*
 * Store a character in the lexer's single putback slot,
 * overwriting whatever was held there before
 *
 * @state: Compiler state (ignored if NULL)
 * @c: Character to store
 */
static inline void
lexer_putback(struct cescal_state *state, char c)
{
    if (state != NULL) {
        state->lex_putback = c;
    }
}
/*
 * Take the character held in the putback slot and clear the slot
 *
 * @state: Compiler state
 *
 * Returns the stored character, or '\0' if the slot was empty
 * or @state is NULL
 */
static inline char
lexer_putback_pop(struct cescal_state *state)
{
    char pending = '\0';

    if (state != NULL) {
        pending = state->lex_putback;
        state->lex_putback = '\0';
    }

    return pending;
}
/*
 * Tell whether a character counts as whitespace for the lexer:
 * tab, newline, space, form feed or carriage return
 *
 * @c: Character to classify
 *
 * Returns true for whitespace, false for anything else
 */
static inline bool
lexer_is_ws(char c)
{
    return c == ' ' || c == '\t' || c == '\n' ||
           c == '\r' || c == '\f';
}
/*
 * Consume a single character from the input source file and
 * optionally skip whitespace
 *
 * A character previously stored with lexer_putback() is handed
 * back before any new input is read.
 *
 * @state: Compiler state
 * @skip_ws: If true skip whitespace
 *
 * Returns the character consumed, or '\0' if @state is NULL or
 * readbuf_read() yields '\0'
 */
static char
lexer_consume_single(struct cescal_state *state, bool skip_ws)
{
    char c;

    if (state == NULL) {
        return '\0';
    }

    /*
     * The pending putback character wins over fresh input.  The
     * only reason to discard it is that it is whitespace and the
     * caller asked for whitespace to be skipped; any other
     * combination must return it, otherwise the character would
     * be lost.
     */
    c = lexer_putback_pop(state);
    if (c != '\0' && !(skip_ws && lexer_is_ws(c))) {
        return c;
    }

    while ((c = readbuf_read(&state->rb, state->in_fd)) != '\0') {
        if (skip_ws && lexer_is_ws(c)) {
            continue;
        }

        return c;
    }

    return '\0';
}
/*
 * Scan an identifier starting at an already-consumed character
 *
 * An identifier starts with a letter or '_' and continues with
 * letters, digits or '_'.  The first character that does not
 * belong to the identifier is stored back with lexer_putback().
 *
 * @state: Compiler state
 * @lc: First character of the identifier
 * @res: Token result is written here (type TT_IDENT, text in s)
 *
 * Returns zero on success and -1 on failure; errno is set to
 * EINVAL when @state or @res is NULL
 */
static int
lexer_scan_ident(struct cescal_state *state, char lc, struct token *res)
{
    char *buf, *tmp, c;
    size_t bufsz, bufcap;

    if (state == NULL || res == NULL) {
        errno = EINVAL;
        return -1;
    }

    /*
     * Validate before allocating so the rejection path has nothing
     * to free.  <ctype.h> functions require a value representable
     * as unsigned char, hence the casts.
     */
    if (lc != '_' && !isalpha((unsigned char)lc)) {
        cc_error("bad identifier\n");
        return -1;
    }

    bufsz = 0;
    bufcap = 8;
    if ((buf = malloc(bufcap)) == NULL) {
        return -1;
    }

    buf[bufsz++] = lc;
    for (;;) {
        c = lexer_consume_single(state, false);
        if (c != '_' && !isalnum((unsigned char)c)) {
            /* Not ours; leave it for the next token */
            lexer_putback(state, c);
            buf[bufsz] = '\0';
            break;
        }

        buf[bufsz++] = c;

        /* Grow early so there is always room for the terminator */
        if (bufsz >= bufcap) {
            bufcap += 8;
            tmp = realloc(buf, bufcap);
            if (tmp == NULL) {
                /* realloc leaves the old block valid on failure */
                free(buf);
                return -1;
            }
            buf = tmp;
        }
    }

    res->s = ptrbox_strdup(&state->ptrbox, buf);
    free(buf);

    /* NOTE(review): assumes ptrbox_strdup() returns NULL on failure */
    if (res->s == NULL) {
        return -1;
    }

    res->type = TT_IDENT;
    return 0;
}
/*
 * Promote an identifier token to a keyword token when its text
 * matches one of the language keywords; otherwise the token is
 * left as it was
 *
 * @state: Compiler state
 * @res: Token to inspect and possibly retag
 */
static void
lexer_check_kw(struct cescal_state *state, struct token *res)
{
    if (state == NULL || res == NULL) {
        return;
    }

    if (strcmp(res->s, "pub") == 0) {
        res->type = TT_PUB;
    } else if (strcmp(res->s, "proc") == 0) {
        res->type = TT_PROC;
    } else if (strcmp(res->s, "begin") == 0) {
        res->type = TT_BEGIN;
    } else if (strcmp(res->s, "end") == 0) {
        res->type = TT_END;
    } else if (strcmp(res->s, "return") == 0) {
        res->type = TT_RETURN;
    } else if (strcmp(res->s, "u8") == 0) {
        res->type = TT_U8;
    } else if (strcmp(res->s, "u16") == 0) {
        res->type = TT_U16;
    } else if (strcmp(res->s, "u32") == 0) {
        res->type = TT_U32;
    } else if (strcmp(res->s, "u64") == 0) {
        res->type = TT_U64;
    }
}
/*
 * Retag an identifier token as a directive token when its text
 * is one of "define", "ifndef" or "ifdef"
 *
 * @state: Compiler state
 * @res: Token to inspect and possibly retag
 *
 * Returns zero if the token is a directive, otherwise -1; errno
 * is set to EINVAL when @state or @res is NULL
 */
static int
lexer_check_direc(struct cescal_state *state, struct token *res)
{
    int status = -1;

    if (state == NULL || res == NULL) {
        errno = EINVAL;
        return status;
    }

    if (strcmp(res->s, "define") == 0) {
        res->type = TT_DEFINE;
        status = 0;
    } else if (strcmp(res->s, "ifndef") == 0) {
        res->type = TT_IFNDEF;
        status = 0;
    } else if (strcmp(res->s, "ifdef") == 0) {
        res->type = TT_IFDEF;
        status = 0;
    }

    return status;
}
/*
 * Discard input up to and including the next newline, stopping
 * early if the input runs out ('\0' is read)
 *
 * @state: Compiler state
 */
static void
lexer_skip_comment(struct cescal_state *state)
{
    char ch;

    if (state == NULL) {
        return;
    }

    do {
        ch = lexer_consume_single(state, false);
    } while (ch != '\n' && ch != '\0');
}
/*
 * Scan the next token from the input
 *
 * Leading whitespace is skipped.  Single-character tokens store
 * their character in res->c; identifiers, keywords and directives
 * store their text in res->s.
 *
 * @state: Compiler state
 * @res: Token result is written here
 *
 * Returns zero on success.  Returns -1 when no more input is
 * available, when the input cannot be tokenized, or when an
 * argument is NULL (errno is then set to EINVAL).
 */
int
lexer_nom(struct cescal_state *state, struct token *res)
{
    char c, next;

    if (state == NULL || res == NULL) {
        errno = EINVAL;
        return -1;
    }

    if ((c = lexer_consume_single(state, true)) == '\0') {
        return -1;
    }

    switch (c) {
    case '(':
        res->type = TT_LPAREN;
        res->c = c;
        return 0;
    case ')':
        res->type = TT_RPAREN;
        res->c = c;
        return 0;
    case ',':
        res->type = TT_COMMA;
        res->c = c;
        return 0;
    case ':':
        res->type = TT_COLON;
        res->c = c;
        return 0;
    case '+':
        res->type = TT_PLUS;
        res->c = c;
        return 0;
    case '*':
        res->type = TT_STAR;
        res->c = c;
        return 0;
    case '=':
        res->type = TT_EQUALS;
        res->c = c;
        return 0;
    case '#':
        /* Whitespace between '#' and the directive name is allowed */
        if ((c = lexer_consume_single(state, true)) == '\0') {
            return -1;
        }

        /* A failed scan leaves res unpopulated; never report success */
        if (lexer_scan_ident(state, c, res) != 0) {
            return -1;
        }

        if (lexer_check_direc(state, res) != 0) {
            cc_error("bad directive '%s'\n", res->s);
            return -1;
        }

        return 0;
    case '/':
        res->c = c;

        /*
         * The second '/' must follow immediately: skipping whitespace
         * here would turn "a / // note" into a comment and lose the
         * slash token.
         */
        next = lexer_consume_single(state, false);
        if (next == '/') {
            res->type = TT_COMMENT;
            lexer_skip_comment(state);
            return 0;
        }

        lexer_putback(state, next);
        res->type = TT_SLASH;
        return 0;
    case '-':
        res->c = c;

        /* Likewise "->" is only an arrow when the two are adjacent */
        next = lexer_consume_single(state, false);
        if (next == '>') {
            res->type = TT_ARROW;
            return 0;
        }

        lexer_putback(state, next);
        res->type = TT_MINUS;
        return 0;
    default:
        if (lexer_scan_ident(state, c, res) == 0) {
            lexer_check_kw(state, res);
            return 0;
        }
        break;
    }

    cc_error("got unknown token '%c'\n", c);
    return -1;
}