Skip to content

Commit e5dbf1d

Browse files
author
Heitor Danilo
committed
refactor lexer
1 parent e4a11c4 commit e5dbf1d

File tree

6 files changed

+152
-73
lines changed

6 files changed

+152
-73
lines changed

src/lexer/actions.h

Lines changed: 0 additions & 28 deletions
This file was deleted.

src/lexer/def.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#ifndef LEXER_DEF_H
2+
#define LEXER_DEF_H
3+
4+
#include <langdef.h>
5+
#include <token/def.h>
6+
7+
struct Lexer {
8+
char* input;
9+
int input_length;
10+
int position;
11+
int read_position;
12+
byte ch;
13+
int line;
14+
int column;
15+
16+
void (*next_char)(struct Lexer *self);
17+
char *(*read_sequence)(struct Lexer *self);
18+
void (*jump_whitespace)(struct Lexer *self);
19+
byte (*peek_prev_char)(struct Lexer *self);
20+
byte (*peek_next_char)(struct Lexer *self);
21+
22+
struct Token *(*next_token)(struct Lexer *lex);
23+
};
24+
25+
#endif /* LEXER_DEF_H */

src/lexer/init.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
#include <string.h>
2+
#include <stdlib.h>
3+
4+
#include <lexer/lib.h>
5+
6+
/**
7+
* Creates a new lexer object for tokenizing the input string.
8+
*
9+
* @param input A pointer to the input string to be tokenized
10+
*
11+
* @return A pointer to the newly created lexer object
12+
*/
13+
struct Lexer* new_lexer(char* input) {
14+
struct Lexer *lex = malloc(sizeof(struct Lexer));
15+
16+
lex->input = input;
17+
lex->position = 0;
18+
lex->read_position = 0;
19+
lex->ch = 0;
20+
lex->input_length = strlen(input);
21+
lex->line = 1;
22+
lex->column = 0;
23+
24+
lex->next_char = __next_char;
25+
lex->jump_whitespace = __jump_whitespace;
26+
lex->peek_next_char = __peek_next_char;
27+
lex->peek_prev_char = __peek_prev_char;
28+
lex->read_sequence = __read_sequence;
29+
lex->next_token = __next_token;
30+
31+
lex->next_char(lex);
32+
33+
return lex;
34+
}

src/lexer/lib.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#ifndef LEXER_LIB_H
2+
#define LEXER_LIB_H
3+
4+
#include <lexer/def.h>
5+
#include <token/def.h>
6+
7+
struct Lexer* new_lexer(char* input);
8+
9+
void __next_char(struct Lexer* l);
10+
char* __read_sequence(struct Lexer* l);
11+
void __jump_whitespace(struct Lexer* l);
12+
byte __peek_prev_char(struct Lexer *l);
13+
byte __peek_next_char(struct Lexer *l);
14+
struct Token *__next_token(struct Lexer *l);
15+
16+
#endif /* LEXER_LIB_H */
Lines changed: 24 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,9 @@
11
#include <string.h>
2-
#include <stdio.h>
32
#include <stdlib.h>
43

5-
#include <helpers/characters.h>
4+
#include <utils/chardef.h>
65

7-
#include <lexer/actions.h>
8-
9-
/**
10-
* Creates a new lexer object for tokenizing the input string.
11-
*
12-
* @param input A pointer to the input string to be tokenized
13-
*
14-
* @return A pointer to the newly created lexer object
15-
*/
16-
struct Lexer* new_lexer(char* input) {
17-
struct Lexer *lex = malloc(sizeof(struct Lexer));
18-
19-
lex->input = input;
20-
lex->position = 0;
21-
lex->read_position = 0;
22-
lex->ch = 0;
23-
lex->input_length = strlen(input);
24-
lex->line = 1;
25-
lex->column = 0;
26-
27-
next_char(lex);
28-
29-
return lex;
30-
}
6+
#include <lexer/lib.h>
317

328
/**
339
* Advances the lexer to the next character in the input stream.
@@ -39,7 +15,7 @@ struct Lexer* new_lexer(char* input) {
3915
*
4016
* @return void
4117
*/
42-
void next_char(struct Lexer* lex) {
18+
void __next_char(struct Lexer* lex) {
4319
if (lex->read_position >= lex->input_length) {
4420
lex->ch = '\0';
4521
}
@@ -57,7 +33,15 @@ void next_char(struct Lexer* lex) {
5733
++lex->read_position;
5834
}
5935

60-
byte peek_next_char(struct Lexer *lexer) {
36+
byte __peek_prev_char(struct Lexer *lexer) {
37+
if (lexer->read_position >= lexer->input_length) {
38+
return 0;
39+
}
40+
41+
return lexer->input[lexer->read_position - 2];
42+
}
43+
44+
byte __peek_next_char(struct Lexer *lexer) {
6145
if (lexer->read_position >= lexer->input_length) {
6246
return 0;
6347
}
@@ -72,11 +56,13 @@ byte peek_next_char(struct Lexer *lexer) {
7256
*
7357
* @return A dynamically allocated string containing the read sequence, or NULL if there was an error.
7458
*/
75-
char* read_sequence(struct Lexer *lex) {
59+
char* __read_sequence(struct Lexer *lex) {
7660
int position = lex->position;
7761

78-
while (is_letter(lex->ch) || is_numeric(lex->ch)) {
79-
next_char(lex);
62+
while ((is_letter(lex->ch) || is_numeric(lex->ch)) ||
63+
// signed numbers will match with this condition
64+
(lex->ch == '-' && is_numeric(__peek_next_char(lex)))) {
65+
__next_char(lex);
8066
}
8167

8268
int length = lex->position - position;
@@ -87,32 +73,25 @@ char* read_sequence(struct Lexer *lex) {
8773
}
8874

8975
memcpy(result, lex->input + position, length + 1);
76+
77+
result[length] = '\0';
9078

91-
// The parser will throw an error if token->literal contains both letters and digits
92-
if (is_letter(result[length]) == false && is_numeric(result[length]) == false) {
93-
// result == `[x-byte]\0\0`
94-
result[length] = '\0';
95-
// Remove the last null terminator
96-
result = realloc(result, length);
97-
98-
if (result == NULL) {
99-
return NULL;
100-
}
101-
}
79+
// if (!is_valid_char_in_sequence(result[length])) {
80+
// result[length] = '\0';
81+
// }
10282

10383
return result;
10484
}
10585

106-
10786
/**
10887
* Skips over whitespace characters in the lexer input stream.
10988
*
11089
* @param lex A pointer to a lexer objec
11190
*
11291
* @return void
11392
*/
114-
void jump_whitespace(struct Lexer* lex) {
93+
void __jump_whitespace(struct Lexer* lex) {
11594
while (lex->ch == ' ' || lex->ch == '\t' || lex->ch == '\n' || lex->ch == '\r') {
116-
next_char(lex);
95+
__next_char(lex);
11796
}
11897
}

src/lexer/tokenization.c

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#include <string.h>
2+
3+
#include <utils/chardef.h>
4+
5+
#include <lexer/lib.h>
6+
#include <token/lib.h>
7+
8+
/**
9+
* Gets the next token from the lexer.
10+
*
11+
* @param lex The lexer.
12+
*
13+
* @return A pointer to the next token.
14+
*/
15+
struct Token *__next_token(struct Lexer *lex) {
16+
lex->jump_whitespace(lex);
17+
18+
bool is_char_allowed_for_start = is_allowed_as_first_char(lex->ch);
19+
byte next_literal = lex->peek_next_char(lex);
20+
21+
// e.g. 312 | 312542.423
22+
if ((is_numeric(lex->ch) && is_char_allowed_for_start) ||
23+
// e.g my_var | my_var1
24+
(is_letter(lex->ch) && is_char_allowed_for_start) ||
25+
// e.g +312 | -312542.423
26+
(is_signed_number(lex->ch, next_literal))) {
27+
char *curr_literal = lex->read_sequence(lex);
28+
uint8 code = is_numeric(lex->ch) ? NUMBER : get_ident_code(curr_literal);
29+
30+
// get correct column value
31+
int column = lex->column;
32+
if (column != 1) {
33+
column -= strlen(curr_literal);
34+
}
35+
36+
return new_token(curr_literal, code, lex->line, column);
37+
}
38+
39+
char curr_literal[3] = {lex->ch, '\0', '\0'};
40+
int column = lex->column;
41+
42+
if (is_compound_symbol(curr_literal[0], next_literal)) {
43+
curr_literal[1] = next_literal;
44+
lex->next_char(lex);
45+
}
46+
47+
uint8 code = get_symbol_code(curr_literal);
48+
struct Token *tok = new_token(curr_literal, code, lex->line, column);
49+
50+
lex->next_char(lex);
51+
52+
return tok;
53+
}

0 commit comments

Comments
 (0)