Skip to content

Commit 35c7dbb

Browse files
author
Heitor Danilo
committed
chore: file tree
1 parent fb04026 commit 35c7dbb

File tree

10 files changed

+297
-125
lines changed

10 files changed

+297
-125
lines changed

src/lexer/ch.c

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
// This file provides functions for character recognition and handling.
2+
3+
#include <stdlib.h>
4+
#include <string.h>
5+
6+
#include <lexer/ch.h>
7+
8+
/**
9+
* Determines whether a given character represents a number.
10+
*
11+
* @param ch The character being evaluated.
12+
*
13+
* @return 1 if the character is numeric, 0 otherwise.
14+
*/
15+
int is_numeric(chcode_t ch) {
    /* Anything ordered before '0' cannot be a decimal digit. */
    if (ch < '0') {
        return 0;
    }

    /* What remains is a digit exactly when it does not exceed '9'. */
    return ch <= '9';
}
18+
19+
/**
20+
* Some programming languages allow special characters for variable declaration, such as "$" in
21+
* Javascript. To accommodate this, during tokenization, this character must be considered a
22+
* letter just like any other [a-zA-Z].
23+
*
24+
* During the syntax creation process, one of my main goals was to maintain readable code, for
25+
* this reason, as a personal choice, I decided to make characters like "?" and "!" available
26+
* for variable declaration. This happens because I believe that a function written for example
27+
* as "is_char?()" can be much simpler to understand for an inexperienced person than simply
28+
* "is_char".
29+
*
30+
* A number of other special characters are available for the same reason (or simply to give
31+
* freedom to the developer).
32+
*
33+
* @param ch The character being evaluated.
34+
*
35+
* @return 1 if the character is a letter, 0 otherwise.
36+
*/
37+
int is_letter(chcode_t ch) {
    /* Plain ASCII letters, lower case first and then upper case. */
    if (ch >= 'a' && ch <= 'z') {
        return 1;
    }
    if (ch >= 'A' && ch <= 'Z') {
        return 1;
    }

    /*
     * Special characters that are also accepted inside identifiers.
     * Extend the list below to allow more; adding `ch == '-'` would
     * enable kebab-case names.
     */
    return ch == '_' || ch == '?' || ch == '!' || ch == '$';
}
47+
48+
/**
49+
* Checks if the character sequence forms a compound symbol.
50+
*
51+
* @param curr_char The current character being evaluated.
52+
* @param next_char The character following the current character.
53+
*
54+
* @return 1 if the characters form a compound symbol, 0 otherwise.
55+
*/
56+
int is_compound_symbol(chcode_t curr_char, chcode_t next_char) {
    /* Characters that may take part in a two-character operator. */
    static const char operator_chars[] = {'+', '-', '=', '!', '*', '<', '>'};
    int curr_matches = 0;
    int next_matches = 0;

    /* One pass over the table classifies both characters. */
    for (size_t i = 0; i < sizeof operator_chars; ++i) {
        if (curr_char == operator_chars[i]) {
            curr_matches = 1;
        }
        if (next_char == operator_chars[i]) {
            next_matches = 1;
        }
    }

    /*
     * NOTE(review): every pairing of two operator characters is accepted,
     * including ones such as "=-" or "*<" - confirm that this is intended.
     */
    return curr_matches && next_matches;
}
62+
63+
/**
64+
* In many programming languages, numeric literals can include a sign (+ or -) to
65+
* indicate whether the number is positive or negative. However, this sign is not
66+
* considered part of the number itself; it's treated as an operator that operates
67+
* on the number.
68+
*
69+
* For instance, in the expression "-5", the "-" is a unary negation operator that
70+
* operates on the number "5". Similarly, in the expression "+5", the "+" is a unary
71+
* plus operator.
72+
*
73+
* This function, `is_number()`, is used to determine whether a given character
74+
* sequence represents a number. This includes both signed numbers, where the first
75+
* character is a sign ('-' or '+') followed by a numeric character, as well as
76+
* unsigned numbers, where the character is numeric. It returns true if the first
77+
* character is numeric or if it's a sign and the second character is numeric.
78+
* Otherwise, it returns false.
79+
*
80+
* @param curr_char The current character being evaluated. This could be a numeric
81+
* character or a sign ('+' or '-').
82+
*
83+
* @param next_char The character following the current character. Used for evaluating
84+
* if a sequence is representing a signed number.
85+
*
86+
* @return 1 if is number, 0 otherwise.
87+
*/
88+
int is_number(chcode_t curr_char, chcode_t next_char) {
    /* An unsigned literal starts directly with a digit. */
    if (is_numeric(curr_char)) {
        return 1;
    }

    /* Otherwise only a sign can introduce a number... */
    if (curr_char != '-' && curr_char != '+') {
        return 0;
    }

    /* ...and only when a digit follows it immediately. */
    return is_numeric(next_char) != 0;
}
91+
92+
/**
93+
* As in other languages, some special characters are allowed to create variable names,
94+
* but you are not allowed to start with these characters.
95+
*
96+
* Let's take "!" for example:
97+
*
98+
* ! is an operator character, if the lexer sees a !, it must
99+
* create a specific token for it. Even so, if the language considers "!" as a letter
100+
* (to create var names), so the lexer will tokenize it as a variable. Because of this,
101+
* some languages must block these special characters to be the first letter of the var.
102+
*
103+
* So if the lexer sees any of these special characters, it must create two tokens:
104+
* !my_var: one for "!" and other for "my_var"
105+
*
106+
* If the lexer sees any of these characters after a letter, the lexer must create a single token:
107+
* my_!var: "my_!var"
108+
*
109+
* @param ch The current character being evaluated.
110+
*
111+
* @return 1 if the character may start an identifier, 0 otherwise.
112+
*/
113+
int is_allowed_as_first_char(chcode_t ch) {
    /*
     * '.', '?' and '!' may never open an identifier. Add `ch == '-'` to
     * this set when kebab-case is enabled in is_letter().
     */
    const int is_blocked = ch == '.' || ch == '?' || ch == '!';

    return !is_blocked;
}

src/lexer/ch.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
/* Character classification helpers used by the lexer (implemented in lexer/ch.c). */
#ifndef LEXER_CH_H
2+
#define LEXER_CH_H
3+
4+
#include <langdef.h>
5+
6+
/* Returns 1 when ch is an ASCII digit '0'..'9', 0 otherwise. */
int is_numeric(chcode_t ch);
7+
/* Returns 1 when ch is [a-zA-Z] or one of '_', '?', '!', '$'; 0 otherwise. */
int is_letter(chcode_t ch);
8+
/* Returns 0 for '.', '?' and '!', and 1 for every other character. */
int is_allowed_as_first_char(chcode_t ch);
9+
/* Returns 1 when both characters belong to the set + - = ! * < >, 0 otherwise. */
int is_compound_symbol(chcode_t curr_char, chcode_t next_char);
10+
/* Returns 1 when curr_char is a digit, or a '+'/'-' sign followed by a digit in next_char. */
int is_number(chcode_t curr_char, chcode_t next_char);
11+
12+
#endif /* LEXER_CH_H */

src/lexer/def.h

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,22 @@
44
#include <langdef.h>
55
#include <token/def.h>
66

7+
typedef struct Lexer Lexer;
78
/* Lexer state together with the operations that new_lexer() binds to it. */
struct Lexer {
8-
char* input;
9-
int input_length;
10-
int position;
11-
int read_position;
12-
byte ch;
13-
int line;
14-
int column;
9+
/* Text being tokenized; new_lexer() stores the caller's pointer without copying. */
char* input;
10+
/* strlen(input), computed once in new_lexer(). */
int input_length;
11+
/* Character currently under examination; updated by consume_char. */
chcode_t ch;
12+
/* Index in `input` of the character held in `ch`. */
int position;
13+
/* Index in `input` of the next character to be consumed. */
int read_position;
14+
/* Current line: starts at 1 and is incremented for every '\n' consumed. */
int line;
15+
/* Current column: starts at 0, set to 1 on '\n', otherwise incremented per consumed character. */
int column;
1516

16-
void (*consume_char)(struct Lexer *self);
17-
char *(*read_sequence)(struct Lexer *self);
18-
void (*jump_whitespace)(struct Lexer *self);
19-
byte (*peek_prev_char)(struct Lexer *self);
20-
byte (*peek_next_char)(struct Lexer *self);
17+
/* Advances to the next input character; bound to __LEXER_consume_char. */
void (*consume_char)(Lexer *self);
18+
/* Returns a malloc'd copy of the character run at the current position; bound to __LEXER_read_sequence. */
char *(*read_sequence)(Lexer *self);
19+
/* Skips ' ', '\t', '\n' and '\r'; bound to __LEXER_jump_whitespace. */
void (*jump_whitespace)(Lexer *self);
20+
/* Returns the next character without consuming it, or 0 at end of input; bound to __LEXER_peek_next_char. */
chcode_t (*peek_next_char)(Lexer *self);
2121

22-
struct Token *(*consume_token)(struct Lexer *lex);
22+
/* Bound to __LEXER_consume_token by new_lexer(). */
Token *(*consume_token)(Lexer *lex);
2323
};
2424

2525
#endif /* LEXER_DEF_H */

src/lexer/init.c

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,11 @@
1-
#include <string.h>
1+
// This file includes the function for creating a new lexer object and initializing the lexer.
2+
23
#include <stdlib.h>
4+
#include <string.h>
35

4-
#include <lexer/lib.h>
6+
#include <lexer/init.h>
7+
#include <lexer/nav.h>
8+
#include <lexer/tokenization.h>
59

610
/**
711
* Creates a new lexer object for tokenizing the input string.
@@ -10,25 +14,24 @@
1014
*
1115
* @return A pointer to the newly created lexer object
1216
*/
13-
struct Lexer* new_lexer(char* input) {
14-
struct Lexer *lex = malloc(sizeof(struct Lexer));
17+
Lexer* new_lexer(char* input) {
18+
Lexer *l = malloc(sizeof(Lexer));
1519

16-
lex->input = input;
17-
lex->position = 0;
18-
lex->read_position = 0;
19-
lex->ch = 0;
20-
lex->input_length = strlen(input);
21-
lex->line = 1;
22-
lex->column = 0;
20+
l->input = input;
21+
l->position = 0;
22+
l->read_position = 0;
23+
l->ch = 0;
24+
l->input_length = strlen(input);
25+
l->line = 1;
26+
l->column = 0;
2327

24-
lex->consume_char = __LEXER_consume_char;
25-
lex->jump_whitespace = __LEXER_jump_whitespace;
26-
lex->peek_next_char = __LEXER_peek_next_char;
27-
lex->peek_prev_char = __LEXER_peek_prev_char;
28-
lex->read_sequence = __LEXER_read_sequence;
29-
lex->consume_token = __LEXER_consume_token;
28+
l->consume_char = __LEXER_consume_char;
29+
l->jump_whitespace = __LEXER_jump_whitespace;
30+
l->peek_next_char = __LEXER_peek_next_char;
31+
l->read_sequence = __LEXER_read_sequence;
32+
l->consume_token = __LEXER_consume_token;
3033

31-
lex->consume_char(lex);
34+
l->consume_char(l);
3235

33-
return lex;
36+
return l;
3437
}

src/lexer/init.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef LEXER_INIT_H
2+
#define LEXER_INIT_H
3+
4+
#include <lexer/def.h>
5+
6+
/*
 * Allocates a Lexer with malloc(), stores `input` by pointer (the string is
 * not copied), binds the __LEXER_* operations and consumes the first character.
 */
Lexer *new_lexer(char* input);
7+
8+
#endif /* LEXER_INIT_H */

src/lexer/lib.h

Lines changed: 0 additions & 16 deletions
This file was deleted.

src/lexer/nav.c

Lines changed: 41 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,78 +1,78 @@
1-
#include <string.h>
2-
#include <stdlib.h>
1+
// This file contains navigation functions for the lexer.
32

4-
#include <utils/chardef.h>
3+
#include <stdlib.h>
4+
#include <string.h>
55

6-
#include <lexer/lib.h>
6+
#include <lexer/ch.h>
7+
#include <lexer/nav.h>
78

89
/**
910
* Advances the lexer to the next character in the input stream.
1011
*
1112
* This function will also update current line, and column information,
12-
* and detects the end of the lexer input stream (EOF)
13+
* and detects the end of the lexer input stream (EOF)
1314
*
14-
* @param lex A pointer to a lexer object
15+
* @param self A pointer to a lexer object
1516
*
1617
* @return void
1718
*/
18-
void __LEXER_consume_char(struct Lexer* lex) {
19-
if (lex->read_position >= lex->input_length) {
20-
lex->ch = '\0';
19+
void __LEXER_consume_char(Lexer *self) {
    /*
     * Past the end of the input the current character becomes '\0' (EOF).
     * The buffer is only indexed while read_position is in range, so
     * repeated calls after EOF never read beyond the input string.
     */
    if (self->read_position >= self->input_length) {
        self->ch = '\0';
    } else {
        self->ch = self->input[self->read_position];
    }

    /* A newline starts a new line; any other character advances the column. */
    if (self->ch == '\n') {
        self->column = 1;
        ++self->line;
    } else {
        ++self->column;
    }

    /* `position` tracks the character now held in `ch`; read_position runs one ahead. */
    self->position = self->read_position;
    ++self->read_position;
}
3536

36-
byte __LEXER_peek_prev_char(struct Lexer* lexer) {
37-
if (lexer->read_position >= lexer->input_length) {
38-
return 0;
39-
}
40-
41-
return lexer->input[lexer->read_position - 2];
42-
}
43-
44-
byte __LEXER_peek_next_char(struct Lexer* lexer) {
45-
if (lexer->read_position >= lexer->input_length) {
37+
/**
38+
* Peeks at the next char of the lexer input without consuming it.
39+
*
40+
* @param self A pointer to a lexer object
41+
*
42+
* @return The next char, or 0 when the end of the input has been reached.
43+
*/
44+
chcode_t __LEXER_peek_next_char(Lexer *self) {
    /* read_position is the index of the character that would be consumed next. */
    if (self->read_position < self->input_length) {
        return self->input[self->read_position];
    }

    /* Nothing left to look at: 0 signals the end of the input. */
    return 0;
}
5151

5252
/**
5353
* Reads a run of letter, digit, '.' and signed-number characters starting at the current lexer position.
5454
*
55-
* @param lex A pointer to a lexer object
55+
* @param self A pointer to a lexer object
5656
*
5757
* @return A dynamically allocated string containing the read sequence, or NULL if there was an error.
5858
*/
59-
char* __LEXER_read_sequence(struct Lexer* lexer) {
60-
int position = lexer->position;
59+
char* __LEXER_read_sequence(Lexer *self) {
60+
int position = self->position;
6161

62-
while ((is_letter(lexer->ch) || is_numeric(lexer->ch) || lexer->ch == '.') ||
62+
while ((is_letter(self->ch) || is_numeric(self->ch) || self->ch == '.') ||
6363
// signed numbers will match with this condition
64-
((lexer->ch == '-' || lexer->ch == '+') && is_numeric(__LEXER_peek_next_char(lexer)))) {
65-
__LEXER_consume_char(lexer);
64+
((self->ch == '-' || self->ch == '+') && is_numeric(__LEXER_peek_next_char(self)))) {
65+
__LEXER_consume_char(self);
6666
}
6767

68-
int length = lexer->position - position;
68+
int length = self->position - position;
6969

7070
char* result = malloc(length + 1);
7171
if (result == NULL) {
7272
return NULL;
7373
}
7474

75-
memcpy(result, lexer->input + position, length + 1);
75+
memcpy(result, self->input + position, length + 1);
7676

7777
result[length] = '\0';
7878

@@ -86,12 +86,13 @@ char* __LEXER_read_sequence(struct Lexer* lexer) {
8686
/**
8787
* Skips over whitespace characters in the lexer input stream.
8888
*
89-
* @param lex A pointer to a lexer objec
89+
* @param self A pointer to a lexer object
9090
*
9191
* @return void
9292
*/
93-
void __LEXER_jump_whitespace(struct Lexer* lexer) {
94-
while (lexer->ch == ' ' || lexer->ch == '\t' || lexer->ch == '\n' || lexer->ch == '\r') {
95-
__LEXER_consume_char(lexer);
93+
void __LEXER_jump_whitespace(Lexer *self) {
    for (;;) {
        /* Space, tab, line feed and carriage return are the skipped characters. */
        const int is_blank = self->ch == ' ' || self->ch == '\t' ||
                             self->ch == '\n' || self->ch == '\r';

        if (!is_blank) {
            break;
        }

        __LEXER_consume_char(self);
    }
}
98+

0 commit comments

Comments
 (0)