heiytor
diff --git a/‎src/lexer/ch.c‎
Lines changed: 118 additions & 0 deletions b/‎src/lexer/ch.c‎
Lines changed: 118 additions & 0 deletions
diff --git a/‎src/lexer/ch.h‎
Lines changed: 12 additions & 0 deletions b/‎src/lexer/ch.h‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎src/lexer/def.h‎
Lines changed: 13 additions & 13 deletions b/‎src/lexer/def.h‎
Lines changed: 13 additions & 13 deletions
diff --git a/‎src/lexer/init.c‎
Lines changed: 22 additions & 19 deletions b/‎src/lexer/init.c‎
Lines changed: 22 additions & 19 deletions
diff --git a/‎src/lexer/init.h‎
Lines changed: 8 additions & 0 deletions b/‎src/lexer/init.h‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎src/lexer/lib.h‎
Lines changed: 0 additions & 16 deletions b/‎src/lexer/lib.h‎
Lines changed: 0 additions & 16 deletions
diff --git a/‎src/lexer/nav.c‎
Lines changed: 41 additions & 40 deletions b/‎src/lexer/nav.c‎
Lines changed: 41 additions & 40 deletions
@@ -0,0 +1,118 @@
+// This file provides functions for character recognition and handling.
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <lexer/ch.h>
+
+/**
+ * Tells whether a character is one of the decimal digits '0' through '9'.
+ *
+ * @param ch The character code to classify.
+ *
+ * @return 1 when ch is a decimal digit, 0 for anything else.
+ */
+int is_numeric(chcode_t ch) {
+ return !(ch < '0' || ch > '9');
+}
+
+/**
+ * Some programming languages allow special characters for variable declaration, such as "$" in
+ * Javascript. To accommodate this, during tokenization, this character must be considered a
+ * letter just like any other [a-zA-Z]. 
+ * 
+ * During the syntax creation process, one of my main goals was to maintain readable code, for
+ * this reason, as a personal choice, I decided to make characters like "?" and "!" available
+ * for variable declaration. This happens because I believe that a function written for example
+ * as "is_char?()" can be much simpler to understand for an inexperienced person than simply
+ * "is_char".
+ * 
+ * A number of other special characters are available for the same reason (or simply to give
+ * freedom to the developer).
+ *
+ * @param ch The character being evaluated.
+ * 
+ * @return 1 if the character is a letter, 0 otherwise.
+ */
+int is_letter(chcode_t ch) {
+ const int is_alpha = ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z');
+ // Extra characters accepted inside identifiers; add new ones to the list below.
+ return is_alpha ||
+  ch == '_' ||
+  // ch == '-' || // uncomment to allow kebab-case
+  ch == '$' ||
+  ch == '?' ||
+  ch == '!';
+}
+
+/**
+ * Checks if the character sequence forms a compound symbol.
+ * 
+ * @param curr_char The current character being evaluated.
+ * @param next_char The character following the current character.
+ * 
+ * @return 1 if the characters form a compound symbol, 0 otherwise.
+ */
+int is_compound_symbol(chcode_t curr_char, chcode_t next_char) {
+ const int first_is_op = curr_char == '+' || curr_char == '-' || curr_char == '=' || curr_char == '!' || curr_char == '*' || curr_char == '<' || curr_char == '>';
+ if (!first_is_op) { return 0; }
+ // The pair only counts when the follower is an operator character as well.
+ return next_char == '+' || next_char == '-' || next_char == '=' || next_char == '!' || next_char == '*' || next_char == '<' || next_char == '>';
+}
+
+/**
+ * In many programming languages, numeric literals can include a sign (+ or -) to 
+ * indicate whether the number is positive or negative. However, this sign is not 
+ * considered part of the number itself; it's treated as an operator that operates 
+ * on the number.
+ *
+ * For instance, in the expression "-5", the "-" is a unary negation operator that 
+ * operates on the number "5". Similarly, in the expression "+5", the "+" is a unary 
+ * plus operator.
+ *
+ * This function, `is_number()`, is used to determine whether a given character 
+ * sequence represents a number. This includes both signed numbers, where the first 
+ * character is a sign ('-' or '+') followed by a numeric character, as well as 
+ * unsigned numbers, where the character is numeric. It returns true if the first 
+ * character is numeric or if it's a sign and the second character is numeric. 
+ * Otherwise, it returns false.
+ * 
+ * @param curr_char The current character being evaluated. This could be a numeric 
+ * character or a sign ('+' or '-').
+ * 
+ * @param next_char The character following the current character. Used for evaluating 
+ * if a sequence is representing a signed number.
+ * 
+ * @return 1 if is number, 0 otherwise.
+ */
+int is_number(chcode_t curr_char, chcode_t next_char) {
+ return is_numeric(curr_char) ? 1 : ((curr_char == '+' || curr_char == '-') && is_numeric(next_char));
+}
+
+/**
+ * As in other languages, some special characters are allowed to create variable names,
+ * but you are not allowed to start with these characters.
+ *
+ * Let's take "!" for example:
+ * 
+ * "!" is an operator character: when the lexer sees a "!", it must
+ * create a specific token for it. However, if the language also treats "!" as a letter
+ * (to allow it in var names), the lexer would tokenize it as part of a variable. Because of
+ * this, these special characters must be blocked from being the first letter of a var.
+ * 
+ * So if the lexer sees any of these special characters, it must create two tokens:
+ * !my_var: one for "!" and other for "my_var"
+ * 
+ * If the lexer sees any of these characters after a letter, the lexer must create a single token:
+ * my_!var: "my_!var"
+ * 
+ * @param ch The current character being evaluated.
+ * 
+ * @return 1 if ch is allowed as the first character of a name, 0 otherwise.
+ */
+int is_allowed_as_first_char(chcode_t ch) {
+ // De Morgan form of the rule: anything except '.', '?' and '!' may lead.
+ return !(ch == '.' ||
+  // ch == '-' || // uncomment to allow kebab-case
+  ch == '?' ||
+  ch == '!');
+}
@@ -0,0 +1,12 @@
+#ifndef LEXER_CH_H
+#define LEXER_CH_H
+
+#include <langdef.h>
+
+int is_numeric(chcode_t ch);
+int is_letter(chcode_t ch);
+int is_allowed_as_first_char(chcode_t ch);
+int is_compound_symbol(chcode_t curr_char, chcode_t next_char);
+int is_number(chcode_t curr_char, chcode_t next_char);
+
+#endif /* LEXER_CH_H */
@@ -4,22 +4,22 @@
 #include <langdef.h>
 #include <token/def.h>
 
+typedef struct Lexer Lexer; // lets the members below say Lexer without the struct keyword
 struct Lexer {
- char* input;
- int input_length;
- int position;
- int read_position;
- byte ch;
- int line;
- int column;
+ char*  input; // text being tokenized; new_lexer stores the pointer it is given (no copy)
+ int  input_length; // strlen(input), computed once in new_lexer
+ chcode_t ch; // character most recently loaded by consume_char
+ int  position; // index in input of ch
+ int read_position; // index of the next character to load (position + 1 after a consume)
+ int  line; // starts at 1; consume_char increments it on every '\n'
+ int  column; // starts at 0; +1 per consumed character, reset to 1 on '\n'
 
- void (*consume_char)(struct Lexer *self);
- char *(*read_sequence)(struct Lexer *self);
- void (*jump_whitespace)(struct Lexer *self);
- byte (*peek_prev_char)(struct Lexer *self);
- byte (*peek_next_char)(struct Lexer *self);
+ void (*consume_char)(Lexer *self); // new_lexer binds this to __LEXER_consume_char
+ char *(*read_sequence)(Lexer *self); // new_lexer binds this to __LEXER_read_sequence
+ void (*jump_whitespace)(Lexer *self); // new_lexer binds this to __LEXER_jump_whitespace
+ chcode_t (*peek_next_char)(Lexer *self); // new_lexer binds this to __LEXER_peek_next_char
 
- struct Token *(*consume_token)(struct Lexer *lex);
+ Token  *(*consume_token)(Lexer *lex); // new_lexer binds this to __LEXER_consume_token
 };
 
 #endif /* LEXER_DEF_H */
@@ -1,7 +1,11 @@
-#include <string.h>
+// This file includes the function for creating a new lexer object and initializing the lexer.
+
 #include <stdlib.h>
+#include <string.h>
 
-#include <lexer/lib.h>
+#include <lexer/init.h>
+#include <lexer/nav.h>
+#include <lexer/tokenization.h>
 
 /**
  * Creates a new lexer object for tokenizing the input string.
@@ -10,25 +14,24 @@
  *
  * @return A pointer to the newly created lexer object
  */
-struct Lexer* new_lexer(char* input) {
- struct Lexer *lex = malloc(sizeof(struct Lexer));
+Lexer* new_lexer(char* input) {
+ Lexer *l = malloc(sizeof *l);
 
- lex->input = input;
- lex->position = 0;
- lex->read_position = 0;
- lex->ch = 0;
- lex->input_length = strlen(input);
- lex->line = 1;
- lex->column = 0;
+ if (l == NULL) { return NULL; } // out of memory: report failure instead of dereferencing NULL
+ l->input = input; // pointer is stored, not copied: input must outlive the lexer
+ l->input_length = (int)strlen(input); // explicit narrowing, the field is an int
+ l->position = l->read_position = 0;
+ l->ch = 0;
+ l->line = 1;
+ l->column = 0;
 
- lex->consume_char = __LEXER_consume_char;
- lex->jump_whitespace = __LEXER_jump_whitespace;
- lex->peek_next_char = __LEXER_peek_next_char;
- lex->peek_prev_char = __LEXER_peek_prev_char;
- lex->read_sequence = __LEXER_read_sequence;
- lex->consume_token = __LEXER_consume_token;
+ l->consume_char = __LEXER_consume_char;
+ l->jump_whitespace = __LEXER_jump_whitespace;
+ l->peek_next_char = __LEXER_peek_next_char;
+ l->read_sequence = __LEXER_read_sequence;
+ l->consume_token = __LEXER_consume_token;
 
- lex->consume_char(lex);
+ l->consume_char(l); // prime ch/position with the first character of input
 
- return lex;
+ return l;
 }
@@ -0,0 +1,8 @@
+#ifndef LEXER_INIT_H
+#define LEXER_INIT_H
+
+#include <lexer/def.h>
+
+Lexer *new_lexer(char* input);
+
+#endif /* LEXER_INIT_H */
@@ -1,78 +1,78 @@
-#include <string.h>
-#include <stdlib.h>
+// This file contains navigation functions for the lexer.
 
-#include <utils/chardef.h>
+#include <stdlib.h>
+#include <string.h>
 
-#include <lexer/lib.h>
+#include <lexer/ch.h>
+#include <lexer/nav.h>
 
 /**
  * Advances the lexer to the next character in the input stream.
  * 
  * This function will also update current line, and column information, 
- * and detects the end of the lexer input stream (EOF)
+ * and detects the end of the lexer input stream (EOF), after which ch stays '\0'.
  * 
- * @param lex A pointer to a lexer object
+ * @param self A pointer to a lexer object
  * 
  * @return void
  */
-void __LEXER_consume_char(struct Lexer* lex) {
- if (lex->read_position >= lex->input_length) {
- lex->ch = '\0';
+void __LEXER_consume_char(Lexer *self) {
+ if (self->read_position >= self->input_length) {
+ self->ch = '\0'; // EOF: nothing left to load, input is not indexed on this path
  }
 
- lex->ch = lex->input[lex->read_position];
+ if (self->read_position < self->input_length) { self->ch = self->input[self->read_position]; } // in bounds only
 
- if (lex->ch == '\n') {
- lex->column = 1;
- ++lex->line;
+ if (self->ch == '\n') {
+ self->column = 1;
+ ++self->line;
  } else {
- ++lex->column;
+ ++self->column;
  }
 
- lex->position = lex->read_position;
- ++lex->read_position;
+ self->position = self->read_position;
+ ++self->read_position;
 }
 
-byte __LEXER_peek_prev_char(struct Lexer* lexer) {
- if (lexer->read_position >= lexer->input_length) {
- return 0;
- }
- 
- return lexer->input[lexer->read_position - 2];
-}
-
-byte __LEXER_peek_next_char(struct Lexer* lexer) {
- if (lexer->read_position >= lexer->input_length) {
+/**
+ * Returns the char at read_position (the next one to be consumed) without advancing the lexer.
+ * 
+ * @param self A pointer to a lexer object
+ * 
+ * @return The next char, or 0 when read_position is at or past the end of the input.
+ */
+chcode_t __LEXER_peek_next_char(Lexer *self) {
+ if (self->read_position >= self->input_length) {
  return 0;
  }
 
- return lexer->input[lexer->read_position];
+ return self->input[self->read_position];
 }
 
 /**
  * Reads a sequence of characters from the lexer input based on the specified type.
  * 
- * @param lex A pointer to a lexer object
+ * @param self A pointer to a lexer object
  * 
  * @return A dynamically allocated string containing the read sequence, or NULL if there was an error.
  */
-char* __LEXER_read_sequence(struct Lexer* lexer) {
- int position = lexer->position;
+char* __LEXER_read_sequence(Lexer *self) {
+ int position = self->position;
 
- while ((is_letter(lexer->ch) || is_numeric(lexer->ch) || lexer->ch == '.') ||
+ while ((is_letter(self->ch) || is_numeric(self->ch) || self->ch == '.') ||
  // signed numbers will match with this condition
- ((lexer->ch == '-' || lexer->ch == '+') && is_numeric(__LEXER_peek_next_char(lexer)))) {
- __LEXER_consume_char(lexer);
+ ((self->ch == '-' || self->ch == '+') && is_numeric(__LEXER_peek_next_char(self)))) {
+ __LEXER_consume_char(self);
  }
 
- int length = lexer->position - position;
+ int length = self->position - position;
 
  char* result = malloc(length + 1);
  if (result == NULL) {
  return NULL;
  }
 
- memcpy(result, lexer->input + position, length + 1);
+ memcpy(result, self->input + position, length + 1);
 
  result[length] = '\0';
 
@@ -86,12 +86,13 @@ char* __LEXER_read_sequence(struct Lexer* lexer) {
 /**
  * Skips over whitespace characters in the lexer input stream.
  *
- * @param lex A pointer to a lexer objec
+ * @param self A pointer to a lexer object
  * 
  * @return void
  */
-void __LEXER_jump_whitespace(struct Lexer* lexer) {
- while (lexer->ch == ' ' || lexer->ch == '\t' || lexer->ch == '\n' || lexer->ch == '\r') {
- __LEXER_consume_char(lexer);
+void __LEXER_jump_whitespace(Lexer *self) {
+ while (self->ch == ' ' || self->ch == '\t' || self->ch == '\n' || self->ch == '\r') { // space, tab, LF, CR
+ __LEXER_consume_char(self);
  }
-}
+}
+