// This file provides functions for recognizing and classifying characters.
2+
3+ #include <stdlib.h>
4+ #include <string.h>
5+
6+ #include <lexer/ch.h>
7+
8+ /**
9+ * Determines whether a given character represents a number.
10+ *
11+ * @param ch The character being evaluated.
12+ *
13+ * @return 1 if the character is numeric, 0 otherwise.
14+ */
15+ int is_numeric (chcode_t ch ) {
16+ return '0' <= ch && ch <= '9' ;
17+ }
18+
19+ /**
20+ * Some programming languages allow special characters for variable declaration, such as "$" in
21+ * Javascript. To accommodate this, during tokenization, this character must be considered a
22+ * letter just like any other [a-zA-Z].
23+ *
24+ * During the syntax creation process, one of my main goals was to maintain readable code, for
25+ * this reason, as a personal choice, I decided to make characters like "?" and "!" available
26+ * for variable declaration. This happens because I believe that a function written for example
27+ * as "is_char?()" can be much simpler to understand for an inexperienced person than simply
28+ * "is_char".
29+ *
30+ * A number of other special characters are available for the same reason (or simply to give
31+ * freedom to the developer).
32+ *
33+ * @param ch The character being evaluated.
34+ *
35+ * @return 1 if the character is a letter, 0 otherwise.
36+ */
37+ int is_letter (chcode_t ch ) {
38+ return 'a' <= ch && ch <= 'z' ||
39+ 'A' <= ch && ch <= 'Z' ||
40+ // Add special characters above
41+ ch == '_' ||
42+ // ch == '-' || // uncomment to allow kebab-case
43+ ch == '?' ||
44+ ch == '!' ||
45+ ch == '$' ;
46+ }
47+
48+ /**
49+ * Checks if the character sequence forms a compound symbol.
50+ *
51+ * @param curr_char The current character being evaluated.
52+ * @param next_char The character following the current character.
53+ *
54+ * @return 1 if the characters form a compound symbol, 0 otherwise.
55+ */
56+ int is_compound_symbol (chcode_t curr_char , chcode_t next_char ) {
57+ int actual = curr_char == '+' || curr_char == '-' || curr_char == '=' || curr_char == '!' || curr_char == '*' || curr_char == '<' || curr_char == '>' ;
58+ int next = next_char == '+' || next_char == '-' || next_char == '=' || next_char == '!' || next_char == '*' || next_char == '<' || next_char == '>' ;
59+
60+ return actual != 0 && next != 0 ;
61+ }
62+
63+ /**
64+ * In many programming languages, numeric literals can include a sign (+ or -) to
65+ * indicate whether the number is positive or negative. However, this sign is not
66+ * considered part of the number itself; it's treated as an operator that operates
67+ * on the number.
68+ *
69+ * For instance, in the expression "-5", the "-" is a unary negation operator that
70+ * operates on the number "5". Similarly, in the expression "+5", the "+" is a unary
71+ * plus operator.
72+ *
73+ * This function, `is_number()`, is used to determine whether a given character
74+ * sequence represents a number. This includes both signed numbers, where the first
75+ * character is a sign ('-' or '+') followed by a numeric character, as well as
76+ * unsigned numbers, where the character is numeric. It returns true if the first
77+ * character is numeric or if it's a sign and the second character is numeric.
78+ * Otherwise, it returns false.
79+ *
80+ * @param curr_char The current character being evaluated. This could be a numeric
81+ * character or a sign ('+' or '-').
82+ *
83+ * @param next_char The character following the current character. Used for evaluating
84+ * if a sequence is representing a signed number.
85+ *
86+ * @return 1 if is number, 0 otherwise.
87+ */
88+ int is_number (chcode_t curr_char , chcode_t next_char ) {
89+ return (is_numeric (curr_char ) || ((curr_char == '-' || curr_char == '+' ) && is_numeric (next_char )));
90+ }
91+
92+ /**
93+ * As in other languages, some special characters are allowed to create variable names,
94+ * but you are not allowed to start with these characters.
95+ *
96+ * Let's take "!" for example:
97+ *
98+ * ! is an operator character, if the lexer sees a !, it must
99+ * create a specific token for it. Even so, if the language considers "!" as a letter
100+ * (to create var names), so the lexer will tokenize it as a variable. Because of this,
101+ * some languages must block these special characters to be the first letter of the var.
102+ *
103+ * So if the lexer sees any of these special characters, it must create two tokens:
104+ * !my_var: one for "!" and other for "my_var"
105+ *
106+ * If the lexer sees any of these characters after a letter, the lexer must create a single token:
107+ * my_!var: "my_!var"
108+ *
109+ * @param ch The current character being evaluated.
110+ *
111+ * @return 1 if is number, 0 otherwise.
112+ */
113+ int is_allowed_as_first_char (chcode_t ch ) {
114+ return ch != '.' &&
115+ // ch != '-' && // uncomment to allow kebab-case
116+ ch != '?' &&
117+ ch != '!' ;
118+ }
0 commit comments