Skip to content

Commit 8d932de

Browse files
committed
Rename regex flag and field utf16 -> unicode
- rename is_utf16 structure member to is_unicode
- rename flag LRE_FLAG_UTF16 to LRE_FLAG_UNICODE
1 parent 97ae6f3 commit 8d932de

File tree

3 files changed

+65
-64
lines changed

3 files changed

+65
-64
lines changed

libregexp.c

Lines changed: 60 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ typedef struct {
6666
const uint8_t *buf_end;
6767
const uint8_t *buf_start;
6868
int re_flags;
69-
BOOL is_utf16;
69+
BOOL is_unicode;
7070
BOOL ignore_case;
7171
BOOL dotall;
7272
int capture_count;
@@ -224,7 +224,7 @@ static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
224224

225225
assert(buf_len >= RE_HEADER_LEN);
226226

227-
re_flags= buf[0];
227+
re_flags = buf[0];
228228
bc_len = get_u32(buf + 3);
229229
assert(bc_len + RE_HEADER_LEN <= buf_len);
230230
printf("flags: 0x%x capture_count=%d stack_size=%d\n",
@@ -696,10 +696,10 @@ static int get_class_atom(REParseState *s, CharRange *cr,
696696
if ((c >= 'a' && c <= 'z') ||
697697
(c >= 'A' && c <= 'Z') ||
698698
(((c >= '0' && c <= '9') || c == '_') &&
699-
inclass && !s->is_utf16)) { /* Annex B.1.4 */
699+
inclass && !s->is_unicode)) { /* Annex B.1.4 */
700700
c &= 0x1f;
701701
p++;
702-
} else if (s->is_utf16) {
702+
} else if (s->is_unicode) {
703703
goto invalid_escape;
704704
} else {
705705
/* otherwise return '\' and 'c' */
@@ -710,7 +710,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
710710
#ifdef CONFIG_ALL_UNICODE
711711
case 'p':
712712
case 'P':
713-
if (s->is_utf16) {
713+
if (s->is_unicode) {
714714
if (parse_unicode_property(s, cr, &p, (c == 'P')))
715715
return -1;
716716
c = CLASS_RANGE_BASE;
@@ -720,14 +720,14 @@ static int get_class_atom(REParseState *s, CharRange *cr,
720720
#endif
721721
default:
722722
p--;
723-
ret = lre_parse_escape(&p, s->is_utf16 * 2);
723+
ret = lre_parse_escape(&p, s->is_unicode * 2);
724724
if (ret >= 0) {
725725
c = ret;
726726
} else {
727727
if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
728728
/* always valid to escape these characters */
729729
goto normal_char;
730-
} else if (s->is_utf16) {
730+
} else if (s->is_unicode) {
731731
invalid_escape:
732732
return re_parse_error(s, "invalid escape sequence in regular expression");
733733
} else {
@@ -749,7 +749,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
749749
/* normal char */
750750
if (c >= 128) {
751751
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
752-
if ((unsigned)c > 0xffff && !s->is_utf16) {
752+
if ((unsigned)c > 0xffff && !s->is_unicode) {
753753
/* XXX: should handle non BMP-1 code points */
754754
return re_parse_error(s, "malformed unicode char");
755755
}
@@ -811,11 +811,13 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
811811
cr_init(cr, s->opaque, lre_realloc);
812812
p = *pp;
813813
p++; /* skip '[' */
814+
814815
invert = FALSE;
815816
if (*p == '^') {
816817
p++;
817818
invert = TRUE;
818819
}
820+
819821
for(;;) {
820822
if (*p == ']')
821823
break;
@@ -825,7 +827,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
825827
if (*p == '-' && p[1] != ']') {
826828
const uint8_t *p0 = p + 1;
827829
if (c1 >= CLASS_RANGE_BASE) {
828-
if (s->is_utf16) {
830+
if (s->is_unicode) {
829831
cr_free(cr1);
830832
goto invalid_class_range;
831833
}
@@ -837,7 +839,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
837839
goto fail;
838840
if (c2 >= CLASS_RANGE_BASE) {
839841
cr_free(cr1);
840-
if (s->is_utf16) {
842+
if (s->is_unicode) {
841843
goto invalid_class_range;
842844
}
843845
/* Annex B: match '-' character */
@@ -866,7 +868,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
866868
}
867869
}
868870
if (s->ignore_case) {
869-
if (cr_regexp_canonicalize(cr, s->is_utf16))
871+
if (cr_regexp_canonicalize(cr, s->is_unicode))
870872
goto memory_error;
871873
}
872874
if (invert) {
@@ -1161,7 +1163,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
11611163
re_emit_op(s, REOP_prev);
11621164
break;
11631165
case '{':
1164-
if (s->is_utf16) {
1166+
if (s->is_unicode) {
11651167
return re_parse_error(s, "syntax error");
11661168
} else if (!is_digit(p[1])) {
11671169
/* Annex B: we accept '{' not followed by digits as a
@@ -1213,7 +1215,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
12131215
lookahead:
12141216
/* Annex B allows lookahead to be used as an atom for
12151217
the quantifiers */
1216-
if (!s->is_utf16 && !is_backward_lookahead) {
1218+
if (!s->is_unicode && !is_backward_lookahead) {
12171219
last_atom_start = s->byte_code.size;
12181220
last_capture_count = s->capture_count;
12191221
}
@@ -1289,15 +1291,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
12891291
/* annex B: we tolerate invalid group names in non
12901292
unicode mode if there is no named capture
12911293
definition */
1292-
if (s->is_utf16 || re_has_named_captures(s))
1294+
if (s->is_unicode || re_has_named_captures(s))
12931295
return re_parse_error(s, "expecting group name");
12941296
else
12951297
goto parse_class_atom;
12961298
}
12971299
p1 += 3;
12981300
if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
12991301
&p1)) {
1300-
if (s->is_utf16 || re_has_named_captures(s))
1302+
if (s->is_unicode || re_has_named_captures(s))
13011303
return re_parse_error(s, "invalid group name");
13021304
else
13031305
goto parse_class_atom;
@@ -1308,7 +1310,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
13081310
after (inefficient, but hopefully not common */
13091311
c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
13101312
if (c < 0) {
1311-
if (s->is_utf16 || re_has_named_captures(s))
1313+
if (s->is_unicode || re_has_named_captures(s))
13121314
return re_parse_error(s, "group name not defined");
13131315
else
13141316
goto parse_class_atom;
@@ -1320,7 +1322,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
13201322
case '0':
13211323
p += 2;
13221324
c = 0;
1323-
if (s->is_utf16) {
1325+
if (s->is_unicode) {
13241326
if (is_digit(*p)) {
13251327
return re_parse_error(s, "invalid decimal escape in regular expression");
13261328
}
@@ -1342,7 +1344,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
13421344

13431345
c = parse_digits(&p, FALSE);
13441346
if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
1345-
if (!s->is_utf16) {
1347+
if (!s->is_unicode) {
13461348
/* Annex B.1.4: accept legacy octal */
13471349
p = q;
13481350
if (*p <= '7') {
@@ -1384,7 +1386,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
13841386
break;
13851387
case ']':
13861388
case '}':
1387-
if (s->is_utf16)
1389+
if (s->is_unicode)
13881390
return re_parse_error(s, "syntax error");
13891391
goto parse_class_atom;
13901392
default:
@@ -1406,7 +1408,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
14061408
return -1;
14071409
} else {
14081410
if (s->ignore_case)
1409-
c = lre_canonicalize(c, s->is_utf16);
1411+
c = lre_canonicalize(c, s->is_unicode);
14101412
if (c <= 0xffff)
14111413
re_emit_op_u16(s, REOP_char, c);
14121414
else
@@ -1442,7 +1444,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
14421444
/* As an extension (see ES6 annex B), we accept '{' not
14431445
followed by digits as a normal atom */
14441446
if (!is_digit(p[1])) {
1445-
if (s->is_utf16)
1447+
if (s->is_unicode)
14461448
goto invalid_quant_count;
14471449
break;
14481450
}
@@ -1461,7 +1463,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
14611463
quant_max = INT32_MAX; /* infinity */
14621464
}
14631465
}
1464-
if (*p != '}' && !s->is_utf16) {
1466+
if (*p != '}' && !s->is_unicode) {
14651467
/* Annex B: normal atom if invalid '{' syntax */
14661468
p = p1;
14671469
break;
@@ -1753,7 +1755,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
17531755
s->buf_end = s->buf_ptr + buf_len;
17541756
s->buf_start = s->buf_ptr;
17551757
s->re_flags = re_flags;
1756-
s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
1758+
s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
17571759
is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
17581760
s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
17591761
s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
@@ -1861,11 +1863,11 @@ static BOOL is_word_char(uint32_t c)
18611863
} \
18621864
} while (0)
18631865

1864-
#define PEEK_CHAR(c, cptr, cbuf_end) \
1865-
do { \
1866-
if (cbuf_type == 0) { \
1867-
c = cptr[0]; \
1868-
} else { \
1866+
#define PEEK_CHAR(c, cptr, cbuf_end) \
1867+
do { \
1868+
if (cbuf_type == 0) { \
1869+
c = cptr[0]; \
1870+
} else { \
18691871
uint32_t __c1; \
18701872
c = ((uint16_t *)cptr)[0]; \
18711873
if (c >= 0xd800 && c < 0xdc00 && \
@@ -1875,18 +1877,18 @@ static BOOL is_word_char(uint32_t c)
18751877
c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
18761878
} \
18771879
} \
1878-
} \
1880+
} \
18791881
} while (0)
18801882

1881-
#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \
1882-
do { \
1883-
if (cbuf_type == 0) { \
1884-
c = cptr[-1]; \
1885-
} else { \
1883+
#define PEEK_PREV_CHAR(c, cptr, cbuf_start) \
1884+
do { \
1885+
if (cbuf_type == 0) { \
1886+
c = cptr[-1]; \
1887+
} else { \
18861888
uint32_t __c1; \
18871889
c = ((uint16_t *)cptr)[-1]; \
18881890
if (c >= 0xdc00 && c < 0xe000 && \
1889-
cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \
1891+
cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \
18901892
__c1 = ((uint16_t *)cptr)[-2]; \
18911893
if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
18921894
c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
@@ -1895,15 +1897,15 @@ static BOOL is_word_char(uint32_t c)
18951897
} \
18961898
} while (0)
18971899

1898-
#define GET_PREV_CHAR(c, cptr, cbuf_start) \
1899-
do { \
1900-
if (cbuf_type == 0) { \
1901-
cptr--; \
1902-
c = cptr[0]; \
1903-
} else { \
1900+
#define GET_PREV_CHAR(c, cptr, cbuf_start) \
1901+
do { \
1902+
if (cbuf_type == 0) { \
1903+
cptr--; \
1904+
c = cptr[0]; \
1905+
} else { \
19041906
uint32_t __c1; \
19051907
cptr -= 2; \
1906-
c = ((uint16_t *)cptr)[0]; \
1908+
c = ((uint16_t *)cptr)[0]; \
19071909
if (c >= 0xdc00 && c < 0xe000 && \
19081910
cbuf_type == 2 && cptr > cbuf_start) { \
19091911
__c1 = ((uint16_t *)cptr)[-1]; \
@@ -1915,12 +1917,12 @@ static BOOL is_word_char(uint32_t c)
19151917
} \
19161918
} while (0)
19171919

1918-
#define PREV_CHAR(cptr, cbuf_start) \
1919-
do { \
1920-
if (cbuf_type == 0) { \
1921-
cptr--; \
1922-
} else { \
1923-
cptr -= 2; \
1920+
#define PREV_CHAR(cptr, cbuf_start) \
1921+
do { \
1922+
if (cbuf_type == 0) { \
1923+
cptr--; \
1924+
} else { \
1925+
cptr -= 2; \
19241926
if (cbuf_type == 2) { \
19251927
c = ((uint16_t *)cptr)[0]; \
19261928
if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \
@@ -1959,7 +1961,7 @@ typedef struct {
19591961
int stack_size_max;
19601962
BOOL multi_line;
19611963
BOOL ignore_case;
1962-
BOOL is_utf16;
1964+
BOOL is_unicode;
19631965
void *opaque; /* used for stack overflow check */
19641966

19651967
size_t state_size;
@@ -2105,7 +2107,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
21052107
goto no_match;
21062108
GET_CHAR(c, cptr, cbuf_end);
21072109
if (s->ignore_case) {
2108-
c = lre_canonicalize(c, s->is_utf16);
2110+
c = lre_canonicalize(c, s->is_unicode);
21092111
}
21102112
if (val != c)
21112113
goto no_match;
@@ -2260,8 +2262,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
22602262
GET_CHAR(c1, cptr1, cptr1_end);
22612263
GET_CHAR(c2, cptr, cbuf_end);
22622264
if (s->ignore_case) {
2263-
c1 = lre_canonicalize(c1, s->is_utf16);
2264-
c2 = lre_canonicalize(c2, s->is_utf16);
2265+
c1 = lre_canonicalize(c1, s->is_unicode);
2266+
c2 = lre_canonicalize(c2, s->is_unicode);
22652267
}
22662268
if (c1 != c2)
22672269
goto no_match;
@@ -2274,8 +2276,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
22742276
GET_PREV_CHAR(c1, cptr1, cptr1_start);
22752277
GET_PREV_CHAR(c2, cptr, s->cbuf);
22762278
if (s->ignore_case) {
2277-
c1 = lre_canonicalize(c1, s->is_utf16);
2278-
c2 = lre_canonicalize(c2, s->is_utf16);
2279+
c1 = lre_canonicalize(c1, s->is_unicode);
2280+
c2 = lre_canonicalize(c2, s->is_unicode);
22792281
}
22802282
if (c1 != c2)
22812283
goto no_match;
@@ -2294,7 +2296,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
22942296
goto no_match;
22952297
GET_CHAR(c, cptr, cbuf_end);
22962298
if (s->ignore_case) {
2297-
c = lre_canonicalize(c, s->is_utf16);
2299+
c = lre_canonicalize(c, s->is_unicode);
22982300
}
22992301
idx_min = 0;
23002302
low = get_u16(pc + 0 * 4);
@@ -2334,7 +2336,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
23342336
goto no_match;
23352337
GET_CHAR(c, cptr, cbuf_end);
23362338
if (s->ignore_case) {
2337-
c = lre_canonicalize(c, s->is_utf16);
2339+
c = lre_canonicalize(c, s->is_unicode);
23382340
}
23392341
idx_min = 0;
23402342
low = get_u32(pc + 0 * 8);
@@ -2426,13 +2428,13 @@ int lre_exec(uint8_t **capture,
24262428
re_flags = bc_buf[RE_HEADER_FLAGS];
24272429
s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
24282430
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
2429-
s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
2431+
s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
24302432
s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
24312433
s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
24322434
s->cbuf = cbuf;
24332435
s->cbuf_end = cbuf + (clen << cbuf_type);
24342436
s->cbuf_type = cbuf_type;
2435-
if (s->cbuf_type == 1 && s->is_utf16)
2437+
if (s->cbuf_type == 1 && s->is_unicode)
24362438
s->cbuf_type = 2;
24372439
s->opaque = opaque;
24382440

libregexp.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,9 @@
3434
#define LRE_FLAG_IGNORECASE (1 << 1)
3535
#define LRE_FLAG_MULTILINE (1 << 2)
3636
#define LRE_FLAG_DOTALL (1 << 3)
37-
#define LRE_FLAG_UTF16 (1 << 4)
37+
#define LRE_FLAG_UNICODE (1 << 4)
3838
#define LRE_FLAG_STICKY (1 << 5)
3939
#define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */
40-
4140
#define LRE_FLAG_NAMED_GROUPS (1 << 7) /* named groups are present in the regexp */
4241

4342
uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,

0 commit comments

Comments
 (0)