33//! This module handles the parsing of ECMAScript/TypeScript identifiers.
44
55use swc_atoms:: Atom ;
6+ use unicode_id_start:: { is_id_continue_unicode, is_id_start_unicode} ;
67
78use super :: Lexer ;
89use crate :: {
910 error:: Result ,
1011 token:: { keyword_to_token_type, Token , TokenType , TokenValue } ,
12+ util:: likely,
1113} ;
1214
1315/// Fast mapping from ASCII to check if a character is valid for identifier
@@ -65,15 +67,27 @@ impl Lexer<'_> {
6567 self . cursor . advance ( ) ;
6668
6769 // Read as many identifier continue chars as possible
68- self . cursor . advance_while ( Self :: is_identifier_continue ) ;
70+ self . cursor . advance_while ( Self :: is_ascii_id_continue ) ;
6971
7072 // Extract the identifier text
71- let span = self . span ( ) ;
7273 let ident_start = start_pos. 0 ;
7374 let ident_end = self . cursor . position ( ) ;
7475 let ident_bytes = unsafe { self . cursor . slice_unchecked ( ident_start, ident_end) } ;
75- let ident_str = unsafe { std:: str:: from_utf8_unchecked ( ident_bytes) } ;
76+ let non_unicode_ident_str = unsafe { std:: str:: from_utf8_unchecked ( ident_bytes) } ;
77+
78+ let ident_str = if let Some ( ch) = self . cursor . peek ( ) {
79+ if ch == b'\\' {
80+ & self . read_identifier_with_unicode_escape ( non_unicode_ident_str) ?
81+ } else if !ch. is_ascii ( ) {
82+ & self . read_identifier_with_utf8_charater ( non_unicode_ident_str) ?
83+ } else {
84+ non_unicode_ident_str
85+ }
86+ } else {
87+ non_unicode_ident_str
88+ } ;
7689 let had_line_break_bool: bool = self . had_line_break . into ( ) ;
90+ let span = self . span ( ) ;
7791
7892 // For non-keyword identifiers, we can directly return without checking keyword
7993 // maps
@@ -94,20 +108,32 @@ impl Lexer<'_> {
94108 self . cursor . advance ( ) ;
95109
96110 // Read as many identifier continue chars as possible
97- self . cursor . advance_while ( Self :: is_identifier_continue ) ;
111+ self . cursor . advance_while ( Self :: is_ascii_id_continue ) ;
98112
99113 // Extract the identifier text
100- let span = self . span ( ) ;
101114 let ident_start = start_pos. 0 ;
102115 let ident_end = self . cursor . position ( ) ;
103- let ident_bytes = unsafe { self . cursor . slice_unchecked ( ident_start, ident_end) } ;
104- // SAFETY: We've verified the bytes are valid UTF-8
105- let ident_str = unsafe { std:: str:: from_utf8_unchecked ( ident_bytes) } ;
106116 let had_line_break_bool: bool = self . had_line_break . into ( ) ;
107-
117+ let non_unicode_ident_str = unsafe {
118+ std:: str:: from_utf8_unchecked ( self . cursor . slice_unchecked ( ident_start, ident_end) )
119+ } ;
120+
121+ let ident_str = if let Some ( ch) = self . cursor . peek ( ) {
122+ if ch == b'\\' {
123+ & self . read_identifier_with_unicode_escape ( non_unicode_ident_str) ?
124+ } else if !ch. is_ascii ( ) {
125+ & self . read_identifier_with_utf8_charater ( non_unicode_ident_str) ?
126+ } else {
127+ non_unicode_ident_str
128+ }
129+ } else {
130+ non_unicode_ident_str
131+ } ;
108132 // Ultra-fast path for common 2-6 letter keywords using direct table lookup
109- let len = ident_bytes. len ( ) ;
133+ let ident_bytes = ident_str. as_bytes ( ) ;
134+ let len = ident_str. len ( ) ;
110135
136+ let span = self . span ( ) ;
111137 // Only process if first byte is an ASCII lowercase letter (all keywords start
112138 // with a-z)
113139 if len > 0 && ident_bytes[ 0 ] >= b'a' && ident_bytes[ 0 ] <= b'z' {
@@ -131,6 +157,46 @@ impl Lexer<'_> {
131157 ) )
132158 }
133159
160+ fn read_identifier_with_unicode_escape ( & mut self , non_unicode : & str ) -> Result < String > {
161+ let mut buffer = String :: from ( non_unicode) ;
162+ self . identifier_with_unicode_escape_part ( & mut buffer) ?;
163+
164+ Ok ( buffer)
165+ }
166+
167+ fn identifier_with_unicode_escape_part ( & mut self , buffer : & mut String ) -> Result < ( ) > {
168+ while let Some ( ch) = self . cursor . peek_char ( ) {
169+ if ch == '\\' && self . cursor . peek_at ( 1 ) == Some ( b'u' ) {
170+ // Skip the "\\u"
171+ self . cursor . advance_n ( 2 ) ;
172+ let unicode_escape = self . read_unicode_escape ( ) ?;
173+ buffer. push ( unicode_escape) ;
174+ } else if Self :: is_identifier_continue ( ch) {
175+ buffer. push ( ch) ;
176+ self . cursor . advance_char ( ) ;
177+ } else {
178+ break ;
179+ }
180+ }
181+ Ok ( ( ) )
182+ }
183+
184+ fn read_identifier_with_utf8_charater ( & mut self , non_unicode : & str ) -> Result < String > {
185+ let mut buffer = String :: from ( non_unicode) ;
186+ while let Some ( ch) = self . cursor . peek_char ( ) {
187+ if likely ( Self :: is_identifier_continue ( ch) ) {
188+ buffer. push ( ch) ;
189+ self . cursor . advance_char ( ) ;
190+ } else if ch == '\\' {
191+ self . identifier_with_unicode_escape_part ( & mut buffer) ?;
192+ } else {
193+ break ;
194+ }
195+ }
196+
197+ Ok ( buffer)
198+ }
199+
134200 /// Super fast check for ASCII identifier start character
135201 #[ inline( always) ]
136202 pub ( crate ) fn is_ascii_id_start ( ch : u8 ) -> bool {
@@ -142,4 +208,26 @@ impl Lexer<'_> {
142208 pub ( crate ) fn is_ascii_id_continue ( ch : u8 ) -> bool {
143209 ch < 128 && unsafe { ( IDENT_CHAR . get_unchecked ( ch as usize ) & 2 ) != 0 }
144210 }
211+
212+ /// Check if a byte is a valid identifier start character
213+ #[ inline( always) ]
214+ pub ( crate ) fn is_identifier_start ( ch : char ) -> bool {
215+ // ASCII fast path using optimized identifier functions
216+ if likely ( ch. is_ascii ( ) ) {
217+ Self :: is_ascii_id_start ( ch as u8 )
218+ } else {
219+ is_id_start_unicode ( ch)
220+ }
221+ }
222+
223+ /// Check if a byte is a valid identifier continue character
224+ #[ inline( always) ]
225+ pub ( crate ) fn is_identifier_continue ( ch : char ) -> bool {
226+ // ASCII fast path using optimized identifier functions
227+ if likely ( ch. is_ascii ( ) ) {
228+ Self :: is_ascii_id_continue ( ch as u8 )
229+ } else {
230+ is_id_continue_unicode ( ch)
231+ }
232+ }
145233}
0 commit comments