swc-project
diff --git a/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions b/‎Cargo.lock‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎crates/swc_ecma_fast_parser/Cargo.toml‎
Lines changed: 5 additions & 4 deletions b/‎crates/swc_ecma_fast_parser/Cargo.toml‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎crates/swc_ecma_fast_parser/src/lexer/cursor.rs‎
Lines changed: 27 additions & 0 deletions b/‎crates/swc_ecma_fast_parser/src/lexer/cursor.rs‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎crates/swc_ecma_fast_parser/src/lexer/identifier.rs‎
Lines changed: 98 additions & 10 deletions b/‎crates/swc_ecma_fast_parser/src/lexer/identifier.rs‎
Lines changed: 98 additions & 10 deletions
diff --git a/‎crates/swc_ecma_fast_parser/src/lexer/mod.rs‎
Lines changed: 1 addition & 25 deletions b/‎crates/swc_ecma_fast_parser/src/lexer/mod.rs‎
Lines changed: 1 addition & 25 deletions
diff --git a/‎crates/swc_ecma_fast_parser/src/lexer/regex.rs‎
Lines changed: 1 addition & 1 deletion b/‎crates/swc_ecma_fast_parser/src/lexer/regex.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/swc_ecma_fast_parser/src/lexer/tests.rs‎
Lines changed: 21 additions & 0 deletions b/‎crates/swc_ecma_fast_parser/src/lexer/tests.rs‎
Lines changed: 21 additions & 0 deletions
@@ -18,10 +18,11 @@ swc_atoms = { version = "5.0.0", path = "../swc_atoms" }
 swc_common = { version = "8.0.0", path = "../swc_common" }
 swc_ecma_ast = { version = "8.0.1", path = "../swc_ecma_ast" }
 
-assume = { workspace = true }
-memchr = { workspace = true }
-num-bigint = { workspace = true }
-wide = { workspace = true }
+assume = { workspace = true }
+memchr = { workspace = true }
+num-bigint = { workspace = true }
+unicode-id-start = { workspace = true }
+wide = { workspace = true }
 
 [dev-dependencies]
 criterion = { workspace = true }
 
@@ -55,6 +55,19 @@ impl<'a> Cursor<'a> {
  }
  }
 
+ /// Peek at the current character without advancing
+ #[inline(always)]
+ pub fn peek_char(&self) -> Option<char> {
+ self.peek().and_then(|b| {
+ if b.is_ascii() {
+ Some(b as char)
+ } else {
+ let rest_str = unsafe { std::str::from_utf8_unchecked(self.rest()) };
+ rest_str.chars().next()
+ }
+ })
+ }
+
  /// Peek at a byte at a specific offset from the current position
  #[inline(always)]
  pub fn peek_at(&self, offset: u32) -> Option<u8> {
@@ -82,6 +95,20 @@ impl<'a> Cursor<'a> {
  self.pos += 1;
  }
 
+ /// Advance the cursor by one character
+ #[inline(always)]
+ pub fn advance_char(&mut self) {
+ assume!(unsafe: !self.is_eof());
+ let byte = self.peek().unwrap();
+ if byte.is_ascii() {
+ self.advance();
+ } else {
+ let rest_str = unsafe { std::str::from_utf8_unchecked(self.rest()) };
+ let ch = rest_str.chars().next().unwrap();
+ self.advance_n(ch.len_utf8() as u32);
+ }
+ }
+
  /// Advance the cursor by n bytes
  #[inline(always)]
  pub fn advance_n(&mut self, n: u32) {
 
@@ -3,11 +3,13 @@
 //! This module handles the parsing of ECMAScript/TypeScript identifiers.
 
 use swc_atoms::Atom;
+use unicode_id_start::{is_id_continue_unicode, is_id_start_unicode};
 
 use super::Lexer;
 use crate::{
  error::Result,
  token::{keyword_to_token_type, Token, TokenType, TokenValue},
+ util::likely,
 };
 
 /// Fast mapping from ASCII to check if a character is valid for identifier
@@ -65,15 +67,27 @@ impl Lexer<'_> {
  self.cursor.advance();
 
  // Read as many identifier continue chars as possible
- self.cursor.advance_while(Self::is_identifier_continue);
+ self.cursor.advance_while(Self::is_ascii_id_continue);
 
  // Extract the identifier text
- let span = self.span();
  let ident_start = start_pos.0;
  let ident_end = self.cursor.position();
  let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
- let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
+ let non_unicode_ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
+
+ let ident_str = if let Some(ch) = self.cursor.peek() {
+ if ch == b'\\' {
+ &self.read_identifier_with_unicode_escape(non_unicode_ident_str)?
+ } else if !ch.is_ascii() {
+ &self.read_identifier_with_utf8_charater(non_unicode_ident_str)?
+ } else {
+ non_unicode_ident_str
+ }
+ } else {
+ non_unicode_ident_str
+ };
  let had_line_break_bool: bool = self.had_line_break.into();
+ let span = self.span();
 
  // For non-keyword identifiers, we can directly return without checking keyword
  // maps
@@ -94,20 +108,32 @@ impl Lexer<'_> {
  self.cursor.advance();
 
  // Read as many identifier continue chars as possible
- self.cursor.advance_while(Self::is_identifier_continue);
+ self.cursor.advance_while(Self::is_ascii_id_continue);
 
  // Extract the identifier text
- let span = self.span();
  let ident_start = start_pos.0;
  let ident_end = self.cursor.position();
- let ident_bytes = unsafe { self.cursor.slice_unchecked(ident_start, ident_end) };
- // SAFETY: We've verified the bytes are valid UTF-8
- let ident_str = unsafe { std::str::from_utf8_unchecked(ident_bytes) };
  let had_line_break_bool: bool = self.had_line_break.into();
-
+ let non_unicode_ident_str = unsafe {
+ std::str::from_utf8_unchecked(self.cursor.slice_unchecked(ident_start, ident_end))
+ };
+
+ let ident_str = if let Some(ch) = self.cursor.peek() {
+ if ch == b'\\' {
+ &self.read_identifier_with_unicode_escape(non_unicode_ident_str)?
+ } else if !ch.is_ascii() {
+ &self.read_identifier_with_utf8_charater(non_unicode_ident_str)?
+ } else {
+ non_unicode_ident_str
+ }
+ } else {
+ non_unicode_ident_str
+ };
  // Ultra-fast path for common 2-6 letter keywords using direct table lookup
- let len = ident_bytes.len();
+ let ident_bytes = ident_str.as_bytes();
+ let len = ident_str.len();
 
+ let span = self.span();
  // Only process if first byte is an ASCII lowercase letter (all keywords start
  // with a-z)
  if len > 0 && ident_bytes[0] >= b'a' && ident_bytes[0] <= b'z' {
@@ -131,6 +157,46 @@ impl Lexer<'_> {
  ))
  }
 
+ fn read_identifier_with_unicode_escape(&mut self, non_unicode: &str) -> Result<String> {
+ let mut buffer = String::from(non_unicode);
+ self.identifier_with_unicode_escape_part(&mut buffer)?;
+
+ Ok(buffer)
+ }
+
+ fn identifier_with_unicode_escape_part(&mut self, buffer: &mut String) -> Result<()> {
+ while let Some(ch) = self.cursor.peek_char() {
+ if ch == '\\' && self.cursor.peek_at(1) == Some(b'u') {
+ // Skip the "\\u"
+ self.cursor.advance_n(2);
+ let unicode_escape = self.read_unicode_escape()?;
+ buffer.push(unicode_escape);
+ } else if Self::is_identifier_continue(ch) {
+ buffer.push(ch);
+ self.cursor.advance_char();
+ } else {
+ break;
+ }
+ }
+ Ok(())
+ }
+
+ fn read_identifier_with_utf8_charater(&mut self, non_unicode: &str) -> Result<String> {
+ let mut buffer = String::from(non_unicode);
+ while let Some(ch) = self.cursor.peek_char() {
+ if likely(Self::is_identifier_continue(ch)) {
+ buffer.push(ch);
+ self.cursor.advance_char();
+ } else if ch == '\\' {
+ self.identifier_with_unicode_escape_part(&mut buffer)?;
+ } else {
+ break;
+ }
+ }
+
+ Ok(buffer)
+ }
+
  /// Super fast check for ASCII identifier start character
  #[inline(always)]
  pub(crate) fn is_ascii_id_start(ch: u8) -> bool {
@@ -142,4 +208,26 @@ impl Lexer<'_> {
  pub(crate) fn is_ascii_id_continue(ch: u8) -> bool {
  ch < 128 && unsafe { (IDENT_CHAR.get_unchecked(ch as usize) & 2) != 0 }
  }
+
+ /// Check if a byte is a valid identifier start character
+ #[inline(always)]
+ pub(crate) fn is_identifier_start(ch: char) -> bool {
+ // ASCII fast path using optimized identifier functions
+ if likely(ch.is_ascii()) {
+ Self::is_ascii_id_start(ch as u8)
+ } else {
+ is_id_start_unicode(ch)
+ }
+ }
+
+ /// Check if a byte is a valid identifier continue character
+ #[inline(always)]
+ pub(crate) fn is_identifier_continue(ch: char) -> bool {
+ // ASCII fast path using optimized identifier functions
+ if likely(ch.is_ascii()) {
+ Self::is_ascii_id_continue(ch as u8)
+ } else {
+ is_id_continue_unicode(ch)
+ }
+ }
 }
@@ -393,7 +393,7 @@ impl<'a> Lexer<'a> {
  }
  } else {
  // Non-ASCII character path (less common)
- if Self::is_identifier_start(ch) {
+ if Self::is_identifier_start(ch as char) {
  self.read_non_keyword_identifier()
  } else {
  self.cursor.advance();
@@ -717,28 +717,4 @@ impl<'a> Lexer<'a> {
  self.had_line_break = LineBreak::Present;
  }
  }
-
- /// Check if a byte is a valid identifier start character
- #[inline(always)]
- fn is_identifier_start(byte: u8) -> bool {
- // ASCII fast path using optimized identifier functions
- if likely(byte < 128) {
- Self::is_ascii_id_start(byte)
- } else {
- // Non-ASCII, needs further checking in read_identifier
- true
- }
- }
-
- /// Check if a byte is a valid identifier continue character
- #[inline(always)]
- fn is_identifier_continue(byte: u8) -> bool {
- // ASCII fast path using optimized identifier functions
- if likely(byte < 128) {
- Self::is_ascii_id_continue(byte)
- } else {
- // Non-ASCII, needs further checking in read_identifier
- true
- }
- }
 }
@@ -83,7 +83,7 @@ impl Lexer<'_> {
  // Read the flags
  let mut flags = String::new();
  while let Some(ch) = self.cursor.peek() {
- if Self::is_identifier_continue(ch) {
+ if Self::is_ascii_id_continue(ch) {
  flags.push(ch as char);
  self.cursor.advance();
  } else {
 
@@ -1295,6 +1295,27 @@ fn test_lexer_number_edge_cases() {
  );
 }
 
+#[test]
+fn test_lexer_identifier_with_unicode() {
+ // Test identifier with Unicode characters
+ verify_tokens(
+ "a你好",
+ vec![(
+ TokenType::Ident,
+ Some(TokenValue::Word(Atom::from("a你好"))),
+ )],
+ );
+
+ // Test identifier with Unicode escape sequence (code point)
+ verify_tokens(
+ "e\\u{0061}स्ते\\u{0062}",
+ vec![(
+ TokenType::Ident,
+ Some(TokenValue::Word(Atom::from("eaस्तेb"))),
+ )],
+ );
+}
+
 #[test]
 #[should_panic]
 fn test_lexer_invalid_binary_number() {