@@ -18,7 +18,6 @@ use log::debug;
1818
1919use  rustc_data_structures:: fx:: FxHashSet ; 
2020use  std:: borrow:: Cow ; 
21- use  std:: iter; 
2221use  std:: path:: { Path ,  PathBuf } ; 
2322use  std:: str; 
2423
@@ -34,6 +33,11 @@ pub mod diagnostics;
3433
3534pub  mod  classify; 
3635
36+ pub ( crate )  mod  unescape; 
37+ use  unescape:: { unescape_str,  unescape_char,  unescape_byte_str,  unescape_byte} ; 
38+ 
39+ pub ( crate )  mod  unescape_error_reporting; 
40+ 
3741/// Info about a parsing session. 
3842pub  struct  ParseSess  { 
3943 pub  span_diagnostic :  Handler , 
@@ -307,133 +311,6 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser<'_> {
307311 Parser :: new ( sess,  stream,  None ,  true ,  false ) 
308312} 
309313
310- /// Parses a string representing a character literal into its final form. 
311- /// Rather than just accepting/rejecting a given literal, unescapes it as 
312- /// well. Can take any slice prefixed by a character escape. Returns the 
313- /// character and the number of characters consumed. 
314- fn  char_lit ( lit :  & str ,  diag :  Option < ( Span ,  & Handler ) > )  -> ( char ,  isize )  { 
315-  use  std:: char; 
316- 
317-  // Handle non-escaped chars first. 
318-  if  lit. as_bytes ( ) [ 0 ]  != b'\\'  { 
319-  // If the first byte isn't '\\' it might part of a multi-byte char, so 
320-  // get the char with chars(). 
321-  let  c = lit. chars ( ) . next ( ) . unwrap ( ) ; 
322-  return  ( c,  1 ) ; 
323-  } 
324- 
325-  // Handle escaped chars. 
326-  match  lit. as_bytes ( ) [ 1 ]  as  char  { 
327-  '"'  => ( '"' ,  2 ) , 
328-  'n'  => ( '\n' ,  2 ) , 
329-  'r'  => ( '\r' ,  2 ) , 
330-  't'  => ( '\t' ,  2 ) , 
331-  '\\'  => ( '\\' ,  2 ) , 
332-  '\''  => ( '\'' ,  2 ) , 
333-  '0'  => ( '\0' ,  2 ) , 
334-  'x'  => { 
335-  let  v = u32:: from_str_radix ( & lit[ 2 ..4 ] ,  16 ) . unwrap ( ) ; 
336-  let  c = char:: from_u32 ( v) . unwrap ( ) ; 
337-  ( c,  4 ) 
338-  } 
339-  'u'  => { 
340-  assert_eq ! ( lit. as_bytes( ) [ 2 ] ,  b'{' ) ; 
341-  let  idx = lit. find ( '}' ) . unwrap ( ) ; 
342- 
343-  // All digits and '_' are ascii, so treat each byte as a char. 
344-  let  mut  v:  u32  = 0 ; 
345-  for  c in  lit[ 3 ..idx] . bytes ( )  { 
346-  let  c = char:: from ( c) ; 
347-  if  c != '_'  { 
348-  let  x = c. to_digit ( 16 ) . unwrap ( ) ; 
349-  v = v. checked_mul ( 16 ) . unwrap ( ) . checked_add ( x) . unwrap ( ) ; 
350-  } 
351-  } 
352-  let  c = char:: from_u32 ( v) . unwrap_or_else ( || { 
353-  if  let  Some ( ( span,  diag) )  = diag { 
354-  let  mut  diag = diag. struct_span_err ( span,  "invalid unicode character escape" ) ; 
355-  if  v > 0x10FFFF  { 
356-  diag. help ( "unicode escape must be at most 10FFFF" ) . emit ( ) ; 
357-  }  else  { 
358-  diag. help ( "unicode escape must not be a surrogate" ) . emit ( ) ; 
359-  } 
360-  } 
361-  '\u{FFFD}' 
362-  } ) ; 
363-  ( c,  ( idx + 1 )  as  isize ) 
364-  } 
365-  _ => panic ! ( "lexer should have rejected a bad character escape {}" ,  lit) 
366-  } 
367- } 
368- 
369- /// Parses a string representing a string literal into its final form. Does unescaping. 
370- fn  str_lit ( lit :  & str ,  diag :  Option < ( Span ,  & Handler ) > )  -> String  { 
371-  debug ! ( "str_lit: given {}" ,  lit. escape_default( ) ) ; 
372-  let  mut  res = String :: with_capacity ( lit. len ( ) ) ; 
373- 
374-  let  error = |i| format ! ( "lexer should have rejected {} at {}" ,  lit,  i) ; 
375- 
376-  /// Eat everything up to a non-whitespace. 
377- fn  eat < ' a > ( it :  & mut  iter:: Peekable < str:: CharIndices < ' a > > )  { 
378-  loop  { 
379-  match  it. peek ( ) . map ( |x| x. 1 )  { 
380-  Some ( ' ' )  | Some ( '\n' )  | Some ( '\r' )  | Some ( '\t' )  => { 
381-  it. next ( ) ; 
382-  } , 
383-  _ => {  break ;  } 
384-  } 
385-  } 
386-  } 
387- 
388-  let  mut  chars = lit. char_indices ( ) . peekable ( ) ; 
389-  while  let  Some ( ( i,  c) )  = chars. next ( )  { 
390-  match  c { 
391-  '\\'  => { 
392-  let  ch = chars. peek ( ) . unwrap_or_else ( || { 
393-  panic ! ( "{}" ,  error( i) ) 
394-  } ) . 1 ; 
395- 
396-  if  ch == '\n'  { 
397-  eat ( & mut  chars) ; 
398-  }  else  if  ch == '\r'  { 
399-  chars. next ( ) ; 
400-  let  ch = chars. peek ( ) . unwrap_or_else ( || { 
401-  panic ! ( "{}" ,  error( i) ) 
402-  } ) . 1 ; 
403- 
404-  if  ch != '\n'  { 
405-  panic ! ( "lexer accepted bare CR" ) ; 
406-  } 
407-  eat ( & mut  chars) ; 
408-  }  else  { 
409-  // otherwise, a normal escape 
410-  let  ( c,  n)  = char_lit ( & lit[ i..] ,  diag) ; 
411-  for  _ in  0 ..n - 1  {  // we don't need to move past the first \ 
412-  chars. next ( ) ; 
413-  } 
414-  res. push ( c) ; 
415-  } 
416-  } , 
417-  '\r'  => { 
418-  let  ch = chars. peek ( ) . unwrap_or_else ( || { 
419-  panic ! ( "{}" ,  error( i) ) 
420-  } ) . 1 ; 
421- 
422-  if  ch != '\n'  { 
423-  panic ! ( "lexer accepted bare CR" ) ; 
424-  } 
425-  chars. next ( ) ; 
426-  res. push ( '\n' ) ; 
427-  } 
428-  c => res. push ( c) , 
429-  } 
430-  } 
431- 
432-  res. shrink_to_fit ( ) ;  // probably not going to do anything, unless there was an escape. 
433-  debug ! ( "parse_str_lit: returning {}" ,  res) ; 
434-  res
435- } 
436- 
437314/// Parses a string representing a raw string literal into its final form. The 
438315/// only operation this does is convert embedded CRLF into a single LF. 
439316fn  raw_str_lit ( lit :  & str )  -> String  { 
@@ -476,9 +353,21 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
476353 use  ast:: LitKind ; 
477354
478355 match  lit { 
479-  token:: Byte ( i)  => ( true ,  Some ( LitKind :: Byte ( byte_lit ( & i. as_str ( ) ) . 0 ) ) ) , 
480-  token:: Char ( i)  => ( true ,  Some ( LitKind :: Char ( char_lit ( & i. as_str ( ) ,  diag) . 0 ) ) ) , 
481-  token:: Err ( i)  => ( true ,  Some ( LitKind :: Err ( i) ) ) , 
356+  token:: Byte ( i)  => { 
357+  let  lit_kind = match  unescape_byte ( & i. as_str ( ) )  { 
358+  Ok ( c)  => LitKind :: Byte ( c) , 
359+  Err ( _)  => LitKind :: Err ( i) , 
360+  } ; 
361+  ( true ,  Some ( lit_kind) ) 
362+  } , 
363+  token:: Char ( i)  => { 
364+  let  lit_kind = match  unescape_char ( & i. as_str ( ) )  { 
365+  Ok ( c)  => LitKind :: Char ( c) , 
366+  Err ( _)  => LitKind :: Err ( i) , 
367+  } ; 
368+  ( true ,  Some ( lit_kind) ) 
369+  } , 
370+  token:: Err ( i)  => ( true ,  Some ( LitKind :: Err ( i) ) ) , 
482371
483372 // There are some valid suffixes for integer and float literals, 
484373 // so all the handling is done internally. 
@@ -490,10 +379,22 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
490379 // reuse the symbol from the Token. Otherwise, we must generate a 
491380 // new symbol because the string in the LitKind is different to the 
492381 // string in the Token. 
382+  let  mut  has_error = false ; 
493383 let  s = & sym. as_str ( ) ; 
494384 if  s. as_bytes ( ) . iter ( ) . any ( |& c| c == b'\\'  || c == b'\r' )  { 
495-  sym = Symbol :: intern ( & str_lit ( s,  diag) ) ; 
385+  let  mut  buf = String :: with_capacity ( s. len ( ) ) ; 
386+  unescape_str ( s,  & mut  |_,  unescaped_char| { 
387+  match  unescaped_char { 
388+  Ok ( c)  => buf. push ( c) , 
389+  Err ( _)  => has_error = true , 
390+  } 
391+  } ) ; 
392+  if  has_error { 
393+  return  ( true ,  Some ( LitKind :: Err ( sym) ) ) ; 
394+  } 
395+  sym = Symbol :: intern ( & buf) 
496396 } 
397+ 
497398 ( true ,  Some ( LitKind :: Str ( sym,  ast:: StrStyle :: Cooked ) ) ) 
498399 } 
499400 token:: StrRaw ( mut  sym,  n)  => { 
@@ -505,7 +406,20 @@ crate fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Ha
505406 ( true ,  Some ( LitKind :: Str ( sym,  ast:: StrStyle :: Raw ( n) ) ) ) 
506407 } 
507408 token:: ByteStr ( i)  => { 
508-  ( true ,  Some ( LitKind :: ByteStr ( byte_str_lit ( & i. as_str ( ) ) ) ) ) 
409+  let  s = & i. as_str ( ) ; 
410+  let  mut  buf = Vec :: with_capacity ( s. len ( ) ) ; 
411+  let  mut  has_error = false ; 
412+  unescape_byte_str ( s,  & mut  |_,  unescaped_byte| { 
413+  match  unescaped_byte { 
414+  Ok ( c)  => buf. push ( c) , 
415+  Err ( _)  => has_error = true , 
416+  } 
417+  } ) ; 
418+  if  has_error { 
419+  return  ( true ,  Some ( LitKind :: Err ( i) ) ) ; 
420+  } 
421+  buf. shrink_to_fit ( ) ; 
422+  ( true ,  Some ( LitKind :: ByteStr ( Lrc :: new ( buf) ) ) ) 
509423 } 
510424 token:: ByteStrRaw ( i,  _)  => { 
511425 ( true ,  Some ( LitKind :: ByteStr ( Lrc :: new ( i. to_string ( ) . into_bytes ( ) ) ) ) ) 
@@ -560,95 +474,6 @@ fn float_lit(s: &str, suffix: Option<Symbol>, diag: Option<(Span, &Handler)>)
560474 filtered_float_lit ( Symbol :: intern ( s) ,  suffix,  diag) 
561475} 
562476
/// Decodes the single (possibly escaped) byte at the front of `lit`.
///
/// A one-byte `lit` is returned as is. Otherwise `lit` must begin with a
/// backslash escape: one of `\"`, `\n`, `\r`, `\t`, `\\`, `\'`, `\0`, or
/// `\xNN` with exactly two hexadecimal digits. Anything after the escape is
/// ignored.
///
/// Returns the decoded byte together with the number of bytes of `lit` it
/// occupied (1, 2 or 4).
///
/// # Panics
///
/// Panics if `lit` is empty, is longer than one byte without starting with a
/// backslash, uses an unknown escape letter, or has a `\x` escape that is not
/// followed by two hex digits. The message names the literal and the step at
/// which decoding failed.
fn byte_lit(lit: &str) -> (u8, usize) {
    let err = |i| format!("lexer accepted invalid byte literal {} step {}", lit, i);

    let bytes = lit.as_bytes();
    if bytes.len() == 1 {
        return (bytes[0], 1);
    }

    assert_eq!(bytes.first(), Some(&b'\\'), "{}", err(0));
    // `bytes` starts with a backslash and is not one byte long, so index 1 exists.
    let b = match bytes[1] {
        b'"' => b'"',
        b'n' => b'\n',
        b'r' => b'\r',
        b't' => b'\t',
        b'\\' => b'\\',
        b'\'' => b'\'',
        b'0' => b'\0',
        b'x' => {
            // `get` also rejects a range that is too short or that would
            // split a multi-byte character.
            let digits = lit.get(2..4).unwrap_or_else(|| panic!("{}", err(2)));
            // `from_str_radix` tolerates a leading `+`, so check the digits
            // explicitly before parsing.
            if !digits.bytes().all(|d| d.is_ascii_hexdigit()) {
                panic!("{}", err(3));
            }
            match u8::from_str_radix(digits, 16) {
                Ok(value) => return (value, 4),
                Err(_) => panic!("{}", err(3)),
            }
        }
        // Only the escapes listed above are valid; do not guess at others.
        _ => panic!("{}", err(1)),
    };
    (b, 2)
}
594- 
595- fn  byte_str_lit ( lit :  & str )  -> Lrc < Vec < u8 > >  { 
596-  let  mut  res = Vec :: with_capacity ( lit. len ( ) ) ; 
597- 
598-  let  error = |i| panic ! ( "lexer should have rejected {} at {}" ,  lit,  i) ; 
599- 
600-  /// Eat everything up to a non-whitespace. 
601- fn  eat < I :  Iterator < Item =( usize ,  u8 ) > > ( it :  & mut  iter:: Peekable < I > )  { 
602-  loop  { 
603-  match  it. peek ( ) . map ( |x| x. 1 )  { 
604-  Some ( b' ' )  | Some ( b'\n' )  | Some ( b'\r' )  | Some ( b'\t' )  => { 
605-  it. next ( ) ; 
606-  } , 
607-  _ => {  break ;  } 
608-  } 
609-  } 
610-  } 
611- 
612-  // byte string literals *must* be ASCII, but the escapes don't have to be 
613-  let  mut  chars = lit. bytes ( ) . enumerate ( ) . peekable ( ) ; 
614-  loop  { 
615-  match  chars. next ( )  { 
616-  Some ( ( i,  b'\\' ) )  => { 
617-  match  chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1  { 
618-  b'\n'  => eat ( & mut  chars) , 
619-  b'\r'  => { 
620-  chars. next ( ) ; 
621-  if  chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1  != b'\n'  { 
622-  panic ! ( "lexer accepted bare CR" ) ; 
623-  } 
624-  eat ( & mut  chars) ; 
625-  } 
626-  _ => { 
627-  // otherwise, a normal escape 
628-  let  ( c,  n)  = byte_lit ( & lit[ i..] ) ; 
629-  // we don't need to move past the first \ 
630-  for  _ in  0 ..n - 1  { 
631-  chars. next ( ) ; 
632-  } 
633-  res. push ( c) ; 
634-  } 
635-  } 
636-  } , 
637-  Some ( ( i,  b'\r' ) )  => { 
638-  if  chars. peek ( ) . unwrap_or_else ( || error ( i) ) . 1  != b'\n'  { 
639-  panic ! ( "lexer accepted bare CR" ) ; 
640-  } 
641-  chars. next ( ) ; 
642-  res. push ( b'\n' ) ; 
643-  } 
644-  Some ( ( _,  c) )  => res. push ( c) , 
645-  None  => break , 
646-  } 
647-  } 
648- 
649-  Lrc :: new ( res) 
650- } 
651- 
652477fn  integer_lit ( s :  & str ,  suffix :  Option < Symbol > ,  diag :  Option < ( Span ,  & Handler ) > ) 
653478 -> Option < ast:: LitKind >  { 
654479 // s can only be ascii, byte indexing is fine 
0 commit comments