//! Operations related to UTF-8 validation.
22
use super::Utf8Error;
use crate::hint::assert_unchecked;
use crate::intrinsics::const_eval_select;
56
/// Returns the initial codepoint accumulator for the first byte.
///
/// The first byte is special: only the bottom 5 bits are wanted for width 2,
/// 4 bits for width 3, and 3 bits for width 4.
#[inline]
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    // `0x7F >> width` is a mask of the low `7 - width` bits, which drops the
    // length-marker prefix of the lead byte.
    (byte & (0x7F >> width)) as u32
}
13-
14- /// Returns the value of `ch` updated with continuation byte `byte`.
15- #[ inline]
16- const fn utf8_acc_cont_byte ( ch : u32 , byte : u8 ) -> u32 {
17- ( ch << 6 ) | ( byte & CONT_MASK ) as u32
18- }
19-
/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
/// bits `10`).
229#[ inline]
@@ -33,39 +20,49 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
3320#[ unstable( feature = "str_internals" , issue = "none" ) ]
3421#[ inline]
3522pub unsafe fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
36- // Decode UTF-8
37- let x = * bytes. next ( ) ?;
38- if x < 128 {
39- return Some ( x as u32 ) ;
23+ let b1 = * bytes. next ( ) ?;
24+ if b1 < 0x80 {
25+ // 1 byte case (U+0000 ..= U+007F):
26+ // c = b1
27+ return Some ( u32:: from ( b1) ) ;
4028 }
4129
42- // Multibyte case follows
43- // Decode from a byte combination out of: [[[x y] z] w]
44- // NOTE: Performance is sensitive to the exact formulation here
45- let init = utf8_first_byte ( x, 2 ) ;
46- // SAFETY: `bytes` produces an UTF-8-like string,
47- // so the iterator must produce a value here.
48- let y = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
49- let mut ch = utf8_acc_cont_byte ( init, y) ;
50- if x >= 0xE0 {
51- // [[x y z] w] case
52- // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
53- // SAFETY: `bytes` produces an UTF-8-like string,
54- // so the iterator must produce a value here.
55- let z = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
56- let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ;
57- ch = init << 12 | y_z;
58- if x >= 0xF0 {
59- // [x y z w] case
60- // use only the lower 3 bits of `init`
61- // SAFETY: `bytes` produces an UTF-8-like string,
62- // so the iterator must produce a value here.
63- let w = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
64- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ;
65- }
30+ // SAFETY: `bytes` produces a UTF-8-like string
31+ let mut next_byte = || unsafe {
32+ let b = * bytes. next ( ) . unwrap_unchecked ( ) ;
33+ assert_unchecked ( utf8_is_cont_byte ( b) ) ;
34+ b
35+ } ;
36+ let combine = |c : u32 , byte : u8 | c << 6 | u32:: from ( byte & CONT_MASK ) ;
37+
38+ let b2 = next_byte ( ) ;
39+ let c = u32:: from ( b1 & 0x1F ) ;
40+ let c = combine ( c, b2) ;
41+ if b1 < 0xE0 {
42+ // 2 byte case (U+0080 ..= U+07FF):
43+ // c = (b1 & 0x1F) << 6
44+ // | (b2 & 0x3F) << 0
45+ return Some ( c) ;
46+ }
47+
48+ let b3 = next_byte ( ) ;
49+ let c = combine ( c, b3) ;
50+ if b1 < 0xF0 {
51+ // 3 byte case (U+0800 ..= U+FFFF):
52+ // c = (b1 & 0x1F) << 12
53+ // | (b2 & 0x3F) << 6
54+ // | (b3 & 0x3F) << 0
55+ return Some ( c) ;
6656 }
6757
68- Some ( ch)
58+ let b4 = next_byte ( ) ;
59+ let c = combine ( c, b4) ;
60+ // 4 byte case (U+01_0000 ..= U+10_FFFF):
61+ // c = ((b1 & 0x1F) << 18
62+ // | (b2 & 0x3F) << 12
63+ // | (b3 & 0x3F) << 6
64+ // | (b4 & 0x3F) << 0) & 0x1F_FFFF
65+ Some ( c & 0x1F_FFFF )
6966}
7067
7168/// Reads the last code point out of a byte iterator (assuming a
@@ -80,36 +77,49 @@ pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
8077where
8178 I : DoubleEndedIterator < Item = & ' a u8 > ,
8279{
83- // Decode UTF-8
84- let w = match * bytes. next_back ( ) ? {
85- next_byte if next_byte < 128 => return Some ( next_byte as u32 ) ,
86- back_byte => back_byte,
80+ let b1 = * bytes. next_back ( ) ?;
81+ if b1 < 0x80 {
82+ // 1 byte case (U+0000 ..= U+007F):
83+ // c = b1
84+ return Some ( u32:: from ( b1) ) ;
85+ }
86+
87+ // SAFETY: `bytes` produces a UTF-8-like string
88+ let mut next_byte = || unsafe {
89+ let b = * bytes. next_back ( ) . unwrap_unchecked ( ) ;
90+ assert_unchecked ( !b. is_ascii ( ) ) ;
91+ b
8792 } ;
93+ let combine = |c : u32 , byte : u8 , shift| c | u32:: from ( byte & CONT_MASK ) << shift;
8894
89- // Multibyte case follows
90- // Decode from a byte combination out of: [x [y [z w]]]
91- let mut ch;
92- // SAFETY: `bytes` produces an UTF-8-like string,
93- // so the iterator must produce a value here.
94- let z = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
95- ch = utf8_first_byte ( z, 2 ) ;
96- if utf8_is_cont_byte ( z) {
97- // SAFETY: `bytes` produces an UTF-8-like string,
98- // so the iterator must produce a value here.
99- let y = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
100- ch = utf8_first_byte ( y, 3 ) ;
101- if utf8_is_cont_byte ( y) {
102- // SAFETY: `bytes` produces an UTF-8-like string,
103- // so the iterator must produce a value here.
104- let x = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
105- ch = utf8_first_byte ( x, 4 ) ;
106- ch = utf8_acc_cont_byte ( ch, y) ;
107- }
108- ch = utf8_acc_cont_byte ( ch, z) ;
95+ let b2 = next_byte ( ) ;
96+ let c = u32:: from ( b1 & CONT_MASK ) ;
97+ let c = combine ( c, b2, 6 ) ;
98+ if !utf8_is_cont_byte ( b2) {
99+ // 2 byte case (U+0080 ..= U+07FF):
100+ // c = (b2 & 0x3F) << 6
101+ // | (b1 & 0x3F) << 0
102+ return Some ( c) ;
103+ }
104+
105+ let b3 = next_byte ( ) ;
106+ let c = combine ( c, b3, 12 ) ;
107+ if !utf8_is_cont_byte ( b3) {
108+ // 3 byte case (U+0800 ..= U+FFFF):
109+ // c = ((b3 & 0x3F) << 12
110+ // | (b2 & 0x3F) << 6
111+ // | (b1 & 0x3F) << 0) & 0xFFFF
112+ return Some ( c & 0xFFFF ) ;
109113 }
110- ch = utf8_acc_cont_byte ( ch, w) ;
111114
112- Some ( ch)
115+ let b4 = next_byte ( ) ;
116+ let c = combine ( c, b4, 18 ) ;
117+ // 4 byte case (U+01_0000 ..= U+10_FFFF):
118+ // c = ((b4 & 0x3F) << 18
119+ // | (b3 & 0x3F) << 12
120+ // | (b2 & 0x3F) << 6
121+ // | (b1 & 0x3F) << 0) & 0x1F_FFFF
122+ Some ( c & 0x1F_FFFF )
113123}
114124
115125const NONASCII_MASK : usize = usize:: repeat_u8 ( 0x80 ) ;
@@ -280,5 +290,5 @@ pub const fn utf8_char_width(b: u8) -> usize {
280290 UTF8_CHAR_WIDTH [ b as usize ] as usize
281291}
282292
/// Mask of the value bits of a continuation byte (i.e., the lowest 6 bits).
const CONT_MASK: u8 = 0b0011_1111;
0 commit comments