33use super :: Utf8Error ;
44use crate :: intrinsics:: const_eval_select;
55
6- /// Returns the initial codepoint accumulator for the first byte.
7- /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
8- /// for width 3, and 3 bits for width 4.
9- #[ inline]
10- const fn utf8_first_byte ( byte : u8 , width : u32 ) -> u32 {
11- ( byte & ( 0x7F >> width) ) as u32
12- }
13-
14- /// Returns the value of `ch` updated with continuation byte `byte`.
15- #[ inline]
16- const fn utf8_acc_cont_byte ( ch : u32 , byte : u8 ) -> u32 {
17- ( ch << 6 ) | ( byte & CONT_MASK ) as u32
18- }
19-
206/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
217/// bits `10`).
228#[ inline]
@@ -33,39 +19,46 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
3319#[ unstable( feature = "str_internals" , issue = "none" ) ]
3420#[ inline]
3521pub unsafe fn next_code_point < ' a , I : Iterator < Item = & ' a u8 > > ( bytes : & mut I ) -> Option < u32 > {
36- // Decode UTF-8
37- let x = * bytes. next ( ) ?;
38- if x < 128 {
39- return Some ( x as u32 ) ;
22+ let b1 = * bytes. next ( ) ? as u32 ;
23+ if b1 < 0x80 {
24+ // 1 byte (ASCII) case (high bit clear, the byte is the code point itself):
25+ // c = b1
26+ return Some ( b1) ;
4027 }
4128
42- // Multibyte case follows
43- // Decode from a byte combination out of: [[[x y] z] w]
44- // NOTE: Performance is sensitive to the exact formulation here
45- let init = utf8_first_byte ( x, 2 ) ;
46- // SAFETY: `bytes` produces an UTF-8-like string,
47- // so the iterator must produce a value here.
48- let y = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
49- let mut ch = utf8_acc_cont_byte ( init, y) ;
50- if x >= 0xE0 {
51- // [[x y z] w] case
52- // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
53- // SAFETY: `bytes` produces an UTF-8-like string,
54- // so the iterator must produce a value here.
55- let z = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
56- let y_z = utf8_acc_cont_byte ( ( y & CONT_MASK ) as u32 , z) ;
57- ch = init << 12 | y_z;
58- if x >= 0xF0 {
59- // [x y z w] case
60- // use only the lower 3 bits of `init`
61- // SAFETY: `bytes` produces an UTF-8-like string,
62- // so the iterator must produce a value here.
63- let w = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } ;
64- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ( y_z, w) ;
65- }
29+ // SAFETY: `bytes` produces a UTF-8-like string, so after a first byte >= 0x80 the iterator must produce a second byte here.
30+ let b2 = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } as u32 ;
31+ let c = ( b1 & 0x1F ) << 6 | ( b2 & 0x3F ) ;
32+ if b1 < 0xE0 {
33+ // 2 byte case (first byte below 0xE0):
34+ // c = (b1 & 0x1F) << 6
35+ // | (b2 & 0x3F) << 0
36+ return Some ( c) ;
6637 }
6738
68- Some ( ch)
39+ // SAFETY: `bytes` produces a UTF-8-like string, so after a first byte >= 0xE0 the iterator must produce a third byte here.
40+ let b3 = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } as u32 ;
41+ let c = c << 6 | ( b3 & 0x3F ) ;
42+ if b1 < 0xF0 {
43+ // 3 byte case (bit 4 of b1 is clear for 0xE0..=0xEF, so `b1 & 0x1F` contributes no stray bit):
44+ // c = (b1 & 0x1F) << 12
45+ // | (b2 & 0x3F) << 6
46+ // | (b3 & 0x3F) << 0
47+ return Some ( c) ;
48+ }
49+
50+ // SAFETY: `bytes` produces a UTF-8-like string, so after a first byte >= 0xF0 the iterator must produce a fourth byte here.
51+ let b4 = unsafe { * bytes. next ( ) . unwrap_unchecked ( ) } as u32 ;
52+ let c = c << 6 | ( b4 & 0x3F ) ;
53+ // 4 byte case (bit 4 of b1 is set for b1 >= 0xF0; shifted to bit 22 it is cleared by the final mask):
54+ // c = ((b1 & 0x1F) << 18
55+ // | (b2 & 0x3F) << 12
56+ // | (b3 & 0x3F) << 6
57+ // | (b4 & 0x3F) << 0) & 0x3F_FF_FF
58+ // Masking by 0x1F_FF_FF would be sufficient (since we only want the 21 lowest bits),
59+ // but masking by 0x3F_FF_FF lets x86 use a movzx instead of an and,
60+ // which has a shorter encoding.
61+ Some ( c & 0x3F_FF_FF )
6962}
7063
7164/// Reads the last code point out of a byte iterator (assuming a
@@ -80,36 +73,43 @@ pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
8073where
8174 I : DoubleEndedIterator < Item = & ' a u8 > ,
8275{
83- // Decode UTF-8
84- let w = match * bytes. next_back ( ) ? {
85- next_byte if next_byte < 128 => return Some ( next_byte as u32 ) ,
86- back_byte => back_byte,
87- } ;
76+ let b1 = * bytes. next_back ( ) ?;
77+ if b1 < 0x80 {
78+ // 1 byte (ASCII) case (high bit clear, the byte is the code point itself):
79+ // c = b1
80+ return Some ( b1 as u32 ) ;
81+ }
8882
89- // Multibyte case follows
90- // Decode from a byte combination out of: [x [y [z w]]]
91- let mut ch;
92- // SAFETY: `bytes` produces an UTF-8-like string,
93- // so the iterator must produce a value here.
94- let z = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
95- ch = utf8_first_byte ( z, 2 ) ;
96- if utf8_is_cont_byte ( z) {
97- // SAFETY: `bytes` produces an UTF-8-like string,
98- // so the iterator must produce a value here.
99- let y = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
100- ch = utf8_first_byte ( y, 3 ) ;
101- if utf8_is_cont_byte ( y) {
102- // SAFETY: `bytes` produces an UTF-8-like string,
103- // so the iterator must produce a value here.
104- let x = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
105- ch = utf8_first_byte ( x, 4 ) ;
106- ch = utf8_acc_cont_byte ( ch, y) ;
107- }
108- ch = utf8_acc_cont_byte ( ch, z) ;
83+ // SAFETY: `bytes` produces a UTF-8-like string, so a last byte >= 0x80 must be preceded by another byte.
84+ let b2 = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
85+ let c = u32:: from ( b1 & 0x3F ) | u32:: from ( b2 & 0x3F ) << 6 ;
86+ if !utf8_is_cont_byte ( b2) {
87+ // 2 byte case (b2 is the lead byte; for a 110xxxxx lead, `& 0x3F` already drops all marker bits):
88+ // c = (b2 & 0x3F) << 6
89+ // | (b1 & 0x3F) << 0
90+ return Some ( c) ;
10991 }
110- ch = utf8_acc_cont_byte ( ch, w) ;
11192
112- Some ( ch)
93+ // SAFETY: `bytes` produces a UTF-8-like string, so a continuation byte `b2` must be preceded by another byte.
94+ let b3 = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
95+ let c = c | u32:: from ( b3 & 0x3F ) << 12 ;
96+ if !utf8_is_cont_byte ( b3) {
97+ // 3 byte case (a 1110xxxx lead keeps bit 5 after `& 0x3F`; shifted to bit 17 it is cleared by `& 0xFF_FF`):
98+ // c = ((b3 & 0x3F) << 12
99+ // | (b2 & 0x3F) << 6
100+ // | (b1 & 0x3F) << 0) & 0xFF_FF
101+ return Some ( c & 0xFF_FF ) ;
102+ }
103+
104+ // SAFETY: `bytes` produces a UTF-8-like string, so a continuation byte `b3` must be preceded by another byte.
105+ let b4 = unsafe { * bytes. next_back ( ) . unwrap_unchecked ( ) } ;
106+ let c = c | u32:: from ( b4 & 0x07 ) << 18 ;
107+ // 4 byte case (b4 is the lead byte; only its low 3 bits carry code point data):
108+ // c = (b4 & 0x07) << 18
109+ // | (b3 & 0x3F) << 12
110+ // | (b2 & 0x3F) << 6
111+ // | (b1 & 0x3F) << 0
112+ Some ( c)
113113}
114114
115115const NONASCII_MASK : usize = usize:: repeat_u8 ( 0x80 ) ;
@@ -279,6 +279,3 @@ const UTF8_CHAR_WIDTH: &[u8; 256] = &[
279279pub const fn utf8_char_width ( b : u8 ) -> usize {
280280 UTF8_CHAR_WIDTH [ b as usize ] as usize
281281}
282-
283- /// Mask of the value bits of a continuation byte.
284- const CONT_MASK : u8 = 0b0011_1111 ;
0 commit comments