Skip to content

Commit 26b614c

Browse files
committed
Optimize next_code_point and next_code_point_reverse
By inlining the helper functions, we can expose some opportunities for CSE. Also convert the series of nested `if` branches to early return, which IMO makes the code clearer. Comparison of assembly before and after for `next_code_point`: https://godbolt.org/z/bqcvaYxz1 Comparison of assembly before and after for `next_code_point_reverse`: https://godbolt.org/z/n9WM7hPxv
1 parent 1b5227d commit 26b614c

File tree

1 file changed

+70
-73
lines changed

1 file changed

+70
-73
lines changed

library/core/src/str/validations.rs

Lines changed: 70 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -3,20 +3,6 @@
33
use super::Utf8Error;
44
use crate::intrinsics::const_eval_select;
55

6-
/// Returns the initial codepoint accumulator for the first byte.
7-
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
8-
/// for width 3, and 3 bits for width 4.
9-
#[inline]
10-
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
11-
(byte & (0x7F >> width)) as u32
12-
}
13-
14-
/// Returns the value of `ch` updated with continuation byte `byte`.
15-
#[inline]
16-
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
17-
(ch << 6) | (byte & CONT_MASK) as u32
18-
}
19-
206
/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
217
/// bits `10`).
228
#[inline]
@@ -33,39 +19,46 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
3319
#[unstable(feature = "str_internals", issue = "none")]
3420
#[inline]
3521
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
36-
// Decode UTF-8
37-
let x = *bytes.next()?;
38-
if x < 128 {
39-
return Some(x as u32);
22+
let b1 = *bytes.next()? as u32;
23+
if b1 < 0x80 {
24+
// 1 byte (ASCII) case:
25+
// c = b1
26+
return Some(b1);
4027
}
4128

42-
// Multibyte case follows
43-
// Decode from a byte combination out of: [[[x y] z] w]
44-
// NOTE: Performance is sensitive to the exact formulation here
45-
let init = utf8_first_byte(x, 2);
46-
// SAFETY: `bytes` produces an UTF-8-like string,
47-
// so the iterator must produce a value here.
48-
let y = unsafe { *bytes.next().unwrap_unchecked() };
49-
let mut ch = utf8_acc_cont_byte(init, y);
50-
if x >= 0xE0 {
51-
// [[x y z] w] case
52-
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
53-
// SAFETY: `bytes` produces an UTF-8-like string,
54-
// so the iterator must produce a value here.
55-
let z = unsafe { *bytes.next().unwrap_unchecked() };
56-
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
57-
ch = init << 12 | y_z;
58-
if x >= 0xF0 {
59-
// [x y z w] case
60-
// use only the lower 3 bits of `init`
61-
// SAFETY: `bytes` produces an UTF-8-like string,
62-
// so the iterator must produce a value here.
63-
let w = unsafe { *bytes.next().unwrap_unchecked() };
64-
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
65-
}
29+
// SAFETY: `bytes` produces a UTF-8-like string
30+
let b2 = unsafe { *bytes.next().unwrap_unchecked() } as u32;
31+
let c = (b1 & 0x1F) << 6 | (b2 & 0x3F);
32+
if b1 < 0xE0 {
33+
// 2 byte case:
34+
// c = (b1 & 0x1F) << 6
35+
// | (b2 & 0x3F) << 0
36+
return Some(c);
6637
}
6738

68-
Some(ch)
39+
// SAFETY: `bytes` produces a UTF-8-like string
40+
let b3 = unsafe { *bytes.next().unwrap_unchecked() } as u32;
41+
let c = c << 6 | (b3 & 0x3F);
42+
if b1 < 0xF0 {
43+
// 3 byte case:
44+
// c = (b1 & 0x1F) << 12
45+
// | (b2 & 0x3F) << 6
46+
// | (b3 & 0x3F) << 0
47+
return Some(c);
48+
}
49+
50+
// SAFETY: `bytes` produces a UTF-8-like string
51+
let b4 = unsafe { *bytes.next().unwrap_unchecked() } as u32;
52+
let c = c << 6 | (b4 & 0x3F);
53+
// 4 byte case:
54+
// c = ((b1 & 0x1F) << 18
55+
// | (b2 & 0x3F) << 12
56+
// | (b3 & 0x3F) << 6
57+
// | (b4 & 0x3F) << 0) & 0x3F_FF_FF
58+
// Masking by 0x1F_FF_FF would be sufficient (since we only want the 21 lowest bits),
59+
// but masking by 0x3F_FF_FF lets x86 use a movzx instead of an and,
60+
// which has a shorter encoding.
61+
Some(c & 0x3F_FF_FF)
6962
}
7063

7164
/// Reads the last code point out of a byte iterator (assuming a
@@ -80,36 +73,43 @@ pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
8073
where
8174
I: DoubleEndedIterator<Item = &'a u8>,
8275
{
83-
// Decode UTF-8
84-
let w = match *bytes.next_back()? {
85-
next_byte if next_byte < 128 => return Some(next_byte as u32),
86-
back_byte => back_byte,
87-
};
76+
let b1 = *bytes.next_back()?;
77+
if b1 < 0x80 {
78+
// 1 byte (ASCII) case:
79+
// c = b1
80+
return Some(b1 as u32);
81+
}
8882

89-
// Multibyte case follows
90-
// Decode from a byte combination out of: [x [y [z w]]]
91-
let mut ch;
92-
// SAFETY: `bytes` produces an UTF-8-like string,
93-
// so the iterator must produce a value here.
94-
let z = unsafe { *bytes.next_back().unwrap_unchecked() };
95-
ch = utf8_first_byte(z, 2);
96-
if utf8_is_cont_byte(z) {
97-
// SAFETY: `bytes` produces an UTF-8-like string,
98-
// so the iterator must produce a value here.
99-
let y = unsafe { *bytes.next_back().unwrap_unchecked() };
100-
ch = utf8_first_byte(y, 3);
101-
if utf8_is_cont_byte(y) {
102-
// SAFETY: `bytes` produces an UTF-8-like string,
103-
// so the iterator must produce a value here.
104-
let x = unsafe { *bytes.next_back().unwrap_unchecked() };
105-
ch = utf8_first_byte(x, 4);
106-
ch = utf8_acc_cont_byte(ch, y);
107-
}
108-
ch = utf8_acc_cont_byte(ch, z);
83+
// SAFETY: `bytes` produces a UTF-8-like string
84+
let b2 = unsafe { *bytes.next_back().unwrap_unchecked() };
85+
let c = u32::from(b1 & 0x3F) | u32::from(b2 & 0x3F) << 6;
86+
if !utf8_is_cont_byte(b2) {
87+
// 2 byte case:
88+
// c = (b2 & 0x3F) << 6
89+
// | (b1 & 0x3F) << 0
90+
return Some(c);
10991
}
110-
ch = utf8_acc_cont_byte(ch, w);
11192

112-
Some(ch)
93+
// SAFETY: `bytes` produces a UTF-8-like string
94+
let b3 = unsafe { *bytes.next_back().unwrap_unchecked() };
95+
let c = c | u32::from(b3 & 0x3F) << 12;
96+
if !utf8_is_cont_byte(b3) {
97+
// 3 byte case:
98+
// c = ((b3 & 0x3F) << 12
99+
// | (b2 & 0x3F) << 6
100+
// | (b1 & 0x3F) << 0) & 0xFF_FF
101+
return Some(c & 0xFF_FF);
102+
}
103+
104+
// SAFETY: `bytes` produces a UTF-8-like string
105+
let b4 = unsafe { *bytes.next_back().unwrap_unchecked() };
106+
let c = c | u32::from(b4 & 0x07) << 18;
107+
// 4 byte case:
108+
// c = (b4 & 0x07) << 18
109+
// | (b3 & 0x3F) << 12
110+
// | (b2 & 0x3F) << 6
111+
// | (b1 & 0x3F) << 0
112+
Some(c)
113113
}
114114

115115
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
@@ -279,6 +279,3 @@ const UTF8_CHAR_WIDTH: &[u8; 256] = &[
279279
pub const fn utf8_char_width(b: u8) -> usize {
280280
UTF8_CHAR_WIDTH[b as usize] as usize
281281
}
282-
283-
/// Mask of the value bits of a continuation byte.
284-
const CONT_MASK: u8 = 0b0011_1111;

0 commit comments

Comments
 (0)