Skip to content

Commit 43c4909

Browse files
committed
Optimize next_code_point and next_code_point_reverse
By reordering some operations, we can expose some opportunities for common subexpression elimination (CSE). Also convert the series of nested `if` branches to early returns, which IMO makes the code clearer. Comparison of assembly before and after for `next_code_point`: https://godbolt.org/z/9Te84YzhK Comparison of assembly before and after for `next_code_point_reverse`: https://godbolt.org/z/fTx1a7oz1
1 parent 2c4b068 commit 43c4909

File tree

1 file changed

+80
-70
lines changed

1 file changed

+80
-70
lines changed

library/core/src/str/validations.rs

Lines changed: 80 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,9 @@
11
//! Operations related to UTF-8 validation.
22
33
use super::Utf8Error;
4+
use crate::hint::assert_unchecked;
45
use crate::intrinsics::const_eval_select;
56

6-
/// Returns the initial codepoint accumulator for the first byte.
7-
/// The first byte is special, only want bottom 5 bits for width 2, 4 bits
8-
/// for width 3, and 3 bits for width 4.
9-
#[inline]
10-
const fn utf8_first_byte(byte: u8, width: u32) -> u32 {
11-
(byte & (0x7F >> width)) as u32
12-
}
13-
14-
/// Returns the value of `ch` updated with continuation byte `byte`.
15-
#[inline]
16-
const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
17-
(ch << 6) | (byte & CONT_MASK) as u32
18-
}
19-
207
/// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the
218
/// bits `10`).
229
#[inline]
@@ -33,39 +20,49 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
3320
#[unstable(feature = "str_internals", issue = "none")]
3421
#[inline]
3522
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
36-
// Decode UTF-8
37-
let x = *bytes.next()?;
38-
if x < 128 {
39-
return Some(x as u32);
23+
let b1 = *bytes.next()?;
24+
if b1 < 0x80 {
25+
// 1 byte case (U+0000 ..= U+007F):
26+
// c = b1
27+
return Some(u32::from(b1));
4028
}
4129

42-
// Multibyte case follows
43-
// Decode from a byte combination out of: [[[x y] z] w]
44-
// NOTE: Performance is sensitive to the exact formulation here
45-
let init = utf8_first_byte(x, 2);
46-
// SAFETY: `bytes` produces an UTF-8-like string,
47-
// so the iterator must produce a value here.
48-
let y = unsafe { *bytes.next().unwrap_unchecked() };
49-
let mut ch = utf8_acc_cont_byte(init, y);
50-
if x >= 0xE0 {
51-
// [[x y z] w] case
52-
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
53-
// SAFETY: `bytes` produces an UTF-8-like string,
54-
// so the iterator must produce a value here.
55-
let z = unsafe { *bytes.next().unwrap_unchecked() };
56-
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
57-
ch = init << 12 | y_z;
58-
if x >= 0xF0 {
59-
// [x y z w] case
60-
// use only the lower 3 bits of `init`
61-
// SAFETY: `bytes` produces an UTF-8-like string,
62-
// so the iterator must produce a value here.
63-
let w = unsafe { *bytes.next().unwrap_unchecked() };
64-
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
65-
}
30+
// SAFETY: `bytes` produces a UTF-8-like string
31+
let mut next_byte = || unsafe {
32+
let b = *bytes.next().unwrap_unchecked();
33+
assert_unchecked(utf8_is_cont_byte(b));
34+
b
35+
};
36+
let combine = |c: u32, byte: u8| c << 6 | u32::from(byte & CONT_MASK);
37+
38+
let b2 = next_byte();
39+
let c = u32::from(b1 & 0x1F);
40+
let c = combine(c, b2);
41+
if b1 < 0xE0 {
42+
// 2 byte case (U+0080 ..= U+07FF):
43+
// c = (b1 & 0x1F) << 6
44+
// | (b2 & 0x3F) << 0
45+
return Some(c);
46+
}
47+
48+
let b3 = next_byte();
49+
let c = combine(c, b3);
50+
if b1 < 0xF0 {
51+
// 3 byte case (U+0800 ..= U+FFFF):
52+
// c = (b1 & 0x1F) << 12
53+
// | (b2 & 0x3F) << 6
54+
// | (b3 & 0x3F) << 0
55+
return Some(c);
6656
}
6757

68-
Some(ch)
58+
let b4 = next_byte();
59+
let c = combine(c, b4);
60+
// 4 byte case (U+01_0000 ..= U+10_FFFF):
61+
// c = ((b1 & 0x1F) << 18
62+
// | (b2 & 0x3F) << 12
63+
// | (b3 & 0x3F) << 6
64+
// | (b4 & 0x3F) << 0) & 0x1F_FFFF
65+
Some(c & 0x1F_FFFF)
6966
}
7067

7168
/// Reads the last code point out of a byte iterator (assuming a
@@ -80,36 +77,49 @@ pub unsafe fn next_code_point_reverse<'a, I>(bytes: &mut I) -> Option<u32>
8077
where
8178
I: DoubleEndedIterator<Item = &'a u8>,
8279
{
83-
// Decode UTF-8
84-
let w = match *bytes.next_back()? {
85-
next_byte if next_byte < 128 => return Some(next_byte as u32),
86-
back_byte => back_byte,
80+
let b1 = *bytes.next_back()?;
81+
if b1 < 0x80 {
82+
// 1 byte case (U+0000 ..= U+007F):
83+
// c = b1
84+
return Some(u32::from(b1));
85+
}
86+
87+
// SAFETY: `bytes` produces a UTF-8-like string
88+
let mut next_byte = || unsafe {
89+
let b = *bytes.next_back().unwrap_unchecked();
90+
assert_unchecked(!b.is_ascii());
91+
b
8792
};
93+
let combine = |c: u32, byte: u8, shift| c | u32::from(byte & CONT_MASK) << shift;
8894

89-
// Multibyte case follows
90-
// Decode from a byte combination out of: [x [y [z w]]]
91-
let mut ch;
92-
// SAFETY: `bytes` produces an UTF-8-like string,
93-
// so the iterator must produce a value here.
94-
let z = unsafe { *bytes.next_back().unwrap_unchecked() };
95-
ch = utf8_first_byte(z, 2);
96-
if utf8_is_cont_byte(z) {
97-
// SAFETY: `bytes` produces an UTF-8-like string,
98-
// so the iterator must produce a value here.
99-
let y = unsafe { *bytes.next_back().unwrap_unchecked() };
100-
ch = utf8_first_byte(y, 3);
101-
if utf8_is_cont_byte(y) {
102-
// SAFETY: `bytes` produces an UTF-8-like string,
103-
// so the iterator must produce a value here.
104-
let x = unsafe { *bytes.next_back().unwrap_unchecked() };
105-
ch = utf8_first_byte(x, 4);
106-
ch = utf8_acc_cont_byte(ch, y);
107-
}
108-
ch = utf8_acc_cont_byte(ch, z);
95+
let b2 = next_byte();
96+
let c = u32::from(b1 & CONT_MASK);
97+
let c = combine(c, b2, 6);
98+
if !utf8_is_cont_byte(b2) {
99+
// 2 byte case (U+0080 ..= U+07FF):
100+
// c = (b2 & 0x3F) << 6
101+
// | (b1 & 0x3F) << 0
102+
return Some(c);
103+
}
104+
105+
let b3 = next_byte();
106+
let c = combine(c, b3, 12);
107+
if !utf8_is_cont_byte(b3) {
108+
// 3 byte case (U+0800 ..= U+FFFF):
109+
// c = ((b3 & 0x3F) << 12
110+
// | (b2 & 0x3F) << 6
111+
// | (b1 & 0x3F) << 0) & 0xFFFF
112+
return Some(c & 0xFFFF);
109113
}
110-
ch = utf8_acc_cont_byte(ch, w);
111114

112-
Some(ch)
115+
let b4 = next_byte();
116+
let c = combine(c, b4, 18);
117+
// 4 byte case (U+01_0000 ..= U+10_FFFF):
118+
// c = ((b4 & 0x3F) << 18
119+
// | (b3 & 0x3F) << 12
120+
// | (b2 & 0x3F) << 6
121+
// | (b1 & 0x3F) << 0) & 0x1F_FFFF
122+
Some(c & 0x1F_FFFF)
113123
}
114124

115125
const NONASCII_MASK: usize = usize::repeat_u8(0x80);
@@ -280,5 +290,5 @@ pub const fn utf8_char_width(b: u8) -> usize {
280290
UTF8_CHAR_WIDTH[b as usize] as usize
281291
}
282292

283-
/// Mask of the value bits of a continuation byte.
293+
/// Mask of the value bits of a continuation byte (i.e., the lowest 6 bits).
284294
const CONT_MASK: u8 = 0b0011_1111;

0 commit comments

Comments
 (0)