Skip to content

Commit eb24f76

Browse files
committed
Auto merge of #123606 - thomcc:thomcc/opt-lines, r=<try>
Optimize core::str::Lines::count `s.lines().count()+1` is somewhat common as a way to find the line number given a byte position, so it'd be nice if it were faster. This just generalizes the SWAR-optimized char counting code so that it can be used for SWAR-optimized line counting, so it's actually not a very complex PR. ## TODO - [x] benchmarks - [x] adjust comments - [ ] more tests ## Benchmarks `case00_libcore` is the new version, and `case01_fold_increment` is the previous implementation (the default impl of `Iterator::count()` is a fold that increments). ``` str::line_count::all_newlines_32kib::case00_libcore 4.35µs/iter +/- 11.00ns str::line_count::all_newlines_32kib::case01_fold_increment 779.99µs/iter +/- 8.43µs str::line_count::all_newlines_4kib::case00_libcore 562.00ns/iter +/- 5.00ns str::line_count::all_newlines_4kib::case01_fold_increment 97.81µs/iter +/- 1.48µs str::line_count::all_newlines_64b::case00_libcore 21.00ns/iter +/- 0.00ns str::line_count::all_newlines_64b::case01_fold_increment 1.49µs/iter +/- 32.00ns str::line_count::en_huge::case00_libcore 45.58µs/iter +/- 122.00ns str::line_count::en_huge::case01_fold_increment 167.62µs/iter +/- 609.00ns str::line_count::en_large::case00_libcore 734.00ns/iter +/- 6.00ns str::line_count::en_large::case01_fold_increment 2.62µs/iter +/- 9.00ns str::line_count::en_medium::case00_libcore 100.00ns/iter +/- 0.00ns str::line_count::en_medium::case01_fold_increment 347.00ns/iter +/- 0.00ns str::line_count::en_small::case00_libcore 18.00ns/iter +/- 1.00ns str::line_count::en_small::case01_fold_increment 60.00ns/iter +/- 2.00ns str::line_count::en_tiny::case00_libcore 6.00ns/iter +/- 0.00ns str::line_count::en_tiny::case01_fold_increment 60.00ns/iter +/- 0.00ns str::line_count::zh_huge::case00_libcore 40.63µs/iter +/- 85.00ns str::line_count::zh_huge::case01_fold_increment 205.10µs/iter +/- 1.62µs str::line_count::zh_large::case00_libcore 655.00ns/iter +/- 1.00ns str::line_count::zh_large::case01_fold_increment 
3.21µs/iter +/- 21.00ns str::line_count::zh_medium::case00_libcore 92.00ns/iter +/- 0.00ns str::line_count::zh_medium::case01_fold_increment 420.00ns/iter +/- 2.00ns str::line_count::zh_small::case00_libcore 20.00ns/iter +/- 1.00ns str::line_count::zh_small::case01_fold_increment 63.00ns/iter +/- 1.00ns str::line_count::zh_tiny::case00_libcore 6.00ns/iter +/- 0.00ns str::line_count::zh_tiny::case01_fold_increment 21.00ns/iter +/- 0.00ns ``` This is a speedup of around 2x-4x most of the time, but for some highly unrealistic scenarios (32KiB of newlines) it's up to almost 200x faster (because the time taken by the version in this PR is not dependent on the number of newlines in the input, but the old version is slower the more newlines are present). It's also much faster for small inputs, especially if they have newlines (10x faster for en_tiny). Real world cases will vary, don't read too much into these, I would expect 2x-4x speedup in general, since that's what it gets on the most realistic examples. Obviously a SIMD impl will beat this, but users who are really bottlenecked on this operation should probably just reach for crates.io (even if we provided a SIMD version, libcore can't use runtime CPU feature detection so they'd still be better off with something from crates.io).
2 parents 3fba278 + ef27373 commit eb24f76

File tree

8 files changed

+152
-32
lines changed

8 files changed

+152
-32
lines changed

library/alloc/tests/str.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,6 +1515,7 @@ fn test_lines() {
15151515
fn t(data: &str, expected: &[&str]) {
15161516
let lines: Vec<&str> = data.lines().collect();
15171517
assert_eq!(lines, expected);
1518+
assert_eq!(data.lines().count(), expected.len(), "{:?} vs {:?}", data, expected);
15181519
}
15191520
t("", &[]);
15201521
t("\n", &[""]);

library/core/benches/str.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use test::{black_box, Bencher};
44
mod char_count;
55
mod corpora;
66
mod iter;
7+
mod line_count;
78

89
#[bench]
910
fn str_validate_emoji(b: &mut Bencher) {

library/core/benches/str/char_count.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
use super::corpora::*;
22
use test::{black_box, Bencher};
33

4+
// FIXME: this is partially duplicated in line_count.rs
45
macro_rules! define_benches {
56
($( fn $name: ident($arg: ident: &str) $body: block )+) => {
67
define_benches!(mod en_tiny, en::TINY, $($name $arg $body)+);

library/core/benches/str/corpora.rs

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
//!
1313
//! Except for `mod emoji` (which is just a bunch of emoji), the strings were
1414
//! pulled from (localizations of) rust-lang.org.
15+
//!
16+
//! Newlines are thrown in fairly arbitrarily, as these are used for line
17+
//! counting tests as well.
1518
1619
macro_rules! repeat8 {
1720
($s:expr) => {
@@ -28,8 +31,8 @@ macro_rules! define_consts {
2831
}
2932

3033
pub mod en {
31-
pub const TINY: &str = "Mary had";
32-
pub const SMALL: &str = "Mary had a little lamb, Little lamb";
34+
pub const TINY: &str = "Mary had\n\n";
35+
pub const SMALL: &str = "Mary had a little lamb,\nLittle lamb";
3336
define_consts! {
3437
"Rust is blazingly fast and memory-efficient: with no runtime or garbage
3538
collector, it can power performance-critical services, run on embedded
@@ -45,25 +48,25 @@ pub mod en {
4548

4649
pub mod zh {
4750
pub const TINY: &str = "速度惊";
48-
pub const SMALL: &str = "速度惊人且内存利用率极高";
51+
pub const SMALL: &str = "速度惊人且内\n存利用率极高";
4952
define_consts! {
50-
"Rust 速度惊人且内存利用率极高。由于\
51-
没有运行时和垃圾回收,它能够胜任对性能要\
52-
求特别高的服务,可以在嵌入式设备上运行,\
53-
还能轻松和其他语言集成。Rust 丰富的类型\
54-
系统和所有权模型保证了内存安全和线程安全,\
55-
让您在编译期就能够消除各种各样的错误。\
56-
Rust 拥有出色的文档、友好的编译器和清晰\
57-
的错误提示信息, 还集成了一流的工具——\
58-
包管理器和构建工具, 智能地自动补全和类\
59-
型检验的多编辑器支持, 以及自动格式化代\
53+
"Rust 速度惊人且内存利用率极高。由于\n\
54+
没有运行时和垃圾回收,它能够胜任对性能要\n\
55+
求特别高的服务,可以在嵌入式设备上运行,\n\
56+
还能轻松和其他语言集成。Rust 丰富的类型\n\
57+
系统和所有权模型保证了内存安全和线程安全,\n\
58+
让您在编译期就能够消除各种各样的错误。\n\
59+
Rust 拥有出色的文档、友好的编译器和清晰\n\
60+
的错误提示信息, 还集成了一流的工具——\n\
61+
包管理器和构建工具, 智能地自动补全和类\n\
62+
型检验的多编辑器支持, 以及自动格式化代\n\
6063
码等等。"
6164
}
6265
}
6366

6467
pub mod ru {
6568
pub const TINY: &str = "Сотни";
66-
pub const SMALL: &str = "Сотни компаний по";
69+
// Contains one newline, mirroring `en::SMALL` / `zh::SMALL`, so the corpus
// is also meaningful input for line counting. The words stay separated
// (by the newline) instead of being fused together.
pub const SMALL: &str = "Сотни\nкомпаний по";
6770
define_consts! {
6871
"Сотни компаний по всему миру используют Rust в реальных\
6972
проектах для быстрых кросс-платформенных решений с\
@@ -86,3 +89,9 @@ pub mod emoji {
8689
🤚🖐✋🖖👌🤌🤏✌"
8790
}
8891
}
92+
93+
pub mod all_newlines {
94+
pub const SIXTY_FOUR_B: &str = repeat8!("\n\n\n\n\n\n\n\n");
95+
pub const FOUR_KIB: &str = repeat8!(repeat8!(repeat8!("\n\n\n\n\n\n\n\n")));
96+
pub const THIRTY_TWO_KIB: &str = repeat8!(repeat8!(repeat8!(repeat8!("\n\n\n\n\n\n\n\n"))));
97+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
use super::corpora::*;
2+
use test::{black_box, Bencher};
3+
4+
// FIXME: this is partially duplicated in char_count.rs
5+
macro_rules! define_benches {
6+
($( fn $name: ident($arg: ident: &str) $body: block )+) => {
7+
define_benches!(mod en_tiny, en::TINY, $($name $arg $body)+);
8+
define_benches!(mod en_small, en::SMALL, $($name $arg $body)+);
9+
define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+);
10+
define_benches!(mod en_large, en::LARGE, $($name $arg $body)+);
11+
define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+);
12+
13+
define_benches!(mod zh_tiny, zh::TINY, $($name $arg $body)+);
14+
define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+);
15+
define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+);
16+
define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+);
17+
define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+);
18+
19+
define_benches!(mod all_newlines_64b, all_newlines::SIXTY_FOUR_B, $($name $arg $body)+);
20+
define_benches!(mod all_newlines_4kib, all_newlines::FOUR_KIB, $($name $arg $body)+);
21+
define_benches!(mod all_newlines_32kib, all_newlines::THIRTY_TWO_KIB, $($name $arg $body)+);
22+
};
23+
(mod $mod_name: ident, $input: expr, $($name: ident $arg: ident $body: block)+) => {
24+
mod $mod_name {
25+
use super::*;
26+
$(
27+
#[bench]
28+
fn $name(bencher: &mut Bencher) {
29+
let input = $input;
30+
bencher.bytes = input.len() as u64;
31+
let mut input_s = input.to_string();
32+
bencher.iter(|| {
33+
let $arg: &str = &black_box(&mut input_s);
34+
black_box($body)
35+
})
36+
}
37+
)+
38+
}
39+
};
40+
}
41+
42+
define_benches! {
43+
fn case00_libcore(s: &str) {
44+
s.lines().count()
45+
}
46+
47+
fn case01_fold_increment(s: &str) {
48+
// same as the default `Iterator::count()` impl.
49+
s.lines().fold(0, |count, _| count + 1)
50+
}
51+
}

library/core/src/str/count.rs

Lines changed: 67 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
//! Code for efficiently counting the number of `char`s in a UTF-8 encoded
2-
//! string.
1+
//! Code for efficiently counting the number of `char`s or lines in a UTF-8
2+
//! encoded string.
3+
//!
4+
//! ## `char` count details
35
//!
46
//! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`,
57
//! followed by some number (possibly 0) of continuation bytes.
@@ -21,21 +23,76 @@ use core::intrinsics::unlikely;
2123

2224
const USIZE_SIZE: usize = core::mem::size_of::<usize>();
2325
const UNROLL_INNER: usize = 4;
26+
const LSB: usize = usize::repeat_u8(0x01);
2427

2528
#[inline]
2629
pub(super) fn count_chars(s: &str) -> usize {
30+
count::<CharCount>(s)
31+
}
32+
33+
#[inline]
34+
pub(super) fn count_lines(s: &str) -> usize {
35+
let newline_count = count::<NewlineCount>(s);
36+
// The logic for going from newline count to line count is a bit weird,
37+
// consider that `"foo\nbar"` is 2 lines, `"foo\nbar\n"` is also 2 lines,
38+
// `"\n"` is one line, and `""` is zero lines.
39+
let ends_with_newline = s.as_bytes().last() == Some(&b'\n');
40+
let is_single_newline = ends_with_newline && s.len() == 1;
41+
let is_special = is_single_newline || s.is_empty();
42+
let adjust_len_by_one = !ends_with_newline && !is_special;
43+
newline_count + adjust_len_by_one as usize
44+
}
45+
46+
trait CountPred {
47+
/// Bytes in `u` which match the pred must be `0x01` in the result, bytes
48+
/// which fail the pred must be `0x00`.
49+
fn test_each_byte_in_word(u: usize) -> usize;
50+
/// Slow path for small inputs.
51+
fn count_general_case(s: &[u8]) -> usize;
52+
}
53+
54+
struct CharCount;
55+
impl CountPred for CharCount {
56+
#[inline]
57+
fn count_general_case(s: &[u8]) -> usize {
58+
char_count_general_case(s)
59+
}
60+
#[inline]
61+
fn test_each_byte_in_word(u: usize) -> usize {
62+
contains_non_continuation_byte(u)
63+
}
64+
}
65+
struct NewlineCount;
66+
impl CountPred for NewlineCount {
67+
#[inline]
68+
fn count_general_case(s: &[u8]) -> usize {
69+
s.iter().filter(|b| **b == b'\n').count()
70+
}
71+
#[inline]
72+
fn test_each_byte_in_word(u: usize) -> usize {
73+
const NEWLINES: usize = usize::repeat_u8(b'\n');
74+
const NOT_MSB: usize = usize::repeat_u8(0x7f);
75+
// bytes of `diff` are nonzero when bytes of `u` don't contain newline
76+
let diff = u ^ NEWLINES;
77+
let res = !(((diff & NOT_MSB).wrapping_add(NOT_MSB) | diff) >> 7);
78+
res & LSB
79+
}
80+
}
81+
82+
#[inline]
83+
fn count<P: CountPred>(s: &str) -> usize {
2784
if s.len() < USIZE_SIZE * UNROLL_INNER {
2885
// Avoid entering the optimized implementation for strings where the
2986
// difference is not likely to matter, or where it might even be slower.
3087
// That said, a ton of thought was not spent on the particular threshold
3188
// here, beyond "this value seems to make sense".
32-
char_count_general_case(s.as_bytes())
89+
P::count_general_case(s.as_bytes())
3390
} else {
34-
do_count_chars(s)
91+
do_count::<P>(s)
3592
}
3693
}
3794

38-
fn do_count_chars(s: &str) -> usize {
95+
fn do_count<P: CountPred>(s: &str) -> usize {
3996
// For correctness, `CHUNK_SIZE` must be:
4097
//
4198
// - Less than or equal to 255, otherwise we'll overflow bytes in `counts`.
@@ -62,13 +119,13 @@ fn do_count_chars(s: &str) -> usize {
62119
// mode).
63120
//
64121
// The `unlikely` helps discourage LLVM from inlining the body, which is
65-
// nice, as we would rather not mark the `char_count_general_case` function
122+
// nice, as we would rather not mark the `P::count_general_case` function
66123
// as cold.
67124
if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) {
68-
return char_count_general_case(s.as_bytes());
125+
return P::count_general_case(s.as_bytes());
69126
}
70127

71-
let mut total = char_count_general_case(head) + char_count_general_case(tail);
128+
let mut total = P::count_general_case(head) + P::count_general_case(tail);
72129
// Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which
73130
// we call `sum_bytes_in_usize`.
74131
for chunk in body.chunks(CHUNK_SIZE) {
@@ -81,7 +138,7 @@ fn do_count_chars(s: &str) -> usize {
81138
for &word in unrolled {
82139
// Because `CHUNK_SIZE` is < 256, this addition can't cause the
83140
// count in any of the bytes to overflow into a subsequent byte.
84-
counts += contains_non_continuation_byte(word);
141+
counts += P::test_each_byte_in_word(word);
85142
}
86143
}
87144

@@ -97,7 +154,7 @@ fn do_count_chars(s: &str) -> usize {
97154
// Accumulate all the data in the remainder.
98155
let mut counts = 0;
99156
for &word in remainder {
100-
counts += contains_non_continuation_byte(word);
157+
counts += P::test_each_byte_in_word(word);
101158
}
102159
total += sum_bytes_in_usize(counts);
103160
break;
@@ -112,7 +169,6 @@ fn do_count_chars(s: &str) -> usize {
112169
// true)
113170
#[inline]
114171
fn contains_non_continuation_byte(w: usize) -> usize {
115-
const LSB: usize = usize::repeat_u8(0x01);
116172
((!w >> 7) | (w >> 6)) & LSB
117173
}
118174

library/core/src/str/iter.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,11 @@ impl<'a> Iterator for Lines<'a> {
11741174
fn last(mut self) -> Option<&'a str> {
11751175
self.next_back()
11761176
}
1177+
1178+
#[inline]
1179+
fn count(self) -> usize {
1180+
self.remainder().map_or(0, super::count::count_lines)
1181+
}
11771182
}
11781183

11791184
#[stable(feature = "rust1", since = "1.0.0")]

tests/ui/std/stdio-from.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,11 @@ use std::env;
55
use std::fs::File;
66
use std::io;
77
use std::io::{Read, Write};
8-
use std::process::{Command, Stdio};
98
use std::path::PathBuf;
9+
use std::process::{Command, Stdio};
1010

1111
fn main() {
12-
if env::args().len() > 1 {
13-
child().unwrap()
14-
} else {
15-
parent().unwrap()
16-
}
12+
if env::args().len() > 1 { child().unwrap() } else { parent().unwrap() }
1713
}
1814

1915
fn parent() -> io::Result<()> {
@@ -55,7 +51,7 @@ fn parent() -> io::Result<()> {
5551
for line in data.lines() {
5652
assert_eq!(line, "foo");
5753
}
58-
assert_eq!(data.lines().count(), 8);
54+
assert_eq!(data.lines().count(), 8, "{:?}", data);
5955
Ok(())
6056
}
6157

0 commit comments

Comments
 (0)