rust-lang
diff --git a/‎library/alloc/tests/str.rs‎
Lines changed: 1 addition & 0 deletions b/‎library/alloc/tests/str.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎library/core/benches/str.rs‎
Lines changed: 1 addition & 0 deletions b/‎library/core/benches/str.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎library/core/benches/str/char_count.rs‎
Lines changed: 1 addition & 0 deletions b/‎library/core/benches/str/char_count.rs‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎library/core/benches/str/corpora.rs‎
Lines changed: 23 additions & 14 deletions b/‎library/core/benches/str/corpora.rs‎
Lines changed: 23 additions & 14 deletions
diff --git a/‎library/core/benches/str/line_count.rs‎
Lines changed: 51 additions & 0 deletions b/‎library/core/benches/str/line_count.rs‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎library/core/src/str/count.rs‎
Lines changed: 67 additions & 11 deletions b/‎library/core/src/str/count.rs‎
Lines changed: 67 additions & 11 deletions
diff --git a/‎library/core/src/str/iter.rs‎
Lines changed: 5 additions & 0 deletions b/‎library/core/src/str/iter.rs‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎tests/ui/std/stdio-from.rs‎
Lines changed: 3 additions & 7 deletions b/‎tests/ui/std/stdio-from.rs‎
Lines changed: 3 additions & 7 deletions
@@ -1515,6 +1515,7 @@ fn test_lines() {
  fn t(data: &str, expected: &[&str]) {
  let lines: Vec<&str> = data.lines().collect();
  assert_eq!(lines, expected);
+ assert_eq!(data.lines().count(), expected.len(), "{:?} vs {:?}", data, expected);
  }
  t("", &[]);
  t("\n", &[""]);
 
@@ -4,6 +4,7 @@ use test::{black_box, Bencher};
 mod char_count;
 mod corpora;
 mod iter;
+mod line_count;
 
 #[bench]
 fn str_validate_emoji(b: &mut Bencher) {
 
@@ -1,6 +1,7 @@
 use super::corpora::*;
 use test::{black_box, Bencher};
 
+// FIXME: this is partially duplicated in line_count.rs
 macro_rules! define_benches {
  ($( fn $name: ident($arg: ident: &str) $body: block )+) => {
  define_benches!(mod en_tiny, en::TINY, $($name $arg $body)+);
 
@@ -12,6 +12,9 @@
 //!
 //! Except for `mod emoji` (which is just a bunch of emoji), the strings were
 //! pulled from (localizations of) rust-lang.org.
+//!
+//! Newlines are thrown in fairly arbitrarily, as these are used for line
+//! counting tests as well.
 
 macro_rules! repeat8 {
  ($s:expr) => {
@@ -28,8 +31,8 @@ macro_rules! define_consts {
 }
 
 pub mod en {
- pub const TINY: &str = "Mary had";
- pub const SMALL: &str = "Mary had a little lamb, Little lamb";
+ pub const TINY: &str = "Mary had\n\n";
+ pub const SMALL: &str = "Mary had a little lamb,\nLittle lamb";
  define_consts! {
  "Rust is blazingly fast and memory-efficient: with no runtime or garbage
  collector, it can power performance-critical services, run on embedded
@@ -45,25 +48,25 @@ pub mod en {
 
 pub mod zh {
  pub const TINY: &str = "速度惊";
- pub const SMALL: &str = "速度惊人且内存利用率极高";
+ pub const SMALL: &str = "速度惊人且内\n存利用率极高";
  define_consts! {
- "Rust 速度惊人且内存利用率极高。由于\
- 没有运行时和垃圾回收，它能够胜任对性能要\
- 求特别高的服务，可以在嵌入式设备上运行，\
- 还能轻松和其他语言集成。Rust 丰富的类型\
- 系统和所有权模型保证了内存安全和线程安全，\
- 让您在编译期就能够消除各种各样的错误。\
- Rust 拥有出色的文档、友好的编译器和清晰\
- 的错误提示信息， 还集成了一流的工具——\
- 包管理器和构建工具， 智能地自动补全和类\
- 型检验的多编辑器支持， 以及自动格式化代\
+ "Rust 速度惊人且内存利用率极高。由于\n\
+ 没有运行时和垃圾回收，它能够胜任对性能要\n\
+ 求特别高的服务，可以在嵌入式设备上运行，\n\
+ 还能轻松和其他语言集成。Rust 丰富的类型\n\
+ 系统和所有权模型保证了内存安全和线程安全，\n\
+ 让您在编译期就能够消除各种各样的错误。\n\
+ Rust 拥有出色的文档、友好的编译器和清晰\n\
+ 的错误提示信息， 还集成了一流的工具——\n\
+ 包管理器和构建工具， 智能地自动补全和类\n\
+ 型检验的多编辑器支持， 以及自动格式化代\n\
  码等等。"
  }
 }
 
 pub mod ru {
  pub const TINY: &str = "Сотни";
- pub const SMALL: &str = "Сотни компаний по";
+ pub const SMALL: &str = "Сотникомпаний по";
  define_consts! {
  "Сотни компаний по всему миру используют Rust в реальных\
  проектах для быстрых кросс-платформенных решений с\
@@ -86,3 +89,9 @@ pub mod emoji {
  🤚🖐✋🖖👌🤌🤏✌"
  }
 }
+
+// Inputs consisting solely of `\n` bytes, used by the line-counting
+// benchmarks. Each base literal holds eight newlines.
+// NOTE(review): the constant names assume `repeat8!` repeats its argument
+// eight times (8 * 8 = 64 B, 8^3 * 8 = 4 KiB, 8^4 * 8 = 32 KiB) — confirm
+// against the macro definition.
+pub mod all_newlines {
+ pub const SIXTY_FOUR_B: &str = repeat8!("\n\n\n\n\n\n\n\n");
+ pub const FOUR_KIB: &str = repeat8!(repeat8!(repeat8!("\n\n\n\n\n\n\n\n")));
+ pub const THIRTY_TWO_KIB: &str = repeat8!(repeat8!(repeat8!(repeat8!("\n\n\n\n\n\n\n\n"))));
+}
@@ -0,0 +1,51 @@
+use super::corpora::*;
+use test::{black_box, Bencher};
+
+// FIXME: this is partially duplicated in char_count.rs
+//
+// Generates the line-counting benchmarks. The first arm accepts a list of
+// `fn name(arg: &str) { body }` items and re-invokes the macro once per
+// corpus; the second arm expands to one module per corpus containing a
+// `#[bench]` function for every item.
+macro_rules! define_benches {
+ ($( fn $name: ident($arg: ident: &str) $body: block )+) => {
+ define_benches!(mod en_tiny, en::TINY, $($name $arg $body)+);
+ define_benches!(mod en_small, en::SMALL, $($name $arg $body)+);
+ define_benches!(mod en_medium, en::MEDIUM, $($name $arg $body)+);
+ define_benches!(mod en_large, en::LARGE, $($name $arg $body)+);
+ define_benches!(mod en_huge, en::HUGE, $($name $arg $body)+);
+
+ define_benches!(mod zh_tiny, zh::TINY, $($name $arg $body)+);
+ define_benches!(mod zh_small, zh::SMALL, $($name $arg $body)+);
+ define_benches!(mod zh_medium, zh::MEDIUM, $($name $arg $body)+);
+ define_benches!(mod zh_large, zh::LARGE, $($name $arg $body)+);
+ define_benches!(mod zh_huge, zh::HUGE, $($name $arg $body)+);
+
+ define_benches!(mod all_newlines_64b, all_newlines::SIXTY_FOUR_B, $($name $arg $body)+);
+ define_benches!(mod all_newlines_4kib, all_newlines::FOUR_KIB, $($name $arg $body)+);
+ define_benches!(mod all_newlines_32kib, all_newlines::THIRTY_TWO_KIB, $($name $arg $body)+);
+ };
+ (mod $mod_name: ident, $input: expr, $($name: ident $arg: ident $body: block)+) => {
+ mod $mod_name {
+ use super::*;
+ $(
+ #[bench]
+ fn $name(bencher: &mut Bencher) {
+ let input = $input;
+ // Record the input length on the bencher (presumably so results
+ // can be reported as throughput).
+ bencher.bytes = input.len() as u64;
+ let mut input_s = input.to_string();
+ bencher.iter(|| {
+ // Both the input and the result go through `black_box` so the
+ // measured work is not optimized away.
+ let $arg: &str = &black_box(&mut input_s);
+ black_box($body)
+ })
+ }
+ )+
+ }
+ };
+}
+
+// Two ways of counting lines, each instantiated for every corpus listed in
+// the macro above: `Lines::count` itself, and a manual fold for comparison.
+define_benches! {
+ fn case00_libcore(s: &str) {
+ s.lines().count()
+ }
+
+ fn case01_fold_increment(s: &str) {
+ // same as the default `Iterator::count()` impl.
+ s.lines().fold(0, |count, _| count + 1)
+ }
+}
@@ -1,5 +1,7 @@
-//! Code for efficiently counting the number of `char`s in a UTF-8 encoded
-//! string.
+//! Code for efficiently counting the number of `char`s or lines in a UTF-8
+//! encoded string.
+//!
+//! ## `char` count details
 //!
 //! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`,
 //! followed by some number (possibly 0) of continuation bytes.
@@ -21,21 +23,76 @@ use core::intrinsics::unlikely;
 
 const USIZE_SIZE: usize = core::mem::size_of::<usize>();
 const UNROLL_INNER: usize = 4;
+const LSB: usize = usize::repeat_u8(0x01);
 
 #[inline]
 pub(super) fn count_chars(s: &str) -> usize {
+ count::<CharCount>(s)
+}
+
+/// Returns the number of lines in `s`.
+///
+/// Every `\n` terminates one line; text left over after the final `\n` (or
+/// non-empty text with no `\n` at all) counts as one more line.
+#[inline]
+pub(super) fn count_lines(s: &str) -> usize {
+    // Reference points: `""` is zero lines, `"\n"` is one line, and both
+    // `"foo\nbar"` and `"foo\nbar\n"` are two lines. So the only correction
+    // needed on top of the newline count is for a non-empty string whose last
+    // byte is not `\n`.
+    let has_unterminated_tail = match s.as_bytes().last() {
+        Some(&last_byte) => last_byte != b'\n',
+        None => false,
+    };
+    count::<NewlineCount>(s) + usize::from(has_unterminated_tail)
+}
+
+/// A byte predicate that the generic `count` / `do_count` routines are
+/// parameterized over.
+trait CountPred {
+ /// Tests all bytes of the word `u` at once.
+ ///
+ /// Bytes in `u` which match the pred must be `0x01` in the result, bytes
+ /// which fail the pred must be `0x00`. `do_count` adds these results
+ /// together and only later sums the individual bytes of the total.
+ fn test_each_byte_in_word(u: usize) -> usize;
+ /// Slow path for small inputs. `do_count` also calls it for the `head` and
+ /// `tail` slices on either side of `body`, and as its fallback.
+ fn count_general_case(s: &[u8]) -> usize;
+}
+
+/// `CountPred` behind `count_chars`. Both methods simply forward to the
+/// `char`-counting helpers `char_count_general_case` and
+/// `contains_non_continuation_byte`.
+struct CharCount;
+impl CountPred for CharCount {
+ #[inline]
+ fn count_general_case(s: &[u8]) -> usize {
+ char_count_general_case(s)
+ }
+ #[inline]
+ fn test_each_byte_in_word(u: usize) -> usize {
+ contains_non_continuation_byte(u)
+ }
+}
+/// `CountPred` that matches `b'\n'` bytes; `count_lines` is built on it.
+struct NewlineCount;
+impl CountPred for NewlineCount {
+    #[inline]
+    fn count_general_case(s: &[u8]) -> usize {
+        s.iter().filter(|&&byte| byte == b'\n').count()
+    }
+    #[inline]
+    fn test_each_byte_in_word(u: usize) -> usize {
+        const NEWLINES: usize = usize::repeat_u8(b'\n');
+        const NOT_MSB: usize = usize::repeat_u8(0x7f);
+        // XOR leaves a zero byte exactly where `u` holds a newline.
+        let diff = u ^ NEWLINES;
+        // Adding 0x7f to the low seven bits of a byte sets that byte's top bit
+        // iff any of those bits was set, and can never carry into the next
+        // byte. OR-ing `diff` back in covers bytes whose own top bit was set.
+        // The top bit of every byte of `nonzero` is therefore set iff the
+        // matching byte of `diff` is nonzero.
+        let nonzero = (diff & NOT_MSB).wrapping_add(NOT_MSB) | diff;
+        // Shift that flag down to the bottom bit of its byte, invert it so
+        // that newline bytes become 1, and mask away every other bit.
+        !(nonzero >> 7) & LSB
+    }
+}
+
+#[inline]
+fn count<P: CountPred>(s: &str) -> usize {
  if s.len() < USIZE_SIZE * UNROLL_INNER {
  // Avoid entering the optimized implementation for strings where the
  // difference is not likely to matter, or where it might even be slower.
  // That said, a ton of thought was not spent on the particular threshold
  // here, beyond "this value seems to make sense".
- char_count_general_case(s.as_bytes())
+ P::count_general_case(s.as_bytes())
  } else {
- do_count_chars(s)
+ do_count::<P>(s)
  }
 }
 
-fn do_count_chars(s: &str) -> usize {
+fn do_count<P: CountPred>(s: &str) -> usize {
  // For correctness, `CHUNK_SIZE` must be:
  //
  // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`.
@@ -62,13 +119,13 @@ fn do_count_chars(s: &str) -> usize {
  // mode).
  //
  // The `unlikely` helps discourage LLVM from inlining the body, which is
- // nice, as we would rather not mark the `char_count_general_case` function
+ // nice, as we would rather not mark the `P::count_general_case` function
  // as cold.
  if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) {
- return char_count_general_case(s.as_bytes());
+ return P::count_general_case(s.as_bytes());
  }
 
- let mut total = char_count_general_case(head) + char_count_general_case(tail);
+ let mut total = P::count_general_case(head) + P::count_general_case(tail);
  // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which
  // we call `sum_bytes_in_usize`.
  for chunk in body.chunks(CHUNK_SIZE) {
@@ -81,7 +138,7 @@ fn do_count_chars(s: &str) -> usize {
  for &word in unrolled {
  // Because `CHUNK_SIZE` is < 256, this addition can't cause the
  // count in any of the bytes to overflow into a subsequent byte.
- counts += contains_non_continuation_byte(word);
+ counts += P::test_each_byte_in_word(word);
  }
  }
 
@@ -97,7 +154,7 @@ fn do_count_chars(s: &str) -> usize {
  // Accumulate all the data in the remainder.
  let mut counts = 0;
  for &word in remainder {
- counts += contains_non_continuation_byte(word);
+ counts += P::test_each_byte_in_word(word);
  }
  total += sum_bytes_in_usize(counts);
  break;
@@ -112,7 +169,6 @@ fn do_count_chars(s: &str) -> usize {
 // true)
 #[inline]
 fn contains_non_continuation_byte(w: usize) -> usize {
- const LSB: usize = usize::repeat_u8(0x01);
  ((!w >> 7) | (w >> 6)) & LSB
 }
 
 
@@ -1174,6 +1174,11 @@ impl<'a> Iterator for Lines<'a> {
  fn last(mut self) -> Option<&'a str> {
  self.next_back()
  }
+
+ #[inline]
+ fn count(self) -> usize {
+ self.remainder().map_or(0, super::count::count_lines)
+ }
 }
 
 #[stable(feature = "rust1", since = "1.0.0")]
 
@@ -5,15 +5,11 @@ use std::env;
 use std::fs::File;
 use std::io;
 use std::io::{Read, Write};
-use std::process::{Command, Stdio};
 use std::path::PathBuf;
+use std::process::{Command, Stdio};
 
 fn main() {
- if env::args().len() > 1 {
- child().unwrap()
- } else {
- parent().unwrap()
- }
+ if env::args().len() > 1 { child().unwrap() } else { parent().unwrap() }
 }
 
 fn parent() -> io::Result<()> {
@@ -55,7 +51,7 @@ fn parent() -> io::Result<()> {
  for line in data.lines() {
  assert_eq!(line, "foo");
  }
- assert_eq!(data.lines().count(), 8);
+ assert_eq!(data.lines().count(), 8, "{:?}", data);
  Ok(())
 }
Original file line number	Diff line number	Diff line change
`@@ -1515,6 +1515,7 @@ fn test_lines() {`
`1515`	`1515`	`fn t(data: &str, expected: &[&str]) {`
`1516`	`1516`	`let lines: Vec<&str> = data.lines().collect();`
`1517`	`1517`	`assert_eq!(lines, expected);`
	`1518`	`+ assert_eq!(data.lines().count(), expected.len(), "{:?} vs {:?}", data, expected);`
`1518`	`1519`	`}`
`1519`	`1520`	`t("", &[]);`
`1520`	`1521`	`t("\n", &[""]);`
Original file line number	Diff line number	Diff line change
`@@ -1174,6 +1174,11 @@ impl<'a> Iterator for Lines<'a> {`
`1174`	`1174`	`fn last(mut self) -> Option<&'a str> {`
`1175`	`1175`	`self.next_back()`
`1176`	`1176`	`}`
	`1177`	`+`
	`1178`	`+ #[inline]`
	`1179`	`+ fn count(self) -> usize {`
	`1180`	`+ self.remainder().map_or(0, super::count::count_lines)`
	`1181`	`+ }`
`1177`	`1182`	`}`
`1178`	`1183`
`1179`	`1184`	`#[stable(feature = "rust1", since = "1.0.0")]`