Skip to content

Commit e822ec3

Browse files
committed
add .duplicate() and .duplicate_by(..) operations
Uses a HashMap to detect duplicates in an iterator and emits them only once. Items are never cloned. Signed-off-by: Petros Angelatos <petrosagg@gmail.com>
1 parent 4d902e3 commit e822ec3

File tree

4 files changed

+281
-0
lines changed

4 files changed

+281
-0
lines changed

src/duplicate_impl.rs

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
use std::cmp::{max, min};
2+
use std::collections::HashMap;
3+
use std::hash::Hash;
4+
use std::fmt;
5+
6+
/// An iterator adapter that yields only the elements whose key has already been seen once.
7+
///
8+
/// Each duplicated key is reported a single time: the second time it is encountered. Later
/// items mapping to the same key are skipped.
///
/// See [`.duplicate_by()`](../trait.Itertools.html#method.duplicate_by) for more information.
9+
#[derive(Clone)]
10+
#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
11+
pub struct DuplicateBy<I: Iterator, V, F> {
12+
// Underlying iterator being scanned.
iter: I,
13+
// Per-key flag: `false` after the first sighting, `true` once the duplicate has been yielded.
used: HashMap<V, bool>,
14+
// Number of keys in `used` that are still mapped to `false`.
pending: usize,
15+
// Key-extraction callable applied to each item (`Duplicate` stores `()` here instead).
f: F,
16+
}
17+
18+
// Manual `Debug` impl so that no `Debug` bound is required on `F`: only the `iter` and
// `used` fields are handed to `debug_fmt_fields!`; `pending` and `f` are not listed.
// NOTE(review): `debug_fmt_fields!` is a crate-local macro not visible here; presumably it
// expands to a `fmt` method printing the named fields -- confirm against its definition.
impl<I, V, F> fmt::Debug for DuplicateBy<I, V, F>
19+
where I: Iterator + fmt::Debug,
20+
V: fmt::Debug + Hash + Eq,
21+
{
22+
debug_fmt_fields!(DuplicateBy, iter, used);
23+
}
24+
25+
/// Create a new `DuplicateBy` iterator.
///
/// `iter` is the iterator to scan and `f` maps each item to the key used for duplicate
/// detection. The adaptor starts with an empty key map and zero pending keys.
26+
pub fn duplicate_by<I, V, F>(iter: I, f: F) -> DuplicateBy<I, V, F>
27+
where V: Eq + Hash,
28+
F: FnMut(&I::Item) -> V,
29+
I: Iterator,
30+
{
31+
DuplicateBy {
32+
iter,
33+
// No keys have been observed yet.
used: HashMap::new(),
34+
pending: 0,
35+
f,
36+
}
37+
}
38+
39+
impl<I, V, F> Iterator for DuplicateBy<I, V, F>
40+
where I: Iterator,
41+
V: Eq + Hash,
42+
F: FnMut(&I::Item) -> V
43+
{
44+
type Item = I::Item;
45+
46+
fn next(&mut self) -> Option<Self::Item> {
47+
while let Some(v) = self.iter.next() {
48+
let key = (self.f)(&v);
49+
match self.used.get_mut(&key) {
50+
None => {
51+
self.used.insert(key, false);
52+
self.pending += 1;
53+
},
54+
Some(true) => (),
55+
Some(produced) => {
56+
*produced = true;
57+
self.pending -= 1;
58+
return Some(v);
59+
},
60+
}
61+
}
62+
None
63+
}
64+
65+
#[inline]
66+
fn size_hint(&self) -> (usize, Option<usize>) {
67+
let (_, hi) = self.iter.size_hint();
68+
// There are `hi` number of items left in the base iterator. In the best case scenario,
69+
// these items are exactly the same as the ones pending (i.e items seen exactly once so
70+
// far), plus (hi - pending) / 2 pairs of never seen before items.
71+
let hi = hi.map(|hi| {
72+
let max_pending = min(self.pending, hi);
73+
let max_new = max(hi - self.pending, 0) / 2;
74+
max_pending + max_new
75+
});
76+
// The lower bound is always 0 since we might only get unique items from now on
77+
(0, hi)
78+
}
79+
}
80+
81+
// Back-to-front counterpart of `next`: identical bookkeeping, but items are pulled with
// `next_back` from the underlying iterator. The `used` map and `pending` counter are the
// same ones `next` updates, so a key first seen from one end and seen again from the other
// end is reported at that second sighting.
impl<I, V, F> DoubleEndedIterator for DuplicateBy<I, V, F>
82+
where I: DoubleEndedIterator,
83+
V: Eq + Hash,
84+
F: FnMut(&I::Item) -> V
85+
{
86+
fn next_back(&mut self) -> Option<Self::Item> {
87+
while let Some(v) = self.iter.next_back() {
88+
let key = (self.f)(&v);
89+
match self.used.get_mut(&key) {
90+
// First sighting of this key: record it and keep scanning.
None => {
91+
self.used.insert(key, false);
92+
self.pending += 1;
93+
},
94+
// Key was already reported as a duplicate: skip.
Some(true) => (),
95+
// Second sighting: flip the flag and yield this item.
Some(produced) => {
96+
*produced = true;
97+
self.pending -= 1;
98+
return Some(v);
99+
},
100+
}
101+
}
102+
None
103+
}
104+
}
105+
106+
impl<I> Iterator for Duplicate<I>
107+
where I: Iterator,
108+
I::Item: Eq + Hash
109+
{
110+
type Item = I::Item;
111+
112+
fn next(&mut self) -> Option<Self::Item> {
113+
while let Some(v) = self.iter.iter.next() {
114+
match self.iter.used.get_mut(&v) {
115+
None => {
116+
self.iter.used.insert(v, false);
117+
self.iter.pending += 1;
118+
},
119+
Some(true) => (),
120+
Some(produced) => {
121+
*produced = true;
122+
self.iter.pending -= 1;
123+
return Some(v);
124+
},
125+
}
126+
}
127+
None
128+
}
129+
130+
#[inline]
131+
fn size_hint(&self) -> (usize, Option<usize>) {
132+
let (_, hi) = self.iter.iter.size_hint();
133+
// There are `hi` number of items left in the base iterator. In the best case scenario,
134+
// these items are exactly the same as the ones pending (i.e items seen exactly once so
135+
// far), plus (hi - pending) / 2 pairs of never seen before items.
136+
let hi = hi.map(|hi| {
137+
let max_pending = min(self.iter.pending, hi);
138+
let max_new = max(hi - self.iter.pending, 0) / 2;
139+
max_pending + max_new
140+
});
141+
// The lower bound is always 0 since we might only get unique items from now on
142+
(0, hi)
143+
}
144+
}
145+
146+
// Back-to-front counterpart of `next`: identical bookkeeping, but items are pulled with
// `next_back` from the underlying iterator. The item itself serves as the map key, and the
// `used` map and `pending` counter are the same ones `next` updates.
impl<I> DoubleEndedIterator for Duplicate<I>
147+
where I: DoubleEndedIterator,
148+
I::Item: Eq + Hash
149+
{
150+
fn next_back(&mut self) -> Option<Self::Item> {
151+
while let Some(v) = self.iter.iter.next_back() {
152+
match self.iter.used.get_mut(&v) {
153+
// First sighting: move the item into the map and keep scanning.
None => {
154+
self.iter.used.insert(v, false);
155+
self.iter.pending += 1;
156+
},
157+
// Item was already reported as a duplicate: skip.
Some(true) => (),
158+
// Second sighting: flip the flag and yield this item.
Some(produced) => {
159+
*produced = true;
160+
self.iter.pending -= 1;
161+
return Some(v);
162+
},
163+
}
164+
}
165+
None
166+
}
167+
}
168+
169+
/// An iterator adapter that yields only the elements that have already been seen once.
170+
///
/// Each duplicated element is reported a single time: the second time it is encountered.
///
171+
/// See [`.duplicate()`](../trait.Itertools.html#method.duplicate) for more information.
172+
#[derive(Clone)]
173+
#[must_use = "iterator adaptors are lazy and do nothing unless consumed"]
174+
pub struct Duplicate<I: Iterator> {
175+
// Reuses `DuplicateBy`'s fields with the item type as the key type and `()` in the
// key-function slot; the `Iterator` impl for `Duplicate` accesses these fields directly.
iter: DuplicateBy<I, I::Item, ()>,
176+
}
177+
178+
// Manual `Debug` impl that hands the single `iter` field to `debug_fmt_fields!`.
// NOTE(review): `debug_fmt_fields!` is a crate-local macro not visible here; presumably it
// expands to a `fmt` method printing the named fields -- confirm against its definition.
impl<I> fmt::Debug for Duplicate<I>
179+
where I: Iterator + fmt::Debug,
180+
I::Item: Hash + Eq + fmt::Debug,
181+
{
182+
debug_fmt_fields!(Duplicate, iter);
183+
}
184+
185+
/// Create a new `Duplicate` iterator.
///
/// Wraps `iter` in a `DuplicateBy` state with an empty map, zero pending items and `()` in
/// place of a key function (the items themselves are used as keys).
pub fn duplicate<I>(iter: I) -> Duplicate<I>
186+
where I: Iterator,
187+
I::Item: Eq + Hash,
188+
{
189+
Duplicate {
190+
iter: DuplicateBy {
191+
iter,
192+
used: HashMap::new(),
193+
pending: 0,
194+
// No key function is needed; `Duplicate`'s own impls never call `f`.
f: (),
195+
}
196+
}
197+
}

src/lib.rs

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ pub mod structs {
147147
pub use crate::tee::Tee;
148148
pub use crate::tuple_impl::{TupleBuffer, TupleWindows, CircularTupleWindows, Tuples};
149149
#[cfg(feature = "use_std")]
150+
pub use crate::duplicate_impl::{Duplicate, DuplicateBy};
151+
#[cfg(feature = "use_std")]
150152
pub use crate::unique_impl::{Unique, UniqueBy};
151153
pub use crate::with_position::WithPosition;
152154
pub use crate::zip_eq_impl::ZipEq;
@@ -228,6 +230,8 @@ mod sources;
228230
mod tee;
229231
mod tuple_impl;
230232
#[cfg(feature = "use_std")]
233+
mod duplicate_impl;
234+
#[cfg(feature = "use_std")]
231235
mod unique_impl;
232236
mod with_position;
233237
mod zip_eq_impl;
@@ -1145,6 +1149,54 @@ pub trait Itertools : Iterator {
11451149
adaptors::dedup_by_with_count(self, cmp)
11461150
}
11471151

1152+
/// Return an iterator adaptor that produces elements that appear more than once during the
1153+
/// iteration. Duplicates are detected using hash and equality.
1154+
///
1155+
/// The iterator is stable, returning the duplicate items in the order in which they occur in
1156+
/// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more
1157+
/// than twice, the second item is the item retained and the rest are discarded.
1158+
///
/// The first occurrence of every item is stored (moved, not cloned) in a hash map inside the
/// iterator.
///
1159+
/// ```
1160+
/// use itertools::Itertools;
1161+
///
1162+
/// let data = vec![10, 20, 30, 20, 40, 10, 50];
1163+
/// itertools::assert_equal(data.into_iter().duplicate(),
1164+
/// vec![20, 10]);
1165+
/// ```
1166+
#[cfg(feature = "use_std")]
1167+
fn duplicate(self) -> Duplicate<Self>
1168+
where Self: Sized,
1169+
Self::Item: Eq + Hash
1170+
{
1171+
duplicate_impl::duplicate(self)
1172+
}
1173+
1174+
/// Return an iterator adaptor that produces elements whose key appears more than once during
1175+
/// the iteration.
1176+
///
1177+
/// Duplicates are detected by comparing the key they map to with the keying function `f` by
1178+
/// hash and equality. The keys are stored in a hash map in the iterator.
1179+
///
1180+
/// The iterator is stable, returning the duplicate items in the order in which they occur in
1181+
/// the adapted iterator. Each duplicate item is returned exactly once. If an item appears more
1182+
/// than twice, the second item is the item retained and the rest are discarded.
1183+
///
1184+
/// ```
1185+
/// use itertools::Itertools;
1186+
///
1187+
/// let data = vec!["a", "bb", "aa", "c", "ccc"];
1188+
/// itertools::assert_equal(data.into_iter().duplicate_by(|s| s.len()),
1189+
/// vec!["aa", "c"]);
1190+
/// ```
1191+
#[cfg(feature = "use_std")]
1192+
fn duplicate_by<V, F>(self, f: F) -> DuplicateBy<Self, V, F>
1193+
where Self: Sized,
1194+
V: Eq + Hash,
1195+
F: FnMut(&Self::Item) -> V
1196+
{
1197+
duplicate_impl::duplicate_by(self, f)
1198+
}
1199+
11481200
/// Return an iterator adaptor that filters out elements that have
11491201
/// already been produced once during the iteration. Duplicates
11501202
/// are detected using hash and equality.

tests/quick.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -915,6 +915,12 @@ quickcheck! {
915915
}
916916
}
917917

918+
// Property test: passes `it.duplicate()` to the `correct_size_hint` helper for arbitrary
// `Iter<i8>` inputs.
// NOTE(review): `correct_size_hint` is defined outside this excerpt; presumably it checks the
// reported `size_hint` bounds against the real number of items -- confirm.
quickcheck! {
919+
fn size_duplicate(it: Iter<i8>) -> bool {
920+
correct_size_hint(it.duplicate())
921+
}
922+
}
923+
918924
quickcheck! {
919925
fn size_unique(it: Iter<i8>) -> bool {
920926
correct_size_hint(it.unique())

tests/test_std.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,32 @@ fn interleave_shortest() {
5959
assert_eq!(it.size_hint(), (6, Some(6)));
6060
}
6161

62+
// Checks `duplicate_by` keyed on the first two characters of each string, in three ways:
// forward iteration, a reversed input consumed from the back, and a plain input consumed
// from the back.
#[test]
63+
fn duplicate_by() {
64+
let xs = ["aaa", "bbbbb", "aa", "ccc", "bbbb", "aaaaa", "cccc"];
65+
// Expected when scanning `xs` front to back: the second item seen for each two-char prefix.
let ys = ["aa", "bbbb", "cccc"];
66+
it::assert_equal(ys.iter(), xs.iter().duplicate_by(|x| x[..2].to_string()));
67+
// Reversing the input and then consuming from the back scans `xs` front to back again.
it::assert_equal(ys.iter(), xs.iter().rev().duplicate_by(|x| x[..2].to_string()).rev());
68+
// Expected when scanning `xs` back to front.
let ys_rev = ["ccc", "aa", "bbbbb"];
69+
it::assert_equal(ys_rev.iter(), xs.iter().duplicate_by(|x| x[..2].to_string()).rev());
70+
}
71+
72+
// Checks `duplicate` on two integer inputs, each in three ways: forward iteration, a reversed
// input consumed from the back, and a plain input consumed from the back.
#[test]
73+
fn duplicate() {
74+
let xs = [0, 1, 2, 3, 2, 1, 3];
75+
// Expected front to back: each value at its second occurrence; `0` occurs only once.
let ys = [2, 1, 3];
76+
it::assert_equal(ys.iter(), xs.iter().duplicate());
77+
it::assert_equal(ys.iter(), xs.iter().rev().duplicate().rev());
78+
// Expected when scanning `xs` back to front.
let ys_rev = [3, 2, 1];
79+
it::assert_equal(ys_rev.iter(), xs.iter().duplicate().rev());
80+
81+
// Second case: every value occurs exactly twice.
let xs = [0, 1, 0, 1];
82+
let ys = [0, 1];
83+
it::assert_equal(ys.iter(), xs.iter().duplicate());
84+
it::assert_equal(ys.iter(), xs.iter().rev().duplicate().rev());
85+
let ys_rev = [1, 0];
86+
it::assert_equal(ys_rev.iter(), xs.iter().duplicate().rev());
87+
}
6288

6389
#[test]
6490
fn unique_by() {

0 commit comments

Comments
 (0)