Skip to content

Commit 7beb8a6

Browse files
committed
add backwards iterator
1 parent 6f96a23 commit 7beb8a6

File tree

2 files changed

+113
-31
lines changed

2 files changed

+113
-31
lines changed

src/lib.rs

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ pub trait UnicodeSegmentation {
138138
///
139139
/// assert_eq!(&uw1[..], b);
140140
/// ```
141-
fn unicode_words(&self) -> UnicodeWords;
141+
fn unicode_words(&self) -> UnicodeWords<'_>;
142142

143143
/// Returns an iterator over the words of `self`, separated on
144144
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
@@ -162,7 +162,7 @@ pub trait UnicodeSegmentation {
162162
///
163163
/// assert_eq!(&uwi1[..], b);
164164
/// ```
165-
fn unicode_word_indices(&self) -> UnicodeWordIndices;
165+
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>;
166166

167167
/// Returns an iterator over substrings of `self` separated on
168168
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
@@ -178,7 +178,7 @@ pub trait UnicodeSegmentation {
178178
///
179179
/// assert_eq!(&swu1[..], b);
180180
/// ```
181-
fn split_word_bounds(&self) -> UWordBounds;
181+
fn split_word_bounds(&self) -> UWordBounds<'_>;
182182

183183
/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
184184
/// and their offsets. See `split_word_bounds()` for more information.
@@ -193,7 +193,7 @@ pub trait UnicodeSegmentation {
193193
///
194194
/// assert_eq!(&swi1[..], b);
195195
/// ```
196-
fn split_word_bound_indices(&self) -> UWordBoundIndices;
196+
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>;
197197

198198
/// Returns an iterator over substrings of `self` separated on
199199
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -215,7 +215,7 @@ pub trait UnicodeSegmentation {
215215
///
216216
/// assert_eq!(&us1[..], b);
217217
/// ```
218-
fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str>;
218+
fn unicode_sentences(&self) -> UnicodeSentences<'_>;
219219

220220
/// Returns an iterator over substrings of `self` separated on
221221
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -253,7 +253,7 @@ pub trait UnicodeSegmentation {
253253

254254
impl UnicodeSegmentation for str {
255255
#[inline]
256-
fn graphemes(&self, is_extended: bool) -> Graphemes {
256+
fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
257257
grapheme::new_graphemes(self, is_extended)
258258
}
259259

@@ -263,32 +263,32 @@ impl UnicodeSegmentation for str {
263263
}
264264

265265
#[inline]
266-
fn unicode_words(&self) -> UnicodeWords {
266+
fn unicode_words(&self) -> UnicodeWords<'_> {
267267
word::new_unicode_words(self)
268268
}
269269

270270
#[inline]
271-
fn unicode_word_indices(&self) -> UnicodeWordIndices {
271+
fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
272272
word::new_unicode_word_indices(self)
273273
}
274274

275275
#[inline]
276-
fn split_word_bounds(&self) -> UWordBounds {
276+
fn split_word_bounds(&self) -> UWordBounds<'_> {
277277
word::new_word_bounds(self)
278278
}
279279

280280
#[inline]
281-
fn split_word_bound_indices(&self) -> UWordBoundIndices {
281+
fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
282282
word::new_word_bound_indices(self)
283283
}
284284

285285
#[inline]
286-
fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str> {
286+
fn unicode_sentences(&self) -> UnicodeSentences<'_> {
287287
sentence::new_unicode_sentences(self)
288288
}
289289

290290
#[inline]
291-
fn split_sentence_bounds(&self) -> USentenceBounds {
291+
fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
292292
sentence::new_sentence_bounds(self)
293293
}
294294

src/word.rs

Lines changed: 101 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
extern crate alloc;
1212
use alloc::boxed::Box;
1313
use core::cmp;
14-
use core::iter::Filter;
1514

1615
use crate::tables::word::WordCat;
1716

@@ -28,7 +27,7 @@ use crate::tables::word::WordCat;
2827
/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
2928
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
3029
pub struct UnicodeWords<'a> {
31-
inner: Box<dyn Iterator<Item = &'a str> + 'a>,
30+
inner: Box<dyn DoubleEndedIterator<Item = &'a str> + 'a>,
3231
}
3332

3433
impl<'a> Iterator for UnicodeWords<'a> {
@@ -45,6 +44,13 @@ impl<'a> Iterator for UnicodeWords<'a> {
4544
}
4645
}
4746

47+
impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
48+
#[inline]
49+
fn next_back(&mut self) -> Option<&'a str> {
50+
self.inner.next_back()
51+
}
52+
}
53+
4854
/// An iterator over the substrings of a string which, after splitting the string on
4955
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
5056
/// contain any characters with the
@@ -58,16 +64,15 @@ impl<'a> Iterator for UnicodeWords<'a> {
5864
///
5965
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
6066
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
61-
#[derive(Debug)]
6267
pub struct UnicodeWordIndices<'a> {
6368
#[allow(clippy::type_complexity)]
64-
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
69+
inner: Box<dyn DoubleEndedIterator<Item = (usize, &'a str)> + 'a>,
6570
}
6671

6772
impl<'a> Iterator for UnicodeWordIndices<'a> {
6873
type Item = (usize, &'a str);
6974

70-
#[inline]
75+
#[inline(always)]
7176
fn next(&mut self) -> Option<(usize, &'a str)> {
7277
self.inner.next()
7378
}
@@ -722,12 +727,12 @@ impl<'a> AsciiWordBoundIter<'a> {
722727
AsciiWordBoundIter { rest: s, offset: 0 }
723728
}
724729

725-
#[inline(always)]
730+
#[inline]
726731
fn is_core(b: u8) -> bool {
727732
b.is_ascii_alphanumeric() || b == b'_'
728733
}
729734

730-
#[inline(always)]
735+
#[inline]
731736
fn is_infix(b: u8, prev: u8, next: u8) -> bool {
732737
match b {
733738
// numeric separators
@@ -744,6 +749,7 @@ impl<'a> AsciiWordBoundIter<'a> {
744749
impl<'a> Iterator for AsciiWordBoundIter<'a> {
745750
type Item = (usize, &'a str);
746751

752+
#[inline]
747753
fn next(&mut self) -> Option<Self::Item> {
748754
if self.rest.is_empty() {
749755
return None;
@@ -802,6 +808,66 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
802808
}
803809
}
804810

811+
impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
812+
fn next_back(&mut self) -> Option<(usize, &'a str)> {
813+
let rest = self.rest;
814+
if rest.is_empty() {
815+
return None;
816+
}
817+
let bytes = rest.as_bytes();
818+
let len = bytes.len();
819+
820+
// 1) Trailing spaces
821+
if bytes[len - 1] == b' ' {
822+
// find start of this last run of spaces
823+
let mut start = len - 1;
824+
while start > 0 && bytes[start - 1] == b' ' {
825+
start -= 1;
826+
}
827+
let word = &rest[start..];
828+
let pos = self.offset + start;
829+
self.rest = &rest[..start];
830+
return Some((pos, word));
831+
}
832+
833+
// 2) Trailing core-run (letters/digits/underscore + infix)
834+
if Self::is_core(bytes[len - 1]) {
835+
// scan backwards as long as we see `is_core` or an `is_infix`
836+
let mut start = len - 1;
837+
while start > 0 {
838+
let b = bytes[start - 1];
839+
let prev = if start >= 2 { bytes[start - 2] } else { b };
840+
let next = bytes[start]; // the byte we just included
841+
if Self::is_core(b) || Self::is_infix(b, prev, next) {
842+
start -= 1;
843+
} else {
844+
break;
845+
}
846+
}
847+
let word = &rest[start..];
848+
let pos = self.offset + start;
849+
self.rest = &rest[..start];
850+
return Some((pos, word));
851+
}
852+
853+
// 3) CR+LF at end
854+
if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' {
855+
let start = len - 2;
856+
let word = &rest[start..];
857+
let pos = self.offset + start;
858+
self.rest = &rest[..start];
859+
return Some((pos, word));
860+
}
861+
862+
// 4) Single non-core byte
863+
let start = len - 1;
864+
let word = &rest[start..];
865+
let pos = self.offset + start;
866+
self.rest = &rest[..start];
867+
Some((pos, word))
868+
}
869+
}
870+
805871
#[inline]
806872
pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
807873
UWordBounds {
@@ -832,20 +898,25 @@ fn has_alphanumeric(s: &&str) -> bool {
832898
}
833899

834900
#[inline]
835-
fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator<Item = &'a str> + 'a {
901+
fn has_ascii_alphanumeric(s: &&str) -> bool {
902+
s.chars().any(|c| c.is_ascii_alphanumeric())
903+
}
904+
905+
#[inline]
906+
fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator<Item = &'a str> + 'a {
836907
new_ascii_word_bound_indices(s)
837908
.map(|(_, w)| w)
838-
.filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric()))
909+
.filter(has_ascii_alphanumeric)
839910
}
840911

841912
#[inline]
842-
fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator<Item = &'a str> + 'a {
913+
fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator<Item = &'a str> + 'a {
843914
new_word_bounds(s).filter(has_alphanumeric)
844915
}
845916

846917
#[inline]
847918
pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
848-
let iter: Box<dyn Iterator<Item = &str>> = if s.is_ascii() {
919+
let iter: Box<dyn DoubleEndedIterator<Item = &str>> = if s.is_ascii() {
849920
Box::new(new_unicode_words_ascii(s))
850921
} else {
851922
Box::new(new_unicode_words_general(s))
@@ -855,14 +926,13 @@ pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
855926
}
856927

857928
#[inline]
858-
pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
859-
use super::UnicodeSegmentation;
860-
861-
UnicodeWordIndices {
862-
inner: s
863-
.split_word_bound_indices()
864-
.filter(|(_, c)| has_alphanumeric(c)),
865-
}
929+
pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> {
930+
let iter: Box<dyn DoubleEndedIterator<Item = (usize, &str)>> = if s.is_ascii() {
931+
Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w)))
932+
} else {
933+
Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w)))
934+
};
935+
UnicodeWordIndices { inner: iter }
866936
}
867937

868938
#[cfg(test)]
@@ -921,5 +991,17 @@ mod tests {
921991

922992
prop_assert_eq!(fast, uni);
923993
}
994+
995+
/// Fast path must equal general path for any ASCII input, forwards and backwards.
996+
#[test]
997+
fn proptest_ascii_matches_unicode_word_indices_rev(
998+
// Vec<char> → String, length 0‒99
999+
s in proptest::collection::vec(ascii_char(), 0..100)
1000+
.prop_map(|v| v.into_iter().collect::<String>())
1001+
) {
1002+
let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
1003+
let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
1004+
prop_assert_eq!(fast_rev, uni_rev);
1005+
}
9241006
}
9251007
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy