Add backwards iteration (DoubleEndedIterator) to the word iterators

PSeitz-dd · PSeitz-dd · commit 7beb8a60b73d · 2025-07-15T20:35:02.000+08:00
diff --git a/src/lib.rs b/src/lib.rs
@@ -138,7 +138,7 @@ pub trait UnicodeSegmentation {
     ///
     /// assert_eq!(&uw1[..], b);
     /// ```
-    fn unicode_words(&self) -> UnicodeWords;
+    fn unicode_words(&self) -> UnicodeWords<'_>;
 
     /// Returns an iterator over the words of `self`, separated on
     /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
@@ -162,7 +162,7 @@ pub trait UnicodeSegmentation {
     ///
     /// assert_eq!(&uwi1[..], b);
     /// ```
-    fn unicode_word_indices(&self) -> UnicodeWordIndices;
+    fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>;
 
     /// Returns an iterator over substrings of `self` separated on
     /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
@@ -178,7 +178,7 @@ pub trait UnicodeSegmentation {
     ///
     /// assert_eq!(&swu1[..], b);
     /// ```
-    fn split_word_bounds(&self) -> UWordBounds;
+    fn split_word_bounds(&self) -> UWordBounds<'_>;
 
     /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
     /// and their offsets. See `split_word_bounds()` for more information.
@@ -193,7 +193,7 @@ pub trait UnicodeSegmentation {
     ///
     /// assert_eq!(&swi1[..], b);
     /// ```
-    fn split_word_bound_indices(&self) -> UWordBoundIndices;
+    fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>;
 
     /// Returns an iterator over substrings of `self` separated on
     /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -215,7 +215,7 @@ pub trait UnicodeSegmentation {
     ///
     /// assert_eq!(&us1[..], b);
     /// ```
-    fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str>;
+    fn unicode_sentences(&self) -> UnicodeSentences<'_>;
 
     /// Returns an iterator over substrings of `self` separated on
     /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
@@ -253,7 +253,7 @@ pub trait UnicodeSegmentation {
 
 impl UnicodeSegmentation for str {
     #[inline]
-    fn graphemes(&self, is_extended: bool) -> Graphemes {
+    fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
         grapheme::new_graphemes(self, is_extended)
     }
 
@@ -263,32 +263,32 @@ impl UnicodeSegmentation for str {
     }
 
     #[inline]
-    fn unicode_words(&self) -> UnicodeWords {
+    fn unicode_words(&self) -> UnicodeWords<'_> {
         word::new_unicode_words(self)
     }
 
     #[inline]
-    fn unicode_word_indices(&self) -> UnicodeWordIndices {
+    fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
         word::new_unicode_word_indices(self)
     }
 
     #[inline]
-    fn split_word_bounds(&self) -> UWordBounds {
+    fn split_word_bounds(&self) -> UWordBounds<'_> {
         word::new_word_bounds(self)
     }
 
     #[inline]
-    fn split_word_bound_indices(&self) -> UWordBoundIndices {
+    fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
         word::new_word_bound_indices(self)
     }
 
     #[inline]
-    fn unicode_sentences(&self) -> impl Iterator<Item = &'_ str> {
+    fn unicode_sentences(&self) -> UnicodeSentences<'_> {
         sentence::new_unicode_sentences(self)
     }
 
     #[inline]
-    fn split_sentence_bounds(&self) -> USentenceBounds {
+    fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
         sentence::new_sentence_bounds(self)
     }
 
diff --git a/src/word.rs b/src/word.rs
@@ -11,7 +11,6 @@
 extern crate alloc;
 use alloc::boxed::Box;
 use core::cmp;
-use core::iter::Filter;
 
 use crate::tables::word::WordCat;
 
@@ -28,7 +27,7 @@ use crate::tables::word::WordCat;
 /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 pub struct UnicodeWords<'a> {
-    inner: Box<dyn Iterator<Item = &'a str> + 'a>,
+    inner: Box<dyn DoubleEndedIterator<Item = &'a str> + 'a>,
 }
 
 impl<'a> Iterator for UnicodeWords<'a> {
@@ -45,6 +44,13 @@ impl<'a> Iterator for UnicodeWords<'a> {
     }
 }
 
+impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
+    #[inline]
+    fn next_back(&mut self) -> Option<&'a str> {
+        self.inner.next_back()
+    }
+}
+
 /// An iterator over the substrings of a string which, after splitting the string on
 /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
 /// contain any characters with the
@@ -58,16 +64,15 @@ impl<'a> Iterator for UnicodeWords<'a> {
 ///
 /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
-#[derive(Debug)]
 pub struct UnicodeWordIndices<'a> {
     #[allow(clippy::type_complexity)]
-    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
+    inner: Box<dyn DoubleEndedIterator<Item = (usize, &'a str)> + 'a>,
 }
 
 impl<'a> Iterator for UnicodeWordIndices<'a> {
     type Item = (usize, &'a str);
 
-    #[inline]
+    #[inline(always)]
     fn next(&mut self) -> Option<(usize, &'a str)> {
         self.inner.next()
     }
@@ -722,12 +727,12 @@ impl<'a> AsciiWordBoundIter<'a> {
         AsciiWordBoundIter { rest: s, offset: 0 }
     }
 
-    #[inline(always)]
+    #[inline]
     fn is_core(b: u8) -> bool {
         b.is_ascii_alphanumeric() || b == b'_'
     }
 
-    #[inline(always)]
+    #[inline]
     fn is_infix(b: u8, prev: u8, next: u8) -> bool {
         match b {
             // numeric separators
@@ -744,6 +749,7 @@ impl<'a> AsciiWordBoundIter<'a> {
 impl<'a> Iterator for AsciiWordBoundIter<'a> {
     type Item = (usize, &'a str);
 
+    #[inline]
     fn next(&mut self) -> Option<Self::Item> {
         if self.rest.is_empty() {
             return None;
@@ -802,6 +808,66 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> {
     }
 }
 
+impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
+    fn next_back(&mut self) -> Option<(usize, &'a str)> {
+        let rest = self.rest;
+        if rest.is_empty() {
+            return None;
+        }
+        let bytes = rest.as_bytes();
+        let len = bytes.len();
+
+        // 1) Trailing spaces
+        if bytes[len - 1] == b' ' {
+            // find start of this last run of spaces
+            let mut start = len - 1;
+            while start > 0 && bytes[start - 1] == b' ' {
+                start -= 1;
+            }
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 2) Trailing core-run (letters/digits/underscore + infix)
+        if Self::is_core(bytes[len - 1]) {
+            // scan backwards while the preceding byte satisfies `is_core` or `is_infix`
+            let mut start = len - 1;
+            while start > 0 {
+                let b = bytes[start - 1];
+                let prev = if start >= 2 { bytes[start - 2] } else { b };
+                let next = bytes[start]; // the byte we just included
+                if Self::is_core(b) || Self::is_infix(b, prev, next) {
+                    start -= 1;
+                } else {
+                    break;
+                }
+            }
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 3) CR+LF at end
+        if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' {
+            let start = len - 2;
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 4) Single non-core byte
+        let start = len - 1;
+        let word = &rest[start..];
+        let pos = self.offset + start;
+        self.rest = &rest[..start];
+        Some((pos, word))
+    }
+}
+
 #[inline]
 pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
     UWordBounds {
@@ -832,20 +898,25 @@ fn has_alphanumeric(s: &&str) -> bool {
 }
 
 #[inline]
-fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+fn has_ascii_alphanumeric(s: &&str) -> bool {
+    s.chars().any(|c| c.is_ascii_alphanumeric())
+}
+
+#[inline]
+fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator<Item = &'a str> + 'a {
     new_ascii_word_bound_indices(s)
         .map(|(_, w)| w)
-        .filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric()))
+        .filter(has_ascii_alphanumeric)
 }
 
 #[inline]
-fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator<Item = &'a str> + 'a {
     new_word_bounds(s).filter(has_alphanumeric)
 }
 
 #[inline]
 pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
-    let iter: Box<dyn Iterator<Item = &str>> = if s.is_ascii() {
+    let iter: Box<dyn DoubleEndedIterator<Item = &str>> = if s.is_ascii() {
         Box::new(new_unicode_words_ascii(s))
     } else {
         Box::new(new_unicode_words_general(s))
@@ -855,14 +926,13 @@ pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
 }
 
 #[inline]
-pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
-    use super::UnicodeSegmentation;
-
-    UnicodeWordIndices {
-        inner: s
-            .split_word_bound_indices()
-            .filter(|(_, c)| has_alphanumeric(c)),
-    }
+pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> {
+    let iter: Box<dyn DoubleEndedIterator<Item = (usize, &str)>> = if s.is_ascii() {
+        Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w)))
+    } else {
+        Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w)))
+    };
+    UnicodeWordIndices { inner: iter }
 }
 
 #[cfg(test)]
@@ -921,5 +991,17 @@ mod tests {
 
             prop_assert_eq!(fast, uni);
         }
+
+        /// Fast path must equal general path for any ASCII input when iterated backwards.
+        #[test]
+        fn proptest_ascii_matches_unicode_word_indices_rev(
+            // Vec<char> → String, length 0‒99
+            s in proptest::collection::vec(ascii_char(), 0..100)
+                   .prop_map(|v| v.into_iter().collect::<String>())
+        ) {
+            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
+            let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
+            prop_assert_eq!(fast_rev, uni_rev);
+        }
     }
 }