From 592d99d3388ea0764bebb621346c7b292d2e464b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 12:12:22 +0800 Subject: [PATCH 01/13] add benchmark --- Cargo.toml | 6 ++++++ benches/chars.rs | 4 ++-- benches/unicode_word_indices.rs | 37 +++++++++++++++++++++++++++++++++ benches/word_bounds.rs | 2 +- benches/words.rs | 4 ++-- 5 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 benches/unicode_word_indices.rs diff --git a/Cargo.toml b/Cargo.toml index 404f053..1aac6ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only. [dev-dependencies] quickcheck = "0.7" criterion = "0.5" +proptest = "1.7.0" [[bench]] name = "chars" @@ -36,3 +37,8 @@ harness = false [[bench]] name = "word_bounds" harness = false + +[[bench]] +name = "unicode_word_indices" +harness = false + diff --git a/benches/chars.rs b/benches/chars.rs index bacffa1..2654a26 100644 --- a/benches/chars.rs +++ b/benches/chars.rs @@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } @@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("scalar", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| scalar(content)), ); } diff --git a/benches/unicode_word_indices.rs b/benches/unicode_word_indices.rs new file mode 100644 index 0000000..4c09404 --- /dev/null +++ b/benches/unicode_word_indices.rs @@ -0,0 +1,37 @@ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; + +use std::fs; +use unicode_segmentation::UnicodeSegmentation; + +const 
FILES: &[&str] = &[ + "log", //"arabic", + "english", + //"hindi", + "japanese", + //"korean", + //"mandarin", + //"russian", + //"source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for w in text.unicode_word_indices() { + black_box(w); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("unicode_word_indices"); + + for file in FILES { + let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(); + group.throughput(criterion::Throughput::Bytes(input.len() as u64)); + group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| { + b.iter(|| grapheme(content)) + }); + } +} + +criterion_group!(benches, bench_all); +criterion_main!(benches); diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs index 42d50ff..f1af7c4 100644 --- a/benches/word_bounds.rs +++ b/benches/word_bounds.rs @@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } diff --git a/benches/words.rs b/benches/words.rs index 86785d5..508bc9f 100644 --- a/benches/words.rs +++ b/benches/words.rs @@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } @@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("scalar", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| scalar(content)), ); } From 
eca90432d2943af1d5040d61b39e05a16780949e Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 13:08:47 +0800 Subject: [PATCH 02/13] add ascii fastpath --- src/lib.rs | 25 +++-- src/word.rs | 306 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 196 insertions(+), 135 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c8ec5b5..7672eb2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -56,11 +56,14 @@ )] #![no_std] +#[cfg(test)] +extern crate std; + pub use grapheme::{GraphemeCursor, GraphemeIncomplete}; pub use grapheme::{GraphemeIndices, Graphemes}; pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences}; pub use tables::UNICODE_VERSION; -pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords}; +pub use word::{UWordBoundIndices, UWordBounds}; mod grapheme; mod sentence; @@ -133,7 +136,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> UnicodeWords<'_>; + fn unicode_words(&self) -> impl Iterator; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -157,7 +160,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>; + fn unicode_word_indices(&self) -> impl Iterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -173,7 +176,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> UWordBounds<'_>; + fn split_word_bounds(&self) -> impl DoubleEndedIterator; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. 
@@ -188,7 +191,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>; + fn split_word_bound_indices(&self) -> impl DoubleEndedIterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -210,7 +213,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&us1[..], b); /// ``` - fn unicode_sentences(&self) -> UnicodeSentences<'_>; + fn unicode_sentences(&self) -> impl Iterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -258,27 +261,27 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> UnicodeWords { + fn unicode_words(&self) -> impl Iterator { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> UnicodeWordIndices { + fn unicode_word_indices(&self) -> impl Iterator { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> UWordBounds { + fn split_word_bounds(&self) -> impl DoubleEndedIterator { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> UWordBoundIndices { + fn split_word_bound_indices(&self) -> impl DoubleEndedIterator { word::new_word_bound_indices(self) } #[inline] - fn unicode_sentences(&self) -> UnicodeSentences { + fn unicode_sentences(&self) -> impl Iterator { sentence::new_unicode_sentences(self) } diff --git a/src/word.rs b/src/word.rs index b2a85ae..964cdc0 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,85 +9,11 @@ // except according to those terms. 
use core::cmp; -use core::iter::Filter; -use crate::tables::word::WordCat; - -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// -/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] -pub struct UnicodeWords<'a> { - inner: Filter, fn(&&str) -> bool>, -} - -impl<'a> Iterator for UnicodeWords<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.inner.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} -impl<'a> DoubleEndedIterator for UnicodeWords<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() - } -} - -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// This iterator also provides the byte offsets for each substring. -/// -/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. 
-/// -/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] -pub struct UnicodeWordIndices<'a> { - #[allow(clippy::type_complexity)] - inner: Filter, fn(&(usize, &str)) -> bool>, -} +extern crate alloc; +use alloc::boxed::Box; -impl<'a> Iterator for UnicodeWordIndices<'a> { - type Item = (usize, &'a str); - - #[inline] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.inner.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} -impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { - #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> { - self.inner.next_back() - } -} +use crate::tables::word::WordCat; /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -117,24 +43,6 @@ pub struct UWordBoundIndices<'a> { iter: UWordBounds<'a>, } -impl<'a> UWordBoundIndices<'a> { - #[inline] - /// View the underlying data (the part yet to be iterated) as a slice of the original string. - /// - /// ```rust - /// # use unicode_segmentation::UnicodeSegmentation; - /// let mut iter = "Hello world".split_word_bound_indices(); - /// assert_eq!(iter.as_str(), "Hello world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), " world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), "world"); - /// ``` - pub fn as_str(&self) -> &'a str { - self.iter.as_str() - } -} - impl<'a> Iterator for UWordBoundIndices<'a> { type Item = (usize, &'a str); @@ -677,22 +585,6 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> { } impl<'a> UWordBounds<'a> { - #[inline] - /// View the underlying data (the part yet to be iterated) as a slice of the original string. 
- /// - /// ```rust - /// # use unicode_segmentation::UnicodeSegmentation; - /// let mut iter = "Hello world".split_word_bounds(); - /// assert_eq!(iter.as_str(), "Hello world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), " world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), "world"); - /// ``` - pub fn as_str(&self) -> &'a str { - self.string - } - #[inline] fn get_next_cat(&self, idx: usize) -> Option { use crate::tables::word as wd; @@ -736,33 +628,161 @@ pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { #[inline] fn has_alphanumeric(s: &&str) -> bool { - use crate::tables::util::is_alphanumeric; - - s.chars().any(is_alphanumeric) + s.chars().any(|c| c.is_alphanumeric()) } #[inline] -pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - use super::UnicodeSegmentation; +fn has_ascii_alphanumeric(s: &&str) -> bool { + s.bytes().any(|b| b.is_ascii_alphanumeric()) +} + +/// Fast-path for ASCII-only word segmentation, matching `unicode-segmentation` on pure ASCII: +/// • runs of ASCII spaces are grouped (`" "`) +/// • core-runs (letters, digits, underscore + infix) +/// • any other ASCII char emits as one token, except CR+LF emits as a single two-char token +pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator + 'a { + #[inline(always)] + fn is_core(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' + } + #[inline(always)] + fn is_infix(b: u8, prev: u8, next: u8) -> bool { + match b { + // numeric separators + b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true, + // apostrophe in contractions + b'\'' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + // dot/colon inside letters + b'.' 
| b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + _ => false, + } + } + + use core::iter::from_fn; + let mut rest = s; + let mut offset = 0; + + from_fn(move || { + if rest.is_empty() { + return None; + } + let bytes = rest.as_bytes(); + let len = bytes.len(); + + // 1) Group runs of spaces + if bytes[0] == b' ' { + let mut i = 1; + while i < len && bytes[i] == b' ' { + i += 1; + } + let word = &rest[..i]; + let pos = offset; + rest = &rest[i..]; + offset += i; + return Some((pos, word)); + } - UnicodeWords { - inner: s.split_word_bounds().filter(has_alphanumeric), + // 2) Core-run (letters/digits/underscore + infix) + if is_core(bytes[0]) { + let mut i = 1; + while i < len { + let b = bytes[i]; + if is_core(b) || (i + 1 < len && is_infix(b, bytes[i - 1], bytes[i + 1])) { + i += 1; + } else { + break; + } + } + let word = &rest[..i]; + let pos = offset; + rest = &rest[i..]; + offset += i; + return Some((pos, word)); + } + + // 3) Non-core: CR+LF as one token, otherwise single char + if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' { + let word = &rest[..2]; + let pos = offset; + rest = &rest[2..]; + offset += 2; + Some((pos, word)) + } else { + // emit exactly one byte (whitespace/control/punct) + let word = &rest[..1]; + let pos = offset; + rest = &rest[1..]; + offset += 1; + Some((pos, word)) + } + }) +} +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// +/// This method is accessed by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. 
+/// +/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[inline] +pub(crate) fn new_unicode_words(s: &str) -> Box + '_> { + if s.is_ascii() { + Box::new(new_unicode_words_ascii(s)) + } else { + Box::new(new_unicode_words_general(s)) } } #[inline] -pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { - use super::UnicodeSegmentation; +fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { + new_ascii_word_bound_indices(s) + .map(|(_, w)| w) + .filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric())) +} + +#[inline] +fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + 'a { + new_word_bounds(s).filter(has_alphanumeric) +} - UnicodeWordIndices { - inner: s - .split_word_bound_indices() - .filter(|(_, c)| has_alphanumeric(c)), +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// This iterator also provides the byte offsets for each substring. +/// +/// This method is accessed by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. 
+/// +/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[inline] +pub fn new_unicode_word_indices<'a>(s: &'a str) -> Box + 'a> { + if s.is_ascii() { + Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) + } else { + Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) } } #[cfg(test)] mod tests { + use crate::word::{ + new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices, + }; + use std::string::String; + use std::vec::Vec; + use std::{format, vec}; + + use proptest::prelude::*; + #[test] fn test_syriac_abbr_mark() { use crate::tables::word as wd; @@ -776,4 +796,42 @@ mod tests { let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); } + + #[test] + fn test_ascii_word_indices_various_cases() { + let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com"; + let words: Vec<&str> = new_unicode_words_ascii(s).collect(); + let expected = vec![ + ("Hello"), // simple letters + ("world"), // skip comma+space, stop at '!' + ("can't"), // apostrophe joins letters + ("e.g"), + ("var1"), + ("123,456"), // digits+comma+digits + ("foo_bar"), + ("example.com"), + ]; + assert_eq!(words, expected); + } + + /// Strategy that yields every code-point from NUL (0) to DEL (127). + fn ascii_char() -> impl Strategy { + (0u8..=127).prop_map(|b| b as char) + } + + proptest! { + #![proptest_config(ProptestConfig::with_cases(10000))] + /// Fast path must equal general path for any ASCII input. 
+ #[test] + fn proptest_ascii_matches_unicode_word_indices( + // Vec → String, length 0‒99 + s in proptest::collection::vec(ascii_char(), 0..100) + .prop_map(|v| v.into_iter().collect::()) + ) { + let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect(); + let uni: Vec<(usize, &str)> = new_word_bound_indices(&s).collect(); + + prop_assert_eq!(fast, uni); + } + } } From b5ed407d84bdac57ffae8f968973bf33496a0326 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 14:27:30 +0800 Subject: [PATCH 03/13] add test case IP --- src/word.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/word.rs b/src/word.rs index 964cdc0..c690eb6 100644 --- a/src/word.rs +++ b/src/word.rs @@ -799,7 +799,7 @@ mod tests { #[test] fn test_ascii_word_indices_various_cases() { - let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com"; + let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090"; let words: Vec<&str> = new_unicode_words_ascii(s).collect(); let expected = vec![ ("Hello"), // simple letters @@ -810,6 +810,8 @@ mod tests { ("123,456"), // digits+comma+digits ("foo_bar"), ("example.com"), + ("127.0.0.1"), + ("9090"), // port number ]; assert_eq!(words, expected); } From 9b1b7f998451d87c0fb8481c756778a4961f122c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Sun, 13 Jul 2025 18:30:30 +0800 Subject: [PATCH 04/13] add log to benches --- benches/texts/log.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 benches/texts/log.txt diff --git a/benches/texts/log.txt b/benches/texts/log.txt new file mode 100644 index 0000000..e18ca32 --- /dev/null +++ b/benches/texts/log.txt @@ -0,0 +1 @@ +2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later From 6f96a23b0234a078e05028563198d8c04c48b2e5 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 19:55:08 +0800 
Subject: [PATCH 05/13] restore iterators --- src/lib.rs | 18 ++-- src/word.rs | 280 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 193 insertions(+), 105 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 7672eb2..1dbdd73 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,8 @@ pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences}; pub use tables::UNICODE_VERSION; pub use word::{UWordBoundIndices, UWordBounds}; +use crate::word::{UnicodeWordIndices, UnicodeWords}; + mod grapheme; mod sentence; #[rustfmt::skip] @@ -136,7 +138,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> impl Iterator; + fn unicode_words(&self) -> UnicodeWords; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -160,7 +162,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> impl Iterator; + fn unicode_word_indices(&self) -> UnicodeWordIndices; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -176,7 +178,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> impl DoubleEndedIterator; + fn split_word_bounds(&self) -> UWordBounds; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. @@ -191,7 +193,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> impl DoubleEndedIterator; + fn split_word_bound_indices(&self) -> UWordBoundIndices; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). 
@@ -261,22 +263,22 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> impl Iterator { + fn unicode_words(&self) -> UnicodeWords { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> impl Iterator { + fn unicode_word_indices(&self) -> UnicodeWordIndices { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> impl DoubleEndedIterator { + fn split_word_bounds(&self) -> UWordBounds { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> impl DoubleEndedIterator { + fn split_word_bound_indices(&self) -> UWordBoundIndices { word::new_word_bound_indices(self) } diff --git a/src/word.rs b/src/word.rs index c690eb6..1d1d69c 100644 --- a/src/word.rs +++ b/src/word.rs @@ -8,13 +8,82 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use core::cmp; - extern crate alloc; use alloc::boxed::Box; +use core::cmp; +use core::iter::Filter; use crate::tables::word::WordCat; +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// +/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. 
+/// +/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +pub struct UnicodeWords<'a> { + inner: Box + 'a>, +} + +impl<'a> Iterator for UnicodeWords<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// This iterator also provides the byte offsets for each substring. +/// +/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. +/// +/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] +pub struct UnicodeWordIndices<'a> { + #[allow(clippy::type_complexity)] + inner: Filter, fn(&(usize, &str)) -> bool>, +} + +impl<'a> Iterator for UnicodeWordIndices<'a> { + type Item = (usize, &'a str); + + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} +impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> { + self.inner.next_back() + } +} + /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). 
/// @@ -43,6 +112,24 @@ pub struct UWordBoundIndices<'a> { iter: UWordBounds<'a>, } +impl<'a> UWordBoundIndices<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "Hello world".split_word_bound_indices(); + /// assert_eq!(iter.as_str(), "Hello world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), " world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "world"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.iter.as_str() + } +} + impl<'a> Iterator for UWordBoundIndices<'a> { type Item = (usize, &'a str); @@ -585,6 +672,22 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> { } impl<'a> UWordBounds<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "Hello world".split_word_bounds(); + /// assert_eq!(iter.as_str(), "Hello world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), " world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "world"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.string + } + #[inline] fn get_next_cat(&self, idx: usize) -> Option { use crate::tables::word as wd; @@ -609,42 +712,21 @@ impl<'a> UWordBounds<'a> { } } -#[inline] -pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { - UWordBounds { - string: s, - cat: None, - catb: None, - } +pub struct AsciiWordBoundIter<'a> { + rest: &'a str, + offset: usize, } -#[inline] -pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { - UWordBoundIndices { - start_offset: s.as_ptr() as usize, - iter: new_word_bounds(s), +impl<'a> AsciiWordBoundIter<'a> { + pub fn new(s: &'a str) -> Self { + AsciiWordBoundIter { rest: s, offset: 0 } } -} - -#[inline] -fn has_alphanumeric(s: &&str) -> bool { - s.chars().any(|c| c.is_alphanumeric()) -} -#[inline] -fn 
has_ascii_alphanumeric(s: &&str) -> bool { - s.bytes().any(|b| b.is_ascii_alphanumeric()) -} - -/// Fast-path for ASCII-only word segmentation, matching `unicode-segmentation` on pure ASCII: -/// • runs of ASCII spaces are grouped (`" "`) -/// • core-runs (letters, digits, underscore + infix) -/// • any other ASCII char emits as one token, except CR+LF emits as a single two-char token -pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator + 'a { #[inline(always)] fn is_core(b: u8) -> bool { b.is_ascii_alphanumeric() || b == b'_' } + #[inline(always)] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { @@ -657,16 +739,17 @@ pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator false, } } +} - use core::iter::from_fn; - let mut rest = s; - let mut offset = 0; +impl<'a> Iterator for AsciiWordBoundIter<'a> { + type Item = (usize, &'a str); - from_fn(move || { - if rest.is_empty() { + fn next(&mut self) -> Option { + if self.rest.is_empty() { return None; } - let bytes = rest.as_bytes(); + + let bytes = self.rest.as_bytes(); let len = bytes.len(); // 1) Group runs of spaces @@ -675,69 +758,79 @@ pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator= 2 && bytes[1] == b'\n' { - let word = &rest[..2]; - let pos = offset; - rest = &rest[2..]; - offset += 2; + let word = &self.rest[..2]; + let pos = self.offset; + self.rest = &self.rest[2..]; + self.offset += 2; Some((pos, word)) } else { - // emit exactly one byte (whitespace/control/punct) - let word = &rest[..1]; - let pos = offset; - rest = &rest[1..]; - offset += 1; + let word = &self.rest[..1]; + let pos = self.offset; + self.rest = &self.rest[1..]; + self.offset += 1; Some((pos, word)) } - }) + } } -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// 
property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// -/// This method is accessed by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html + #[inline] -pub(crate) fn new_unicode_words(s: &str) -> Box + '_> { - if s.is_ascii() { - Box::new(new_unicode_words_ascii(s)) - } else { - Box::new(new_unicode_words_general(s)) +pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { + UWordBounds { + string: s, + cat: None, + catb: None, + } +} + +#[inline] +pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { + UWordBoundIndices { + start_offset: s.as_ptr() as usize, + iter: new_word_bounds(s), } } +#[inline] +pub fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { + AsciiWordBoundIter::new(s) +} + +#[inline] +fn has_alphanumeric(s: &&str) -> bool { + use crate::tables::util::is_alphanumeric; + + s.chars().any(is_alphanumeric) +} + #[inline] fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { new_ascii_word_bound_indices(s) @@ -750,25 +843,25 @@ fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + new_word_bounds(s).filter(has_alphanumeric) } -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// This iterator also provides the byte offsets for each substring. -/// -/// This method is accessed by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. 
-/// -/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html #[inline] -pub fn new_unicode_word_indices<'a>(s: &'a str) -> Box + 'a> { - if s.is_ascii() { - Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) +pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { + let iter: Box> = if s.is_ascii() { + Box::new(new_unicode_words_ascii(s)) } else { - Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) + Box::new(new_unicode_words_general(s)) + }; + + UnicodeWords { inner: iter } +} + +#[inline] +pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { + use super::UnicodeSegmentation; + + UnicodeWordIndices { + inner: s + .split_word_bound_indices() + .filter(|(_, c)| has_alphanumeric(c)), } } @@ -785,13 +878,6 @@ mod tests { #[test] fn test_syriac_abbr_mark() { - use crate::tables::word as wd; - let (_, _, cat) = wd::word_category('\u{70f}'); - assert_eq!(cat, wd::WC_ALetter); - } - - #[test] - fn test_end_of_ayah_cat() { use crate::tables::word as wd; let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); From 7beb8a60b73dbeecd42d16bb8f32cdfb0da2a9e3 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 20:15:55 +0800 Subject: [PATCH 06/13] add backwards iterator --- src/lib.rs | 24 +++++------ src/word.rs | 120 +++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 113 insertions(+), 31 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1dbdd73..d15ac0b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -138,7 +138,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> UnicodeWords; + fn unicode_words(&self) -> UnicodeWords<'_>; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -162,7 +162,7 
@@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> UnicodeWordIndices; + fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -178,7 +178,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> UWordBounds; + fn split_word_bounds(&self) -> UWordBounds<'_>; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. @@ -193,7 +193,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> UWordBoundIndices; + fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -215,7 +215,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&us1[..], b); /// ``` - fn unicode_sentences(&self) -> impl Iterator; + fn unicode_sentences(&self) -> UnicodeSentences<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). 
@@ -253,7 +253,7 @@ pub trait UnicodeSegmentation { impl UnicodeSegmentation for str { #[inline] - fn graphemes(&self, is_extended: bool) -> Graphemes { + fn graphemes(&self, is_extended: bool) -> Graphemes<'_> { grapheme::new_graphemes(self, is_extended) } @@ -263,32 +263,32 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> UnicodeWords { + fn unicode_words(&self) -> UnicodeWords<'_> { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> UnicodeWordIndices { + fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> UWordBounds { + fn split_word_bounds(&self) -> UWordBounds<'_> { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> UWordBoundIndices { + fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> { word::new_word_bound_indices(self) } #[inline] - fn unicode_sentences(&self) -> impl Iterator { + fn unicode_sentences(&self) -> UnicodeSentences<'_> { sentence::new_unicode_sentences(self) } #[inline] - fn split_sentence_bounds(&self) -> USentenceBounds { + fn split_sentence_bounds(&self) -> USentenceBounds<'_> { sentence::new_sentence_bounds(self) } diff --git a/src/word.rs b/src/word.rs index 1d1d69c..c835445 100644 --- a/src/word.rs +++ b/src/word.rs @@ -11,7 +11,6 @@ extern crate alloc; use alloc::boxed::Box; use core::cmp; -use core::iter::Filter; use crate::tables::word::WordCat; @@ -28,7 +27,7 @@ use crate::tables::word::WordCat; /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWords<'a> { - inner: Box + 'a>, + inner: Box + 'a>, } impl<'a> Iterator for UnicodeWords<'a> { @@ -45,6 +44,13 @@ impl<'a> Iterator for UnicodeWords<'a> { } } +impl<'a> DoubleEndedIterator for UnicodeWords<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} + /// 
An iterator over the substrings of a string which, after splitting the string on /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), /// contain any characters with the @@ -58,16 +64,15 @@ impl<'a> Iterator for UnicodeWords<'a> { /// /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] pub struct UnicodeWordIndices<'a> { #[allow(clippy::type_complexity)] - inner: Filter, fn(&(usize, &str)) -> bool>, + inner: Box + 'a>, } impl<'a> Iterator for UnicodeWordIndices<'a> { type Item = (usize, &'a str); - #[inline] + #[inline(always)] fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() } @@ -722,12 +727,12 @@ impl<'a> AsciiWordBoundIter<'a> { AsciiWordBoundIter { rest: s, offset: 0 } } - #[inline(always)] + #[inline] fn is_core(b: u8) -> bool { b.is_ascii_alphanumeric() || b == b'_' } - #[inline(always)] + #[inline] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { // numeric separators @@ -744,6 +749,7 @@ impl<'a> AsciiWordBoundIter<'a> { impl<'a> Iterator for AsciiWordBoundIter<'a> { type Item = (usize, &'a str); + #[inline] fn next(&mut self) -> Option { if self.rest.is_empty() { return None; @@ -802,6 +808,66 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { } } +impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { + fn next_back(&mut self) -> Option<(usize, &'a str)> { + let rest = self.rest; + if rest.is_empty() { + return None; + } + let bytes = rest.as_bytes(); + let len = bytes.len(); + + // 1) Trailing spaces + if bytes[len - 1] == b' ' { + // find start of this last run of spaces + let mut start = len - 1; + while start > 0 && bytes[start - 1] == b' ' { + start -= 1; + } + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 2) Trailing core-run (letters/digits/underscore + infix) + if Self::is_core(bytes[len - 1]) { + 
// scan backwards as long as we see `is_core` or an `is_infix` + let mut start = len - 1; + while start > 0 { + let b = bytes[start - 1]; + let prev = if start >= 2 { bytes[start - 2] } else { b }; + let next = bytes[start]; // the byte we just included + if Self::is_core(b) || Self::is_infix(b, prev, next) { + start -= 1; + } else { + break; + } + } + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 3) CR+LF at end + if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' { + let start = len - 2; + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 4) Single non-core byte + let start = len - 1; + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + Some((pos, word)) + } +} + #[inline] pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { UWordBounds { @@ -832,20 +898,25 @@ fn has_alphanumeric(s: &&str) -> bool { } #[inline] -fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { +fn has_ascii_alphanumeric(s: &&str) -> bool { + s.chars().any(|c| c.is_ascii_alphanumeric()) +} + +#[inline] +fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { new_ascii_word_bound_indices(s) .map(|(_, w)| w) - .filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric())) + .filter(has_ascii_alphanumeric) } #[inline] -fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + 'a { +fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { new_word_bounds(s).filter(has_alphanumeric) } #[inline] pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - let iter: Box> = if s.is_ascii() { + let iter: Box> = if s.is_ascii() { Box::new(new_unicode_words_ascii(s)) } else { Box::new(new_unicode_words_general(s)) @@ -855,14 +926,13 @@ pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { } #[inline] -pub fn 
new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { - use super::UnicodeSegmentation; - - UnicodeWordIndices { - inner: s - .split_word_bound_indices() - .filter(|(_, c)| has_alphanumeric(c)), - } +pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> { + let iter: Box> = if s.is_ascii() { + Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) + } else { + Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) + }; + UnicodeWordIndices { inner: iter } } #[cfg(test)] @@ -921,5 +991,17 @@ mod tests { prop_assert_eq!(fast, uni); } + + /// Fast path must equal general path for any ASCII input, forwards and backwards. + #[test] + fn proptest_ascii_matches_unicode_word_indices_rev( + // Vec → String, length 0‒99 + s in proptest::collection::vec(ascii_char(), 0..100) + .prop_map(|v| v.into_iter().collect::()) + ) { + let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect(); + let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect(); + prop_assert_eq!(fast_rev, uni_rev); + } } } From a3881da71bb3b4747713d149ef61444b91a43faf Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 20:40:35 +0800 Subject: [PATCH 07/13] restore test --- src/word.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/word.rs b/src/word.rs index c835445..b6e042b 100644 --- a/src/word.rs +++ b/src/word.rs @@ -948,6 +948,13 @@ mod tests { #[test] fn test_syriac_abbr_mark() { + use crate::tables::word as wd; + let (_, _, cat) = wd::word_category('\u{70f}'); + assert_eq!(cat, wd::WC_ALetter); + } + + #[test] + fn test_end_of_ayah_cat() { use crate::tables::word as wd; let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); From 7599d624aa21b1bdd8bda7b1d12dfa37a2a690ef Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 16 Jul 2025 20:15:20 +0800 Subject: [PATCH 08/13] replace Box with Enum --- src/word.rs | 135 
+++++++++++++++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/src/word.rs b/src/word.rs index b6e042b..f4bd9e1 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,7 +9,6 @@ // except according to those terms. extern crate alloc; -use alloc::boxed::Box; use core::cmp; use crate::tables::word::WordCat; @@ -27,27 +26,33 @@ use crate::tables::word::WordCat; /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWords<'a> { - inner: Box + 'a>, + inner: WordsIter<'a>, } impl<'a> Iterator for UnicodeWords<'a> { type Item = &'a str; - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.inner.next() + fn next(&mut self) -> Option { + match &mut self.inner { + WordsIter::Ascii(i) => i.next(), + WordsIter::Unicode(i) => i.next(), + } } - #[inline] fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() + match &self.inner { + WordsIter::Ascii(i) => i.size_hint(), + WordsIter::Unicode(i) => i.size_hint(), + } } } - impl<'a> DoubleEndedIterator for UnicodeWords<'a> { #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() + fn next_back(&mut self) -> Option { + match &mut self.inner { + WordsIter::Ascii(i) => i.next_back(), + WordsIter::Unicode(i) => i.next_back(), + } } } @@ -65,27 +70,33 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWordIndices<'a> { - #[allow(clippy::type_complexity)] - inner: Box + 'a>, + inner: IndicesIter<'a>, } impl<'a> Iterator for UnicodeWordIndices<'a> { type Item = (usize, &'a str); - - #[inline(always)] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.inner.next() + #[inline] + fn next(&mut self) -> Option { + match &mut self.inner { + IndicesIter::Ascii(i) => i.next(), + 
IndicesIter::Unicode(i) => i.next(), + } } - #[inline] fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() + match &self.inner { + IndicesIter::Ascii(i) => i.size_hint(), + IndicesIter::Unicode(i) => i.size_hint(), + } } } impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> { - self.inner.next_back() + fn next_back(&mut self) -> Option { + match &mut self.inner { + IndicesIter::Ascii(i) => i.next_back(), + IndicesIter::Unicode(i) => i.next_back(), + } } } @@ -868,6 +879,58 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { } } +#[inline] +fn ascii_word_ok(t: &(usize, &str)) -> bool { + has_ascii_alphanumeric(&t.1) +} +#[inline] +fn unicode_word_ok(t: &(usize, &str)) -> bool { + has_alphanumeric(&t.1) +} + +type AsciiWordsIter<'a> = core::iter::Filter< + core::iter::Map, fn((usize, &'a str)) -> &'a str>, + fn(&&'a str) -> bool, +>; + +type UnicodeWordsIter<'a> = core::iter::Filter, fn(&&'a str) -> bool>; + +type AsciiIndicesIter<'a> = + core::iter::Filter, fn(&(usize, &'a str)) -> bool>; + +type UnicodeIndicesIter<'a> = + core::iter::Filter, fn(&(usize, &'a str)) -> bool>; + +enum WordsIter<'a> { + Ascii(AsciiWordsIter<'a>), + Unicode(UnicodeWordsIter<'a>), +} + +enum IndicesIter<'a> { + Ascii(AsciiIndicesIter<'a>), + Unicode(UnicodeIndicesIter<'a>), +} + +#[inline] +pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { + let inner = if s.is_ascii() { + WordsIter::Ascii(new_unicode_words_ascii(s)) + } else { + WordsIter::Unicode(new_unicode_words_general(s)) + }; + UnicodeWords { inner } +} + +#[inline] +pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { + let inner = if s.is_ascii() { + IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok)) + } else { + IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok)) + }; + UnicodeWordIndices { inner } +} + #[inline] pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { 
UWordBounds { @@ -902,39 +965,23 @@ fn has_ascii_alphanumeric(s: &&str) -> bool { s.chars().any(|c| c.is_ascii_alphanumeric()) } +#[inline(always)] +fn strip_pos((_, w): (usize, &str)) -> &str { + w +} + #[inline] -fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { +fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> { new_ascii_word_bound_indices(s) - .map(|(_, w)| w) + .map(strip_pos as fn(_) -> _) .filter(has_ascii_alphanumeric) } #[inline] -fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { +fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> { new_word_bounds(s).filter(has_alphanumeric) } -#[inline] -pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - let iter: Box> = if s.is_ascii() { - Box::new(new_unicode_words_ascii(s)) - } else { - Box::new(new_unicode_words_general(s)) - }; - - UnicodeWords { inner: iter } -} - -#[inline] -pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> { - let iter: Box> = if s.is_ascii() { - Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) - } else { - Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) - }; - UnicodeWordIndices { inner: iter } -} - #[cfg(test)] mod tests { use crate::word::{ From e29c432017fbb58b05b8703eba0766e9caa5b914 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:04:37 +0800 Subject: [PATCH 09/13] add comments with reference to the spec --- src/word.rs | 61 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/src/word.rs b/src/word.rs index f4bd9e1..b2d1c37 100644 --- a/src/word.rs +++ b/src/word.rs @@ -728,6 +728,31 @@ impl<'a> UWordBounds<'a> { } } +/// ASCII‑fast‑path word‑boundary iterator for strings that contain only ASCII characters. 
+/// +/// Since we handle only ASCII characters, we can use a much simpler set of +/// word break values than the full Unicode algorithm. +/// https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values +/// +/// | Word_Break value | ASCII code points that belong to it | +/// | -----------------| --------------------------------------------------------------- | +/// | CR | U+000D (CR) | +/// | LF | U+000A (LF) | +/// | Newline | U+000B (VT), U+000C (FF) | +/// | Single_Quote | U+0027 (') | +/// | Double_Quote | U+0022 (") | +/// | MidNumLet | U+002E (.) FULL STOP | +/// | MidLetter | U+003A (:) COLON | +/// | MidNum | U+002C (,), U+003B (;) | +/// | Numeric | U+0030 – U+0039 (0 … 9) | +/// | ALetter | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z) | +/// | ExtendNumLet | U+005F (_) underscore | +/// | WSegSpace | U+0020 (SPACE) | +/// +/// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (') +/// AHLetter is the same as ALetter, so we don't need to distinguish it. +/// +/// Any other single ASCII byte is its own boundary (the default WB999). pub struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, @@ -746,12 +771,17 @@ impl<'a> AsciiWordBoundIter<'a> { #[inline] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { - // numeric separators + // Numeric separators such as "1,000" or "3.14" (WB11/WB12) + // + // "Numeric (MidNum | MidNumLetQ) Numeric" b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true, - // apostrophe in contractions - b'\'' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, - // dot/colon inside letters - b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + + // Dot or colon inside an alphabetic word ("e.g.", "http://") (WB6/WB7) + // + // "(MidLetter | MidNumLetQ) AHLetter (MidLetter | MidNumLetQ)" + // MidLetter = b':' + // MidNumLetQ = b'.' | b'\'' + b'\'' | b'.' 
| b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, _ => false, } } @@ -769,7 +799,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { let bytes = self.rest.as_bytes(); let len = bytes.len(); - // 1) Group runs of spaces + // 1) Keep horizontal whitespace together. + // Spec: WB3d joins adjacent *WSegSpace* into a single segment. if bytes[0] == b' ' { let mut i = 1; while i < len && bytes[i] == b' ' { @@ -783,6 +814,7 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { } // 2) Core-run (letters/digits/underscore + infix) + // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b) if Self::is_core(bytes[0]) { let mut i = 1; while i < len { @@ -802,7 +834,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 3) Non-core: CR+LF as one token, otherwise single char + // 3) Do not break within CRLF. + // Spec: WB3 treats CR+LF as a single non‑breaking pair. if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' { let word = &self.rest[..2]; let pos = self.offset; @@ -810,6 +843,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { self.offset += 2; Some((pos, word)) } else { + // 4) Otherwise, break everywhere + // Spec: the catch‑all rule WB999. let word = &self.rest[..1]; let pos = self.offset; self.rest = &self.rest[1..]; @@ -828,7 +863,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { let bytes = rest.as_bytes(); let len = bytes.len(); - // 1) Trailing spaces + // 1) Group runs of spaces + // Spec: WB3d joins adjacent *WSegSpace* into a single segment. if bytes[len - 1] == b' ' { // find start of this last run of spaces let mut start = len - 1; @@ -841,7 +877,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 2) Trailing core-run (letters/digits/underscore + infix) + // 2) Trailing Core-run (letters/digits/underscore + infix) + // Spec: ALetter × ALetter, Numeric × Numeric etc. 
(WB5–WB13b) if Self::is_core(bytes[len - 1]) { // scan backwards as long as we see `is_core` or an `is_infix` let mut start = len - 1; @@ -861,7 +898,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 3) CR+LF at end + // 3) Non-core: CR+LF as one token, otherwise single char + // Spec: WB3 treats CR+LF as a single non‑breaking pair. if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' { let start = len - 2; let word = &rest[start..]; @@ -870,7 +908,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 4) Single non-core byte + // 4) Fallback – every other byte is its own segment + // Spec: the catch‑all rule WB999. let start = len - 1; let word = &rest[start..]; let pos = self.offset + start; From 5a09f28848d33e3a960f68637ff2743cd085a3dd Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:18:59 +0800 Subject: [PATCH 10/13] remove unused alloc --- src/word.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/word.rs b/src/word.rs index b2d1c37..aa6cbcd 100644 --- a/src/word.rs +++ b/src/word.rs @@ -8,7 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-extern crate alloc; use core::cmp; use crate::tables::word::WordCat; From f76a997b57d19b829a9427e7f4d14f810dbfc3f8 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:21:09 +0800 Subject: [PATCH 11/13] readd Debug derive --- src/word.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/word.rs b/src/word.rs index aa6cbcd..4077e9f 100644 --- a/src/word.rs +++ b/src/word.rs @@ -24,6 +24,7 @@ use crate::tables::word::WordCat; /// /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] pub struct UnicodeWords<'a> { inner: WordsIter<'a>, } @@ -68,6 +69,7 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { /// /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] pub struct UnicodeWordIndices<'a> { inner: IndicesIter<'a>, } @@ -752,6 +754,7 @@ impl<'a> UWordBounds<'a> { /// AHLetter is the same as ALetter, so we don't need to distinguish it. /// /// Any other single ASCII byte is its own boundary (the default WB999). +#[derive(Debug)] pub struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, @@ -939,11 +942,13 @@ type AsciiIndicesIter<'a> = type UnicodeIndicesIter<'a> = core::iter::Filter, fn(&(usize, &'a str)) -> bool>; +#[derive(Debug)] enum WordsIter<'a> { Ascii(AsciiWordsIter<'a>), Unicode(UnicodeWordsIter<'a>), } +#[derive(Debug)] enum IndicesIter<'a> { Ascii(AsciiIndicesIter<'a>), Unicode(UnicodeIndicesIter<'a>), From b556333ca894f4a547d5b6fb1dca0ef9991ec973 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:22:29 +0800 Subject: [PATCH 12/13] use import --- src/word.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/word.rs b/src/word.rs index 4077e9f..fdd128b 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,6 +9,7 @@ // except according to those terms. 
use core::cmp; +use core::iter::Filter; use crate::tables::word::WordCat; @@ -929,18 +930,13 @@ fn unicode_word_ok(t: &(usize, &str)) -> bool { has_alphanumeric(&t.1) } -type AsciiWordsIter<'a> = core::iter::Filter< +type AsciiWordsIter<'a> = Filter< core::iter::Map, fn((usize, &'a str)) -> &'a str>, fn(&&'a str) -> bool, >; - -type UnicodeWordsIter<'a> = core::iter::Filter, fn(&&'a str) -> bool>; - -type AsciiIndicesIter<'a> = - core::iter::Filter, fn(&(usize, &'a str)) -> bool>; - -type UnicodeIndicesIter<'a> = - core::iter::Filter, fn(&(usize, &'a str)) -> bool>; +type UnicodeWordsIter<'a> = Filter, fn(&&'a str) -> bool>; +type AsciiIndicesIter<'a> = Filter, fn(&(usize, &'a str)) -> bool>; +type UnicodeIndicesIter<'a> = Filter, fn(&(usize, &'a str)) -> bool>; #[derive(Debug)] enum WordsIter<'a> { From 0e7674a40541b4baadfe59c65a28de68dcb8db40 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 18 Jul 2025 16:08:46 +0800 Subject: [PATCH 13/13] remove pub --- src/word.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/word.rs b/src/word.rs index fdd128b..1a46b39 100644 --- a/src/word.rs +++ b/src/word.rs @@ -756,7 +756,7 @@ impl<'a> UWordBounds<'a> { /// /// Any other single ASCII byte is its own boundary (the default WB999). 
#[derive(Debug)] -pub struct AsciiWordBoundIter<'a> { +struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, } @@ -988,7 +988,7 @@ pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { } #[inline] -pub fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { +fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { AsciiWordBoundIter::new(s) } @@ -1046,6 +1046,20 @@ mod tests { assert_eq!(cat, wd::WC_Numeric); } + #[test] + fn test_ascii_word_bound_indices_various_cases() { + let s = "Hello, world!"; + let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect(); + let expected = vec![ + (0, "Hello"), // simple letters + (5, ","), + (6, " "), // space after comma + (7, "world"), // skip comma+space, stop at '!' + (12, "!"), // punctuation at the end + ]; + assert_eq!(words, expected); + } + #[test] fn test_ascii_word_indices_various_cases() { let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090"; pFad - Phonifier reborn