From 592d99d3388ea0764bebb621346c7b292d2e464b Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 12:12:22 +0800 Subject: [PATCH 01/13] add benchmark --- Cargo.toml | 6 ++++++ benches/chars.rs | 4 ++-- benches/unicode_word_indices.rs | 37 +++++++++++++++++++++++++++++++++ benches/word_bounds.rs | 2 +- benches/words.rs | 4 ++-- 5 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 benches/unicode_word_indices.rs diff --git a/Cargo.toml b/Cargo.toml index 404f053..1aac6ea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only. [dev-dependencies] quickcheck = "0.7" criterion = "0.5" +proptest = "1.7.0" [[bench]] name = "chars" @@ -36,3 +37,8 @@ harness = false [[bench]] name = "word_bounds" harness = false + +[[bench]] +name = "unicode_word_indices" +harness = false + diff --git a/benches/chars.rs b/benches/chars.rs index bacffa1..2654a26 100644 --- a/benches/chars.rs +++ b/benches/chars.rs @@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } @@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("scalar", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| scalar(content)), ); } diff --git a/benches/unicode_word_indices.rs b/benches/unicode_word_indices.rs new file mode 100644 index 0000000..4c09404 --- /dev/null +++ b/benches/unicode_word_indices.rs @@ -0,0 +1,37 @@ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; + +use std::fs; +use unicode_segmentation::UnicodeSegmentation; + +const 
FILES: &[&str] = &[ + "log", //"arabic", + "english", + //"hindi", + "japanese", + //"korean", + //"mandarin", + //"russian", + //"source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for w in text.unicode_word_indices() { + black_box(w); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("unicode_word_indices"); + + for file in FILES { + let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(); + group.throughput(criterion::Throughput::Bytes(input.len() as u64)); + group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| { + b.iter(|| grapheme(content)) + }); + } +} + +criterion_group!(benches, bench_all); +criterion_main!(benches); diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs index 42d50ff..f1af7c4 100644 --- a/benches/word_bounds.rs +++ b/benches/word_bounds.rs @@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt",)).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } diff --git a/benches/words.rs b/benches/words.rs index 86785d5..508bc9f 100644 --- a/benches/words.rs +++ b/benches/words.rs @@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("grapheme", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| grapheme(content)), ); } @@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) { for file in FILES { group.bench_with_input( BenchmarkId::new("scalar", file), - &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(), |b, content| b.iter(|| scalar(content)), ); } From 
eca90432d2943af1d5040d61b39e05a16780949e Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 13:08:47 +0800 Subject: [PATCH 02/13] add ascii fastpath --- src/lib.rs | 25 +++-- src/word.rs | 306 +++++++++++++++++++++++++++++++--------------------- 2 files changed, 196 insertions(+), 135 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c8ec5b5..7672eb2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -56,11 +56,14 @@ )] #![no_std] +#[cfg(test)] +extern crate std; + pub use grapheme::{GraphemeCursor, GraphemeIncomplete}; pub use grapheme::{GraphemeIndices, Graphemes}; pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences}; pub use tables::UNICODE_VERSION; -pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords}; +pub use word::{UWordBoundIndices, UWordBounds}; mod grapheme; mod sentence; @@ -133,7 +136,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> UnicodeWords<'_>; + fn unicode_words(&self) -> impl Iterator; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -157,7 +160,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>; + fn unicode_word_indices(&self) -> impl Iterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -173,7 +176,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> UWordBounds<'_>; + fn split_word_bounds(&self) -> impl DoubleEndedIterator; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. 
@@ -188,7 +191,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>; + fn split_word_bound_indices(&self) -> impl DoubleEndedIterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -210,7 +213,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&us1[..], b); /// ``` - fn unicode_sentences(&self) -> UnicodeSentences<'_>; + fn unicode_sentences(&self) -> impl Iterator; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -258,27 +261,27 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> UnicodeWords { + fn unicode_words(&self) -> impl Iterator { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> UnicodeWordIndices { + fn unicode_word_indices(&self) -> impl Iterator { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> UWordBounds { + fn split_word_bounds(&self) -> impl DoubleEndedIterator { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> UWordBoundIndices { + fn split_word_bound_indices(&self) -> impl DoubleEndedIterator { word::new_word_bound_indices(self) } #[inline] - fn unicode_sentences(&self) -> UnicodeSentences { + fn unicode_sentences(&self) -> impl Iterator { sentence::new_unicode_sentences(self) } diff --git a/src/word.rs b/src/word.rs index b2a85ae..964cdc0 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,85 +9,11 @@ // except according to those terms. 
use core::cmp; -use core::iter::Filter; -use crate::tables::word::WordCat; - -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// -/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] -pub struct UnicodeWords<'a> { - inner: Filter, fn(&&str) -> bool>, -} - -impl<'a> Iterator for UnicodeWords<'a> { - type Item = &'a str; - - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.inner.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} -impl<'a> DoubleEndedIterator for UnicodeWords<'a> { - #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() - } -} - -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// This iterator also provides the byte offsets for each substring. -/// -/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. 
-/// -/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] -pub struct UnicodeWordIndices<'a> { - #[allow(clippy::type_complexity)] - inner: Filter, fn(&(usize, &str)) -> bool>, -} +extern crate alloc; +use alloc::boxed::Box; -impl<'a> Iterator for UnicodeWordIndices<'a> { - type Item = (usize, &'a str); - - #[inline] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.inner.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() - } -} -impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { - #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> { - self.inner.next_back() - } -} +use crate::tables::word::WordCat; /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -117,24 +43,6 @@ pub struct UWordBoundIndices<'a> { iter: UWordBounds<'a>, } -impl<'a> UWordBoundIndices<'a> { - #[inline] - /// View the underlying data (the part yet to be iterated) as a slice of the original string. - /// - /// ```rust - /// # use unicode_segmentation::UnicodeSegmentation; - /// let mut iter = "Hello world".split_word_bound_indices(); - /// assert_eq!(iter.as_str(), "Hello world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), " world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), "world"); - /// ``` - pub fn as_str(&self) -> &'a str { - self.iter.as_str() - } -} - impl<'a> Iterator for UWordBoundIndices<'a> { type Item = (usize, &'a str); @@ -677,22 +585,6 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> { } impl<'a> UWordBounds<'a> { - #[inline] - /// View the underlying data (the part yet to be iterated) as a slice of the original string. 
- /// - /// ```rust - /// # use unicode_segmentation::UnicodeSegmentation; - /// let mut iter = "Hello world".split_word_bounds(); - /// assert_eq!(iter.as_str(), "Hello world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), " world"); - /// iter.next(); - /// assert_eq!(iter.as_str(), "world"); - /// ``` - pub fn as_str(&self) -> &'a str { - self.string - } - #[inline] fn get_next_cat(&self, idx: usize) -> Option { use crate::tables::word as wd; @@ -736,33 +628,161 @@ pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { #[inline] fn has_alphanumeric(s: &&str) -> bool { - use crate::tables::util::is_alphanumeric; - - s.chars().any(is_alphanumeric) + s.chars().any(|c| c.is_alphanumeric()) } #[inline] -pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - use super::UnicodeSegmentation; +fn has_ascii_alphanumeric(s: &&str) -> bool { + s.bytes().any(|b| b.is_ascii_alphanumeric()) +} + +/// Fast-path for ASCII-only word segmentation, matching `unicode-segmentation` on pure ASCII: +/// • runs of ASCII spaces are grouped (`" "`) +/// • core-runs (letters, digits, underscore + infix) +/// • any other ASCII char emits as one token, except CR+LF emits as a single two-char token +pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator + 'a { + #[inline(always)] + fn is_core(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' + } + #[inline(always)] + fn is_infix(b: u8, prev: u8, next: u8) -> bool { + match b { + // numeric separators + b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true, + // apostrophe in contractions + b'\'' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + // dot/colon inside letters + b'.' 
| b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + _ => false, + } + } + + use core::iter::from_fn; + let mut rest = s; + let mut offset = 0; + + from_fn(move || { + if rest.is_empty() { + return None; + } + let bytes = rest.as_bytes(); + let len = bytes.len(); + + // 1) Group runs of spaces + if bytes[0] == b' ' { + let mut i = 1; + while i < len && bytes[i] == b' ' { + i += 1; + } + let word = &rest[..i]; + let pos = offset; + rest = &rest[i..]; + offset += i; + return Some((pos, word)); + } - UnicodeWords { - inner: s.split_word_bounds().filter(has_alphanumeric), + // 2) Core-run (letters/digits/underscore + infix) + if is_core(bytes[0]) { + let mut i = 1; + while i < len { + let b = bytes[i]; + if is_core(b) || (i + 1 < len && is_infix(b, bytes[i - 1], bytes[i + 1])) { + i += 1; + } else { + break; + } + } + let word = &rest[..i]; + let pos = offset; + rest = &rest[i..]; + offset += i; + return Some((pos, word)); + } + + // 3) Non-core: CR+LF as one token, otherwise single char + if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' { + let word = &rest[..2]; + let pos = offset; + rest = &rest[2..]; + offset += 2; + Some((pos, word)) + } else { + // emit exactly one byte (whitespace/control/punct) + let word = &rest[..1]; + let pos = offset; + rest = &rest[1..]; + offset += 1; + Some((pos, word)) + } + }) +} +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// +/// This method is accessed by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. 
+/// +/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[inline] +pub(crate) fn new_unicode_words(s: &str) -> Box + '_> { + if s.is_ascii() { + Box::new(new_unicode_words_ascii(s)) + } else { + Box::new(new_unicode_words_general(s)) } } #[inline] -pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { - use super::UnicodeSegmentation; +fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { + new_ascii_word_bound_indices(s) + .map(|(_, w)| w) + .filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric())) +} + +#[inline] +fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + 'a { + new_word_bounds(s).filter(has_alphanumeric) +} - UnicodeWordIndices { - inner: s - .split_word_bound_indices() - .filter(|(_, c)| has_alphanumeric(c)), +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// This iterator also provides the byte offsets for each substring. +/// +/// This method is accessed by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. 
+/// +/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[inline] +pub fn new_unicode_word_indices<'a>(s: &'a str) -> Box + 'a> { + if s.is_ascii() { + Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) + } else { + Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) } } #[cfg(test)] mod tests { + use crate::word::{ + new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices, + }; + use std::string::String; + use std::vec::Vec; + use std::{format, vec}; + + use proptest::prelude::*; + #[test] fn test_syriac_abbr_mark() { use crate::tables::word as wd; @@ -776,4 +796,42 @@ mod tests { let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); } + + #[test] + fn test_ascii_word_indices_various_cases() { + let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com"; + let words: Vec<&str> = new_unicode_words_ascii(s).collect(); + let expected = vec![ + ("Hello"), // simple letters + ("world"), // skip comma+space, stop at '!' + ("can't"), // apostrophe joins letters + ("e.g"), + ("var1"), + ("123,456"), // digits+comma+digits + ("foo_bar"), + ("example.com"), + ]; + assert_eq!(words, expected); + } + + /// Strategy that yields every code-point from NUL (0) to DEL (127). + fn ascii_char() -> impl Strategy { + (0u8..=127).prop_map(|b| b as char) + } + + proptest! { + #![proptest_config(ProptestConfig::with_cases(10000))] + /// Fast path must equal general path for any ASCII input. 
+ #[test] + fn proptest_ascii_matches_unicode_word_indices( + // Vec → String, length 0‒99 + s in proptest::collection::vec(ascii_char(), 0..100) + .prop_map(|v| v.into_iter().collect::()) + ) { + let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect(); + let uni: Vec<(usize, &str)> = new_word_bound_indices(&s).collect(); + + prop_assert_eq!(fast, uni); + } + } } From b5ed407d84bdac57ffae8f968973bf33496a0326 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 11 Jul 2025 14:27:30 +0800 Subject: [PATCH 03/13] add test case IP --- src/word.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/word.rs b/src/word.rs index 964cdc0..c690eb6 100644 --- a/src/word.rs +++ b/src/word.rs @@ -799,7 +799,7 @@ mod tests { #[test] fn test_ascii_word_indices_various_cases() { - let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com"; + let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090"; let words: Vec<&str> = new_unicode_words_ascii(s).collect(); let expected = vec![ ("Hello"), // simple letters @@ -810,6 +810,8 @@ mod tests { ("123,456"), // digits+comma+digits ("foo_bar"), ("example.com"), + ("127.0.0.1"), + ("9090"), // port number ]; assert_eq!(words, expected); } From 9b1b7f998451d87c0fb8481c756778a4961f122c Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Sun, 13 Jul 2025 18:30:30 +0800 Subject: [PATCH 04/13] add log to benches --- benches/texts/log.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 benches/texts/log.txt diff --git a/benches/texts/log.txt b/benches/texts/log.txt new file mode 100644 index 0000000..e18ca32 --- /dev/null +++ b/benches/texts/log.txt @@ -0,0 +1 @@ +2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later From 6f96a23b0234a078e05028563198d8c04c48b2e5 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 19:55:08 +0800 
Subject: [PATCH 05/13] restore iterators --- src/lib.rs | 18 ++-- src/word.rs | 280 ++++++++++++++++++++++++++++++++++------------------ 2 files changed, 193 insertions(+), 105 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 7672eb2..1dbdd73 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,8 @@ pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences}; pub use tables::UNICODE_VERSION; pub use word::{UWordBoundIndices, UWordBounds}; +use crate::word::{UnicodeWordIndices, UnicodeWords}; + mod grapheme; mod sentence; #[rustfmt::skip] @@ -136,7 +138,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> impl Iterator; + fn unicode_words(&self) -> UnicodeWords; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -160,7 +162,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> impl Iterator; + fn unicode_word_indices(&self) -> UnicodeWordIndices; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -176,7 +178,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> impl DoubleEndedIterator; + fn split_word_bounds(&self) -> UWordBounds; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. @@ -191,7 +193,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> impl DoubleEndedIterator; + fn split_word_bound_indices(&self) -> UWordBoundIndices; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). 
@@ -261,22 +263,22 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> impl Iterator { + fn unicode_words(&self) -> UnicodeWords { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> impl Iterator { + fn unicode_word_indices(&self) -> UnicodeWordIndices { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> impl DoubleEndedIterator { + fn split_word_bounds(&self) -> UWordBounds { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> impl DoubleEndedIterator { + fn split_word_bound_indices(&self) -> UWordBoundIndices { word::new_word_bound_indices(self) } diff --git a/src/word.rs b/src/word.rs index c690eb6..1d1d69c 100644 --- a/src/word.rs +++ b/src/word.rs @@ -8,13 +8,82 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use core::cmp; - extern crate alloc; use alloc::boxed::Box; +use core::cmp; +use core::iter::Filter; use crate::tables::word::WordCat; +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// +/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. 
+/// +/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +pub struct UnicodeWords<'a> { + inner: Box + 'a>, +} + +impl<'a> Iterator for UnicodeWords<'a> { + type Item = &'a str; + + #[inline] + fn next(&mut self) -> Option<&'a str> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} + +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// This iterator also provides the byte offsets for each substring. +/// +/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. +/// +/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] +pub struct UnicodeWordIndices<'a> { + #[allow(clippy::type_complexity)] + inner: Filter, fn(&(usize, &str)) -> bool>, +} + +impl<'a> Iterator for UnicodeWordIndices<'a> { + type Item = (usize, &'a str); + + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { + self.inner.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + self.inner.size_hint() + } +} +impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> { + self.inner.next_back() + } +} + /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). 
/// @@ -43,6 +112,24 @@ pub struct UWordBoundIndices<'a> { iter: UWordBounds<'a>, } +impl<'a> UWordBoundIndices<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "Hello world".split_word_bound_indices(); + /// assert_eq!(iter.as_str(), "Hello world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), " world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "world"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.iter.as_str() + } +} + impl<'a> Iterator for UWordBoundIndices<'a> { type Item = (usize, &'a str); @@ -585,6 +672,22 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> { } impl<'a> UWordBounds<'a> { + #[inline] + /// View the underlying data (the part yet to be iterated) as a slice of the original string. + /// + /// ```rust + /// # use unicode_segmentation::UnicodeSegmentation; + /// let mut iter = "Hello world".split_word_bounds(); + /// assert_eq!(iter.as_str(), "Hello world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), " world"); + /// iter.next(); + /// assert_eq!(iter.as_str(), "world"); + /// ``` + pub fn as_str(&self) -> &'a str { + self.string + } + #[inline] fn get_next_cat(&self, idx: usize) -> Option { use crate::tables::word as wd; @@ -609,42 +712,21 @@ impl<'a> UWordBounds<'a> { } } -#[inline] -pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { - UWordBounds { - string: s, - cat: None, - catb: None, - } +pub struct AsciiWordBoundIter<'a> { + rest: &'a str, + offset: usize, } -#[inline] -pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { - UWordBoundIndices { - start_offset: s.as_ptr() as usize, - iter: new_word_bounds(s), +impl<'a> AsciiWordBoundIter<'a> { + pub fn new(s: &'a str) -> Self { + AsciiWordBoundIter { rest: s, offset: 0 } } -} - -#[inline] -fn has_alphanumeric(s: &&str) -> bool { - s.chars().any(|c| c.is_alphanumeric()) -} -#[inline] -fn 
has_ascii_alphanumeric(s: &&str) -> bool { - s.bytes().any(|b| b.is_ascii_alphanumeric()) -} - -/// Fast-path for ASCII-only word segmentation, matching `unicode-segmentation` on pure ASCII: -/// • runs of ASCII spaces are grouped (`" "`) -/// • core-runs (letters, digits, underscore + infix) -/// • any other ASCII char emits as one token, except CR+LF emits as a single two-char token -pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator + 'a { #[inline(always)] fn is_core(b: u8) -> bool { b.is_ascii_alphanumeric() || b == b'_' } + #[inline(always)] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { @@ -657,16 +739,17 @@ pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator false, } } +} - use core::iter::from_fn; - let mut rest = s; - let mut offset = 0; +impl<'a> Iterator for AsciiWordBoundIter<'a> { + type Item = (usize, &'a str); - from_fn(move || { - if rest.is_empty() { + fn next(&mut self) -> Option { + if self.rest.is_empty() { return None; } - let bytes = rest.as_bytes(); + + let bytes = self.rest.as_bytes(); let len = bytes.len(); // 1) Group runs of spaces @@ -675,69 +758,79 @@ pub fn new_ascii_word_bound_indices<'a>(s: &'a str) -> impl Iterator= 2 && bytes[1] == b'\n' { - let word = &rest[..2]; - let pos = offset; - rest = &rest[2..]; - offset += 2; + let word = &self.rest[..2]; + let pos = self.offset; + self.rest = &self.rest[2..]; + self.offset += 2; Some((pos, word)) } else { - // emit exactly one byte (whitespace/control/punct) - let word = &rest[..1]; - let pos = offset; - rest = &rest[1..]; - offset += 1; + let word = &self.rest[..1]; + let pos = self.offset; + self.rest = &self.rest[1..]; + self.offset += 1; Some((pos, word)) } - }) + } } -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// 
property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// -/// This method is accessed by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. -/// -/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html + #[inline] -pub(crate) fn new_unicode_words(s: &str) -> Box + '_> { - if s.is_ascii() { - Box::new(new_unicode_words_ascii(s)) - } else { - Box::new(new_unicode_words_general(s)) +pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { + UWordBounds { + string: s, + cat: None, + catb: None, + } +} + +#[inline] +pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { + UWordBoundIndices { + start_offset: s.as_ptr() as usize, + iter: new_word_bounds(s), } } +#[inline] +pub fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { + AsciiWordBoundIter::new(s) +} + +#[inline] +fn has_alphanumeric(s: &&str) -> bool { + use crate::tables::util::is_alphanumeric; + + s.chars().any(is_alphanumeric) +} + #[inline] fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { new_ascii_word_bound_indices(s) @@ -750,25 +843,25 @@ fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + new_word_bounds(s).filter(has_alphanumeric) } -/// An iterator over the substrings of a string which, after splitting the string on -/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), -/// contain any characters with the -/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) -/// property, or with -/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). -/// This iterator also provides the byte offsets for each substring. -/// -/// This method is accessed by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See -/// its documentation for more. 
-/// -/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices -/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html #[inline] -pub fn new_unicode_word_indices<'a>(s: &'a str) -> Box + 'a> { - if s.is_ascii() { - Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) +pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { + let iter: Box> = if s.is_ascii() { + Box::new(new_unicode_words_ascii(s)) } else { - Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) + Box::new(new_unicode_words_general(s)) + }; + + UnicodeWords { inner: iter } +} + +#[inline] +pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { + use super::UnicodeSegmentation; + + UnicodeWordIndices { + inner: s + .split_word_bound_indices() + .filter(|(_, c)| has_alphanumeric(c)), } } @@ -785,13 +878,6 @@ mod tests { #[test] fn test_syriac_abbr_mark() { - use crate::tables::word as wd; - let (_, _, cat) = wd::word_category('\u{70f}'); - assert_eq!(cat, wd::WC_ALetter); - } - - #[test] - fn test_end_of_ayah_cat() { use crate::tables::word as wd; let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); From 7beb8a60b73dbeecd42d16bb8f32cdfb0da2a9e3 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 20:15:55 +0800 Subject: [PATCH 06/13] add backwards iterator --- src/lib.rs | 24 +++++------ src/word.rs | 120 +++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 113 insertions(+), 31 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1dbdd73..d15ac0b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -138,7 +138,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uw1[..], b); /// ``` - fn unicode_words(&self) -> UnicodeWords; + fn unicode_words(&self) -> UnicodeWords<'_>; /// Returns an iterator over the words of `self`, separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their @@ -162,7 +162,7 
@@ pub trait UnicodeSegmentation { /// /// assert_eq!(&uwi1[..], b); /// ``` - fn unicode_word_indices(&self) -> UnicodeWordIndices; + fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). @@ -178,7 +178,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swu1[..], b); /// ``` - fn split_word_bounds(&self) -> UWordBounds; + fn split_word_bounds(&self) -> UWordBounds<'_>; /// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries, /// and their offsets. See `split_word_bounds()` for more information. @@ -193,7 +193,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&swi1[..], b); /// ``` - fn split_word_bound_indices(&self) -> UWordBoundIndices; + fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). @@ -215,7 +215,7 @@ pub trait UnicodeSegmentation { /// /// assert_eq!(&us1[..], b); /// ``` - fn unicode_sentences(&self) -> impl Iterator; + fn unicode_sentences(&self) -> UnicodeSentences<'_>; /// Returns an iterator over substrings of `self` separated on /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). 
@@ -253,7 +253,7 @@ pub trait UnicodeSegmentation { impl UnicodeSegmentation for str { #[inline] - fn graphemes(&self, is_extended: bool) -> Graphemes { + fn graphemes(&self, is_extended: bool) -> Graphemes<'_> { grapheme::new_graphemes(self, is_extended) } @@ -263,32 +263,32 @@ impl UnicodeSegmentation for str { } #[inline] - fn unicode_words(&self) -> UnicodeWords { + fn unicode_words(&self) -> UnicodeWords<'_> { word::new_unicode_words(self) } #[inline] - fn unicode_word_indices(&self) -> UnicodeWordIndices { + fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> { word::new_unicode_word_indices(self) } #[inline] - fn split_word_bounds(&self) -> UWordBounds { + fn split_word_bounds(&self) -> UWordBounds<'_> { word::new_word_bounds(self) } #[inline] - fn split_word_bound_indices(&self) -> UWordBoundIndices { + fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> { word::new_word_bound_indices(self) } #[inline] - fn unicode_sentences(&self) -> impl Iterator { + fn unicode_sentences(&self) -> UnicodeSentences<'_> { sentence::new_unicode_sentences(self) } #[inline] - fn split_sentence_bounds(&self) -> USentenceBounds { + fn split_sentence_bounds(&self) -> USentenceBounds<'_> { sentence::new_sentence_bounds(self) } diff --git a/src/word.rs b/src/word.rs index 1d1d69c..c835445 100644 --- a/src/word.rs +++ b/src/word.rs @@ -11,7 +11,6 @@ extern crate alloc; use alloc::boxed::Box; use core::cmp; -use core::iter::Filter; use crate::tables::word::WordCat; @@ -28,7 +27,7 @@ use crate::tables::word::WordCat; /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWords<'a> { - inner: Box + 'a>, + inner: Box + 'a>, } impl<'a> Iterator for UnicodeWords<'a> { @@ -45,6 +44,13 @@ impl<'a> Iterator for UnicodeWords<'a> { } } +impl<'a> DoubleEndedIterator for UnicodeWords<'a> { + #[inline] + fn next_back(&mut self) -> Option<&'a str> { + self.inner.next_back() + } +} + /// 
An iterator over the substrings of a string which, after splitting the string on /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), /// contain any characters with the @@ -58,16 +64,15 @@ impl<'a> Iterator for UnicodeWords<'a> { /// /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html -#[derive(Debug)] pub struct UnicodeWordIndices<'a> { #[allow(clippy::type_complexity)] - inner: Filter, fn(&(usize, &str)) -> bool>, + inner: Box + 'a>, } impl<'a> Iterator for UnicodeWordIndices<'a> { type Item = (usize, &'a str); - #[inline] + #[inline(always)] fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() } @@ -722,12 +727,12 @@ impl<'a> AsciiWordBoundIter<'a> { AsciiWordBoundIter { rest: s, offset: 0 } } - #[inline(always)] + #[inline] fn is_core(b: u8) -> bool { b.is_ascii_alphanumeric() || b == b'_' } - #[inline(always)] + #[inline] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { // numeric separators @@ -744,6 +749,7 @@ impl<'a> AsciiWordBoundIter<'a> { impl<'a> Iterator for AsciiWordBoundIter<'a> { type Item = (usize, &'a str); + #[inline] fn next(&mut self) -> Option { if self.rest.is_empty() { return None; @@ -802,6 +808,66 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { } } +impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { + fn next_back(&mut self) -> Option<(usize, &'a str)> { + let rest = self.rest; + if rest.is_empty() { + return None; + } + let bytes = rest.as_bytes(); + let len = bytes.len(); + + // 1) Trailing spaces + if bytes[len - 1] == b' ' { + // find start of this last run of spaces + let mut start = len - 1; + while start > 0 && bytes[start - 1] == b' ' { + start -= 1; + } + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 2) Trailing core-run (letters/digits/underscore + infix) + if Self::is_core(bytes[len - 1]) { + 
// scan backwards as long as we see `is_core` or an `is_infix` + let mut start = len - 1; + while start > 0 { + let b = bytes[start - 1]; + let prev = if start >= 2 { bytes[start - 2] } else { b }; + let next = bytes[start]; // the byte we just included + if Self::is_core(b) || Self::is_infix(b, prev, next) { + start -= 1; + } else { + break; + } + } + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 3) CR+LF at end + if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' { + let start = len - 2; + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + return Some((pos, word)); + } + + // 4) Single non-core byte + let start = len - 1; + let word = &rest[start..]; + let pos = self.offset + start; + self.rest = &rest[..start]; + Some((pos, word)) + } +} + #[inline] pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { UWordBounds { @@ -832,20 +898,25 @@ fn has_alphanumeric(s: &&str) -> bool { } #[inline] -fn new_unicode_words_ascii<'a>(s: &'a str) -> impl Iterator + 'a { +fn has_ascii_alphanumeric(s: &&str) -> bool { + s.chars().any(|c| c.is_ascii_alphanumeric()) +} + +#[inline] +fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { new_ascii_word_bound_indices(s) .map(|(_, w)| w) - .filter(|w| w.chars().any(|c| c.is_ascii_alphanumeric())) + .filter(has_ascii_alphanumeric) } #[inline] -fn new_unicode_words_general<'a>(s: &'a str) -> impl Iterator + 'a { +fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { new_word_bounds(s).filter(has_alphanumeric) } #[inline] pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - let iter: Box> = if s.is_ascii() { + let iter: Box> = if s.is_ascii() { Box::new(new_unicode_words_ascii(s)) } else { Box::new(new_unicode_words_general(s)) @@ -855,14 +926,13 @@ pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { } #[inline] -pub fn 
new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { - use super::UnicodeSegmentation; - - UnicodeWordIndices { - inner: s - .split_word_bound_indices() - .filter(|(_, c)| has_alphanumeric(c)), - } +pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> { + let iter: Box> = if s.is_ascii() { + Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) + } else { + Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) + }; + UnicodeWordIndices { inner: iter } } #[cfg(test)] @@ -921,5 +991,17 @@ mod tests { prop_assert_eq!(fast, uni); } + + /// Fast path must equal general path for any ASCII input, forwards and backwards. + #[test] + fn proptest_ascii_matches_unicode_word_indices_rev( + // Vec → String, length 0‒99 + s in proptest::collection::vec(ascii_char(), 0..100) + .prop_map(|v| v.into_iter().collect::()) + ) { + let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect(); + let uni_rev : Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect(); + prop_assert_eq!(fast_rev, uni_rev); + } } } From a3881da71bb3b4747713d149ef61444b91a43faf Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 15 Jul 2025 20:40:35 +0800 Subject: [PATCH 07/13] restore test --- src/word.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/word.rs b/src/word.rs index c835445..b6e042b 100644 --- a/src/word.rs +++ b/src/word.rs @@ -948,6 +948,13 @@ mod tests { #[test] fn test_syriac_abbr_mark() { + use crate::tables::word as wd; + let (_, _, cat) = wd::word_category('\u{70f}'); + assert_eq!(cat, wd::WC_ALetter); + } + + #[test] + fn test_end_of_ayah_cat() { use crate::tables::word as wd; let (_, _, cat) = wd::word_category('\u{6dd}'); assert_eq!(cat, wd::WC_Numeric); From 7599d624aa21b1bdd8bda7b1d12dfa37a2a690ef Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 16 Jul 2025 20:15:20 +0800 Subject: [PATCH 08/13] replace Box with Enum --- src/word.rs | 135 
+++++++++++++++++++++++++++++++++++----------------- 1 file changed, 91 insertions(+), 44 deletions(-) diff --git a/src/word.rs b/src/word.rs index b6e042b..f4bd9e1 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,7 +9,6 @@ // except according to those terms. extern crate alloc; -use alloc::boxed::Box; use core::cmp; use crate::tables::word::WordCat; @@ -27,27 +26,33 @@ use crate::tables::word::WordCat; /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWords<'a> { - inner: Box + 'a>, + inner: WordsIter<'a>, } impl<'a> Iterator for UnicodeWords<'a> { type Item = &'a str; - #[inline] - fn next(&mut self) -> Option<&'a str> { - self.inner.next() + fn next(&mut self) -> Option { + match &mut self.inner { + WordsIter::Ascii(i) => i.next(), + WordsIter::Unicode(i) => i.next(), + } } - #[inline] fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() + match &self.inner { + WordsIter::Ascii(i) => i.size_hint(), + WordsIter::Unicode(i) => i.size_hint(), + } } } - impl<'a> DoubleEndedIterator for UnicodeWords<'a> { #[inline] - fn next_back(&mut self) -> Option<&'a str> { - self.inner.next_back() + fn next_back(&mut self) -> Option { + match &mut self.inner { + WordsIter::Ascii(i) => i.next_back(), + WordsIter::Unicode(i) => i.next_back(), + } } } @@ -65,27 +70,33 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html pub struct UnicodeWordIndices<'a> { - #[allow(clippy::type_complexity)] - inner: Box + 'a>, + inner: IndicesIter<'a>, } impl<'a> Iterator for UnicodeWordIndices<'a> { type Item = (usize, &'a str); - - #[inline(always)] - fn next(&mut self) -> Option<(usize, &'a str)> { - self.inner.next() + #[inline] + fn next(&mut self) -> Option { + match &mut self.inner { + IndicesIter::Ascii(i) => i.next(), + 
IndicesIter::Unicode(i) => i.next(), + } } - #[inline] fn size_hint(&self) -> (usize, Option) { - self.inner.size_hint() + match &self.inner { + IndicesIter::Ascii(i) => i.size_hint(), + IndicesIter::Unicode(i) => i.size_hint(), + } } } impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { #[inline] - fn next_back(&mut self) -> Option<(usize, &'a str)> { - self.inner.next_back() + fn next_back(&mut self) -> Option { + match &mut self.inner { + IndicesIter::Ascii(i) => i.next_back(), + IndicesIter::Unicode(i) => i.next_back(), + } } } @@ -868,6 +879,58 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { } } +#[inline] +fn ascii_word_ok(t: &(usize, &str)) -> bool { + has_ascii_alphanumeric(&t.1) +} +#[inline] +fn unicode_word_ok(t: &(usize, &str)) -> bool { + has_alphanumeric(&t.1) +} + +type AsciiWordsIter<'a> = core::iter::Filter< + core::iter::Map, fn((usize, &'a str)) -> &'a str>, + fn(&&'a str) -> bool, +>; + +type UnicodeWordsIter<'a> = core::iter::Filter, fn(&&'a str) -> bool>; + +type AsciiIndicesIter<'a> = + core::iter::Filter, fn(&(usize, &'a str)) -> bool>; + +type UnicodeIndicesIter<'a> = + core::iter::Filter, fn(&(usize, &'a str)) -> bool>; + +enum WordsIter<'a> { + Ascii(AsciiWordsIter<'a>), + Unicode(UnicodeWordsIter<'a>), +} + +enum IndicesIter<'a> { + Ascii(AsciiIndicesIter<'a>), + Unicode(UnicodeIndicesIter<'a>), +} + +#[inline] +pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { + let inner = if s.is_ascii() { + WordsIter::Ascii(new_unicode_words_ascii(s)) + } else { + WordsIter::Unicode(new_unicode_words_general(s)) + }; + UnicodeWords { inner } +} + +#[inline] +pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { + let inner = if s.is_ascii() { + IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok)) + } else { + IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok)) + }; + UnicodeWordIndices { inner } +} + #[inline] pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { 
UWordBounds { @@ -902,39 +965,23 @@ fn has_ascii_alphanumeric(s: &&str) -> bool { s.chars().any(|c| c.is_ascii_alphanumeric()) } +#[inline(always)] +fn strip_pos((_, w): (usize, &str)) -> &str { + w +} + #[inline] -fn new_unicode_words_ascii<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { +fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> { new_ascii_word_bound_indices(s) - .map(|(_, w)| w) + .map(strip_pos as fn(_) -> _) .filter(has_ascii_alphanumeric) } #[inline] -fn new_unicode_words_general<'a>(s: &'a str) -> impl DoubleEndedIterator + 'a { +fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> { new_word_bounds(s).filter(has_alphanumeric) } -#[inline] -pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { - let iter: Box> = if s.is_ascii() { - Box::new(new_unicode_words_ascii(s)) - } else { - Box::new(new_unicode_words_general(s)) - }; - - UnicodeWords { inner: iter } -} - -#[inline] -pub fn new_unicode_word_indices<'a>(s: &'a str) -> UnicodeWordIndices<'a> { - let iter: Box> = if s.is_ascii() { - Box::new(new_ascii_word_bound_indices(s).filter(|(_, w)| has_ascii_alphanumeric(w))) - } else { - Box::new(new_word_bound_indices(s).filter(|(_, w)| has_alphanumeric(w))) - }; - UnicodeWordIndices { inner: iter } -} - #[cfg(test)] mod tests { use crate::word::{ From e29c432017fbb58b05b8703eba0766e9caa5b914 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:04:37 +0800 Subject: [PATCH 09/13] add comments with reference to the spec --- src/word.rs | 61 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/src/word.rs b/src/word.rs index f4bd9e1..b2d1c37 100644 --- a/src/word.rs +++ b/src/word.rs @@ -728,6 +728,31 @@ impl<'a> UWordBounds<'a> { } } +/// ASCII‑fast‑path word‑boundary iterator for strings that contain only ASCII characters. 
+/// +/// Since we handle only ASCII characters, we can use a much simpler set of +/// word break values than the full Unicode algorithm. +/// https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values +/// +/// | Word_Break value | ASCII code points that belong to it | +/// | -----------------| --------------------------------------------------------------- | +/// | CR | U+000D (CR) | +/// | LF | U+000A (LF) | +/// | Newline | U+000B (VT), U+000C (FF) | +/// | Single_Quote | U+0027 (') | +/// | Double_Quote | U+0022 (") | +/// | MidNumLet | U+002E (.) FULL STOP | +/// | MidLetter | U+003A (:) COLON | +/// | MidNum | U+002C (,), U+003B (;) | +/// | Numeric | U+0030 – U+0039 (0 … 9) | +/// | ALetter | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z) | +/// | ExtendNumLet | U+005F (_) underscore | +/// | WSegSpace | U+0020 (SPACE) | +/// +/// The macro MidNumLetQ boils down to: U+002E (.) FULL STOP and U+0027 (') +/// AHLetter is the same as ALetter, so we don't need to distinguish it. +/// +/// Any other single ASCII byte is its own boundary (the default WB999). pub struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, @@ -746,12 +771,17 @@ impl<'a> AsciiWordBoundIter<'a> { #[inline] fn is_infix(b: u8, prev: u8, next: u8) -> bool { match b { - // numeric separators + // Numeric separators such as "1,000" or "3.14" (WB11/WB12) + // + // "Numeric (MidNum | MidNumLetQ) Numeric" b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true, - // apostrophe in contractions - b'\'' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, - // dot/colon inside letters - b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, + + // Dot or colon inside an alphabetic word ("e.g.", "http://") (WB6/WB7) + // + // "(MidLetter | MidNumLetQ) AHLetter (MidLetter | MidNumLetQ)" + // MidLetter = b':' + // MidNumLetQ = b'.' | b'\'' + b'\'' | b'.' 
| b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => true, _ => false, } } @@ -769,7 +799,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { let bytes = self.rest.as_bytes(); let len = bytes.len(); - // 1) Group runs of spaces + // 1) Keep horizontal whitespace together. + // Spec: WB3d joins adjacent *WSegSpace* into a single segment. if bytes[0] == b' ' { let mut i = 1; while i < len && bytes[i] == b' ' { @@ -783,6 +814,7 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { } // 2) Core-run (letters/digits/underscore + infix) + // Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b) if Self::is_core(bytes[0]) { let mut i = 1; while i < len { @@ -802,7 +834,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 3) Non-core: CR+LF as one token, otherwise single char + // 3) Do not break within CRLF. + // Spec: WB3 treats CR+LF as a single non‑breaking pair. if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' { let word = &self.rest[..2]; let pos = self.offset; @@ -810,6 +843,8 @@ impl<'a> Iterator for AsciiWordBoundIter<'a> { self.offset += 2; Some((pos, word)) } else { + // 4) Otherwise, break everywhere + // Spec: the catch‑all rule WB999. let word = &self.rest[..1]; let pos = self.offset; self.rest = &self.rest[1..]; @@ -828,7 +863,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { let bytes = rest.as_bytes(); let len = bytes.len(); - // 1) Trailing spaces + // 1) Group runs of spaces + // Spec: WB3d joins adjacent *WSegSpace* into a single segment. if bytes[len - 1] == b' ' { // find start of this last run of spaces let mut start = len - 1; @@ -841,7 +877,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 2) Trailing core-run (letters/digits/underscore + infix) + // 2) Trailing Core-run (letters/digits/underscore + infix) + // Spec: ALetter × ALetter, Numeric × Numeric etc. 
(WB5–WB13b) if Self::is_core(bytes[len - 1]) { // scan backwards as long as we see `is_core` or an `is_infix` let mut start = len - 1; @@ -861,7 +898,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 3) CR+LF at end + // 3) Non-core: CR+LF as one token, otherwise single char + // Spec: WB3 treats CR+LF as a single non‑breaking pair. if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' { let start = len - 2; let word = &rest[start..]; @@ -870,7 +908,8 @@ impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> { return Some((pos, word)); } - // 4) Single non-core byte + // 4) Fallback – every other byte is its own segment + // Spec: the catch‑all rule WB999. let start = len - 1; let word = &rest[start..]; let pos = self.offset + start; From 5a09f28848d33e3a960f68637ff2743cd085a3dd Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:18:59 +0800 Subject: [PATCH 10/13] remove unused alloc --- src/word.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/word.rs b/src/word.rs index b2d1c37..aa6cbcd 100644 --- a/src/word.rs +++ b/src/word.rs @@ -8,7 +8,6 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-extern crate alloc; use core::cmp; use crate::tables::word::WordCat; From f76a997b57d19b829a9427e7f4d14f810dbfc3f8 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:21:09 +0800 Subject: [PATCH 11/13] readd Debug derive --- src/word.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/word.rs b/src/word.rs index aa6cbcd..4077e9f 100644 --- a/src/word.rs +++ b/src/word.rs @@ -24,6 +24,7 @@ use crate::tables::word::WordCat; /// /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] pub struct UnicodeWords<'a> { inner: WordsIter<'a>, } @@ -68,6 +69,7 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { /// /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +#[derive(Debug)] pub struct UnicodeWordIndices<'a> { inner: IndicesIter<'a>, } @@ -752,6 +754,7 @@ impl<'a> UWordBounds<'a> { /// AHLetter is the same as ALetter, so we don't need to distinguish it. /// /// Any other single ASCII byte is its own boundary (the default WB999). +#[derive(Debug)] pub struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, @@ -939,11 +942,13 @@ type AsciiIndicesIter<'a> = type UnicodeIndicesIter<'a> = core::iter::Filter, fn(&(usize, &'a str)) -> bool>; +#[derive(Debug)] enum WordsIter<'a> { Ascii(AsciiWordsIter<'a>), Unicode(UnicodeWordsIter<'a>), } +#[derive(Debug)] enum IndicesIter<'a> { Ascii(AsciiIndicesIter<'a>), Unicode(UnicodeIndicesIter<'a>), From b556333ca894f4a547d5b6fb1dca0ef9991ec973 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Thu, 17 Jul 2025 13:22:29 +0800 Subject: [PATCH 12/13] use import --- src/word.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/word.rs b/src/word.rs index 4077e9f..fdd128b 100644 --- a/src/word.rs +++ b/src/word.rs @@ -9,6 +9,7 @@ // except according to those terms. 
use core::cmp; +use core::iter::Filter; use crate::tables::word::WordCat; @@ -929,18 +930,13 @@ fn unicode_word_ok(t: &(usize, &str)) -> bool { has_alphanumeric(&t.1) } -type AsciiWordsIter<'a> = core::iter::Filter< +type AsciiWordsIter<'a> = Filter< core::iter::Map, fn((usize, &'a str)) -> &'a str>, fn(&&'a str) -> bool, >; - -type UnicodeWordsIter<'a> = core::iter::Filter, fn(&&'a str) -> bool>; - -type AsciiIndicesIter<'a> = - core::iter::Filter, fn(&(usize, &'a str)) -> bool>; - -type UnicodeIndicesIter<'a> = - core::iter::Filter, fn(&(usize, &'a str)) -> bool>; +type UnicodeWordsIter<'a> = Filter, fn(&&'a str) -> bool>; +type AsciiIndicesIter<'a> = Filter, fn(&(usize, &'a str)) -> bool>; +type UnicodeIndicesIter<'a> = Filter, fn(&(usize, &'a str)) -> bool>; #[derive(Debug)] enum WordsIter<'a> { From 0e7674a40541b4baadfe59c65a28de68dcb8db40 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 18 Jul 2025 16:08:46 +0800 Subject: [PATCH 13/13] remove pub --- src/word.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/word.rs b/src/word.rs index fdd128b..1a46b39 100644 --- a/src/word.rs +++ b/src/word.rs @@ -756,7 +756,7 @@ impl<'a> UWordBounds<'a> { /// /// Any other single ASCII byte is its own boundary (the default WB999). 
#[derive(Debug)] -pub struct AsciiWordBoundIter<'a> { +struct AsciiWordBoundIter<'a> { rest: &'a str, offset: usize, } @@ -988,7 +988,7 @@ pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { } #[inline] -pub fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { +fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> { AsciiWordBoundIter::new(s) } @@ -1046,6 +1046,20 @@ mod tests { assert_eq!(cat, wd::WC_Numeric); } + #[test] + fn test_ascii_word_bound_indices_various_cases() { + let s = "Hello, world!"; + let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect(); + let expected = vec![ + (0, "Hello"), // simple letters + (5, ","), + (6, " "), // space after comma + (7, "world"), // skip comma+space, stop at '!' + (12, "!"), // punctuation at the end + ]; + assert_eq!(words, expected); + } + #[test] fn test_ascii_word_indices_various_cases() { let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090"; pFad - Phonifier reborn