diff --git a/Cargo.toml b/Cargo.toml
index 404f053..1aac6ea 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -24,6 +24,7 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
 [dev-dependencies]
 quickcheck = "0.7"
 criterion = "0.5"
+proptest = "1.7.0"
 
 [[bench]]
 name = "chars"
@@ -36,3 +37,8 @@ harness = false
 [[bench]]
 name = "word_bounds"
 harness = false
+
+[[bench]]
+name = "unicode_word_indices"
+harness = false
+
diff --git a/benches/chars.rs b/benches/chars.rs
index bacffa1..2654a26 100644
--- a/benches/chars.rs
+++ b/benches/chars.rs
@@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
@@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("scalar", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| scalar(content)),
         );
     }
diff --git a/benches/texts/log.txt b/benches/texts/log.txt
new file mode 100644
index 0000000..e18ca32
--- /dev/null
+++ b/benches/texts/log.txt
@@ -0,0 +1 @@
+2018-07-12 13:59:01 UTC | ERROR | (worker.go:131 in process) | Too many errors for endpoint 'dummy/api/v1/check_run?api_key=*************************00000': retrying later
diff --git a/benches/unicode_word_indices.rs b/benches/unicode_word_indices.rs
new file mode 100644
index 0000000..4c09404
--- /dev/null
+++ b/benches/unicode_word_indices.rs
@@ -0,0 +1,37 @@
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "log", //"arabic",
+    "english",
+    //"hindi",
+    "japanese",
+    //"korean",
+    //"mandarin",
+    //"russian",
+    //"source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for w in text.unicode_word_indices() {
+        black_box(w);
+    }
+}
+
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("unicode_word_indices");
+
+    for file in FILES {
+        let input = fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap();
+        group.throughput(criterion::Throughput::Bytes(input.len() as u64));
+        group.bench_with_input(BenchmarkId::from_parameter(file), &input, |b, content| {
+            b.iter(|| grapheme(content))
+        });
+    }
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
index 42d50ff..f1af7c4 100644
--- a/benches/word_bounds.rs
+++ b/benches/word_bounds.rs
@@ -27,7 +27,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
diff --git a/benches/words.rs b/benches/words.rs
index 86785d5..508bc9f 100644
--- a/benches/words.rs
+++ b/benches/words.rs
@@ -41,7 +41,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("grapheme", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| grapheme(content)),
         );
     }
@@ -49,7 +49,7 @@ fn bench_all(c: &mut Criterion) {
     for file in FILES {
         group.bench_with_input(
             BenchmarkId::new("scalar", file),
-            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            &fs::read_to_string(format!("benches/texts/{file}.txt")).unwrap(),
             |b, content| b.iter(|| scalar(content)),
         );
     }
diff --git a/src/lib.rs b/src/lib.rs
index c8ec5b5..d15ac0b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -56,11 +56,16 @@
 )]
 #![no_std]
 
+#[cfg(test)]
+extern crate std;
+
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use grapheme::{GraphemeIndices, Graphemes};
 pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
 pub use tables::UNICODE_VERSION;
-pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
+pub use word::{UWordBoundIndices, UWordBounds};
+
+use crate::word::{UnicodeWordIndices, UnicodeWords};
 
 mod grapheme;
 mod sentence;
@@ -248,7 +253,7 @@ pub trait UnicodeSegmentation {
 
 impl UnicodeSegmentation for str {
     #[inline]
-    fn graphemes(&self, is_extended: bool) -> Graphemes {
+    fn graphemes(&self, is_extended: bool) -> Graphemes<'_> {
         grapheme::new_graphemes(self, is_extended)
     }
 
@@ -258,32 +263,32 @@ impl UnicodeSegmentation for str {
     }
 
     #[inline]
-    fn unicode_words(&self) -> UnicodeWords {
+    fn unicode_words(&self) -> UnicodeWords<'_> {
         word::new_unicode_words(self)
     }
 
     #[inline]
-    fn unicode_word_indices(&self) -> UnicodeWordIndices {
+    fn unicode_word_indices(&self) -> UnicodeWordIndices<'_> {
         word::new_unicode_word_indices(self)
     }
 
     #[inline]
-    fn split_word_bounds(&self) -> UWordBounds {
+    fn split_word_bounds(&self) -> UWordBounds<'_> {
         word::new_word_bounds(self)
     }
 
     #[inline]
-    fn split_word_bound_indices(&self) -> UWordBoundIndices {
+    fn split_word_bound_indices(&self) -> UWordBoundIndices<'_> {
         word::new_word_bound_indices(self)
     }
 
     #[inline]
-    fn unicode_sentences(&self) -> UnicodeSentences {
+    fn unicode_sentences(&self) -> UnicodeSentences<'_> {
         sentence::new_unicode_sentences(self)
     }
 
     #[inline]
-    fn split_sentence_bounds(&self) -> USentenceBounds {
+    fn split_sentence_bounds(&self) -> USentenceBounds<'_> {
         sentence::new_sentence_bounds(self)
     }
 
diff --git a/src/word.rs b/src/word.rs
index b2a85ae..1a46b39 100644
--- a/src/word.rs
+++ b/src/word.rs
@@ -27,26 +27,33 @@ use crate::tables::word::WordCat;
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Debug)]
 pub struct UnicodeWords<'a> {
-    inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
+    inner: WordsIter<'a>,
 }
 
 impl<'a> Iterator for UnicodeWords<'a> {
     type Item = &'a str;
 
-    #[inline]
-    fn next(&mut self) -> Option<&'a str> {
-        self.inner.next()
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.inner {
+            WordsIter::Ascii(i) => i.next(),
+            WordsIter::Unicode(i) => i.next(),
+        }
     }
 
-    #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
+        match &self.inner {
+            WordsIter::Ascii(i) => i.size_hint(),
+            WordsIter::Unicode(i) => i.size_hint(),
+        }
     }
 }
 
 impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
     #[inline]
-    fn next_back(&mut self) -> Option<&'a str> {
-        self.inner.next_back()
+    fn next_back(&mut self) -> Option<Self::Item> {
+        match &mut self.inner {
+            WordsIter::Ascii(i) => i.next_back(),
+            WordsIter::Unicode(i) => i.next_back(),
+        }
     }
 }
 
@@ -65,27 +72,33 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
 /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
 #[derive(Debug)]
 pub struct UnicodeWordIndices<'a> {
-    #[allow(clippy::type_complexity)]
-    inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
+    inner: IndicesIter<'a>,
 }
 
 impl<'a> Iterator for UnicodeWordIndices<'a> {
     type Item = (usize, &'a str);
 
-    #[inline]
-    fn next(&mut self) -> Option<(usize, &'a str)> {
-        self.inner.next()
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.inner {
+            IndicesIter::Ascii(i) => i.next(),
+            IndicesIter::Unicode(i) => i.next(),
+        }
     }
 
-    #[inline]
     fn size_hint(&self) -> (usize, Option<usize>) {
-        self.inner.size_hint()
+        match &self.inner {
+            IndicesIter::Ascii(i) => i.size_hint(),
+            IndicesIter::Unicode(i) => i.size_hint(),
+        }
     }
 }
 
 impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
     #[inline]
-    fn next_back(&mut self) -> Option<(usize, &'a str)> {
-        self.inner.next_back()
+    fn next_back(&mut self) -> Option<Self::Item> {
+        match &mut self.inner {
+            IndicesIter::Ascii(i) => i.next_back(),
+            IndicesIter::Unicode(i) => i.next_back(),
+        }
     }
 }
 
@@ -717,6 +730,246 @@ impl<'a> UWordBounds<'a> {
     }
 }
 
+/// ASCII-fast-path word-boundary iterator for strings that contain only
+/// ASCII characters.
+///
+/// Since we handle only ASCII characters, we can use a much simpler set of
+/// word break values than the full Unicode algorithm:
+/// <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
+///
+/// | Word_Break value | ASCII code points that belong to it              |
+/// | ---------------- | ------------------------------------------------ |
+/// | CR               | U+000D (CR)                                      |
+/// | LF               | U+000A (LF)                                      |
+/// | Newline          | U+000B (VT), U+000C (FF)                         |
+/// | Single_Quote     | U+0027 (')                                       |
+/// | Double_Quote     | U+0022 (")                                       |
+/// | MidNumLet        | U+002E (.) FULL STOP                             |
+/// | MidLetter        | U+003A (:) COLON                                 |
+/// | MidNum           | U+002C (,), U+003B (;)                           |
+/// | Numeric          | U+0030 – U+0039 (0 … 9)                          |
+/// | ALetter          | U+0041 – U+005A (A … Z), U+0061 – U+007A (a … z) |
+/// | ExtendNumLet     | U+005F (_) underscore                            |
+/// | WSegSpace        | U+0020 (SPACE)                                   |
+///
+/// The macro MidNumLetQ boils down to U+002E (.) FULL STOP and U+0027 (').
+/// AHLetter is the same as ALetter, so we don't need to distinguish it.
+///
+/// Any other single ASCII byte is its own boundary (the default rule WB999).
+#[derive(Debug)]
+struct AsciiWordBoundIter<'a> {
+    rest: &'a str,
+    offset: usize,
+}
+
+impl<'a> AsciiWordBoundIter<'a> {
+    pub fn new(s: &'a str) -> Self {
+        AsciiWordBoundIter { rest: s, offset: 0 }
+    }
+
+    #[inline]
+    fn is_core(b: u8) -> bool {
+        b.is_ascii_alphanumeric() || b == b'_'
+    }
+
+    #[inline]
+    fn is_infix(b: u8, prev: u8, next: u8) -> bool {
+        match b {
+            // Numeric separators such as "1,000" or "3.14" (WB11/WB12):
+            //
+            //     Numeric (MidNum | MidNumLetQ) Numeric
+            b'.' | b',' | b';' | b'\'' if prev.is_ascii_digit() && next.is_ascii_digit() => true,
+
+            // Dot, colon or apostrophe inside an alphabetic word, as in
+            // "e.g." or "o'clock" (WB6/WB7):
+            //
+            //     AHLetter (MidLetter | MidNumLetQ) AHLetter
+            //
+            // MidLetter  = b':'
+            // MidNumLetQ = b'.' | b'\''
+            b'\'' | b'.' | b':' if prev.is_ascii_alphabetic() && next.is_ascii_alphabetic() => {
+                true
+            }
+            _ => false,
+        }
+    }
+}
+
+impl<'a> Iterator for AsciiWordBoundIter<'a> {
+    type Item = (usize, &'a str);
+
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.rest.is_empty() {
+            return None;
+        }
+
+        let bytes = self.rest.as_bytes();
+        let len = bytes.len();
+
+        // 1) Keep horizontal whitespace together.
+        //    Spec: WB3d joins adjacent *WSegSpace* into a single segment.
+        if bytes[0] == b' ' {
+            let mut i = 1;
+            while i < len && bytes[i] == b' ' {
+                i += 1;
+            }
+            let word = &self.rest[..i];
+            let pos = self.offset;
+            self.rest = &self.rest[i..];
+            self.offset += i;
+            return Some((pos, word));
+        }
+
+        // 2) Core run (letters/digits/underscore + infix).
+        //    Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
+        if Self::is_core(bytes[0]) {
+            let mut i = 1;
+            while i < len {
+                let b = bytes[i];
+                if Self::is_core(b)
+                    || (i + 1 < len && Self::is_infix(b, bytes[i - 1], bytes[i + 1]))
+                {
+                    i += 1;
+                } else {
+                    break;
+                }
+            }
+            let word = &self.rest[..i];
+            let pos = self.offset;
+            self.rest = &self.rest[i..];
+            self.offset += i;
+            return Some((pos, word));
+        }
+
+        // 3) Do not break within CRLF.
+        //    Spec: WB3 treats CR+LF as a single non-breaking pair.
+        if bytes[0] == b'\r' && len >= 2 && bytes[1] == b'\n' {
+            let word = &self.rest[..2];
+            let pos = self.offset;
+            self.rest = &self.rest[2..];
+            self.offset += 2;
+            Some((pos, word))
+        } else {
+            // 4) Otherwise, break everywhere.
+            //    Spec: the catch-all rule WB999.
+            let word = &self.rest[..1];
+            let pos = self.offset;
+            self.rest = &self.rest[1..];
+            self.offset += 1;
+            Some((pos, word))
+        }
+    }
+}
+
+impl<'a> DoubleEndedIterator for AsciiWordBoundIter<'a> {
+    fn next_back(&mut self) -> Option<(usize, &'a str)> {
+        let rest = self.rest;
+        if rest.is_empty() {
+            return None;
+        }
+        let bytes = rest.as_bytes();
+        let len = bytes.len();
+
+        // 1) Group runs of spaces.
+        //    Spec: WB3d joins adjacent *WSegSpace* into a single segment.
+        if bytes[len - 1] == b' ' {
+            // Find the start of this trailing run of spaces.
+            let mut start = len - 1;
+            while start > 0 && bytes[start - 1] == b' ' {
+                start -= 1;
+            }
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 2) Trailing core run (letters/digits/underscore + infix).
+        //    Spec: ALetter × ALetter, Numeric × Numeric etc. (WB5–WB13b)
+        if Self::is_core(bytes[len - 1]) {
+            // Scan backwards as long as we see `is_core` or an `is_infix` byte.
+            let mut start = len - 1;
+            while start > 0 {
+                let b = bytes[start - 1];
+                let prev = if start >= 2 { bytes[start - 2] } else { b };
+                let next = bytes[start]; // the byte we just included
+                if Self::is_core(b) || Self::is_infix(b, prev, next) {
+                    start -= 1;
+                } else {
+                    break;
+                }
+            }
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 3) Non-core: CR+LF as one token, otherwise a single char.
+        //    Spec: WB3 treats CR+LF as a single non-breaking pair.
+        if len >= 2 && bytes[len - 2] == b'\r' && bytes[len - 1] == b'\n' {
+            let start = len - 2;
+            let word = &rest[start..];
+            let pos = self.offset + start;
+            self.rest = &rest[..start];
+            return Some((pos, word));
+        }
+
+        // 4) Fallback: every other byte is its own segment.
+        //    Spec: the catch-all rule WB999.
+        let start = len - 1;
+        let word = &rest[start..];
+        let pos = self.offset + start;
+        self.rest = &rest[..start];
+        Some((pos, word))
+    }
+}
+
+#[inline]
+fn ascii_word_ok(t: &(usize, &str)) -> bool {
+    has_ascii_alphanumeric(&t.1)
+}
+#[inline]
+fn unicode_word_ok(t: &(usize, &str)) -> bool {
+    has_alphanumeric(&t.1)
+}
+
+type AsciiWordsIter<'a> = Filter<
+    core::iter::Map<AsciiWordBoundIter<'a>, fn((usize, &'a str)) -> &'a str>,
+    fn(&&'a str) -> bool,
+>;
+type UnicodeWordsIter<'a> = Filter<UWordBounds<'a>, fn(&&'a str) -> bool>;
+type AsciiIndicesIter<'a> = Filter<AsciiWordBoundIter<'a>, fn(&(usize, &'a str)) -> bool>;
+type UnicodeIndicesIter<'a> = Filter<UWordBoundIndices<'a>, fn(&(usize, &'a str)) -> bool>;
+
+#[derive(Debug)]
+enum WordsIter<'a> {
+    Ascii(AsciiWordsIter<'a>),
+    Unicode(UnicodeWordsIter<'a>),
+}
+
+#[derive(Debug)]
+enum IndicesIter<'a> {
+    Ascii(AsciiIndicesIter<'a>),
+    Unicode(UnicodeIndicesIter<'a>),
+}
+
+#[inline]
+pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
+    let inner = if s.is_ascii() {
+        WordsIter::Ascii(new_unicode_words_ascii(s))
+    } else {
+        WordsIter::Unicode(new_unicode_words_general(s))
+    };
+    UnicodeWords { inner }
+}
+
+#[inline]
+pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
+    let inner = if s.is_ascii() {
+        IndicesIter::Ascii(new_ascii_word_bound_indices(s).filter(ascii_word_ok))
+    } else {
+        IndicesIter::Unicode(new_word_bound_indices(s).filter(unicode_word_ok))
+    };
+    UnicodeWordIndices { inner }
+}
+
 #[inline]
 pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
     UWordBounds {
@@ -734,6 +987,11 @@ pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
     }
 }
 
+#[inline]
+fn new_ascii_word_bound_indices(s: &str) -> AsciiWordBoundIter<'_> {
+    AsciiWordBoundIter::new(s)
+}
+
 #[inline]
 fn has_alphanumeric(s: &&str) -> bool {
     use crate::tables::util::is_alphanumeric;
@@ -742,27 +1000,38 @@ fn has_alphanumeric(s: &&str) -> bool {
 }
 
 #[inline]
-pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
-    use super::UnicodeSegmentation;
-
-    UnicodeWords {
-        inner: s.split_word_bounds().filter(has_alphanumeric),
-    }
+fn has_ascii_alphanumeric(s: &&str) -> bool {
+    s.chars().any(|c| c.is_ascii_alphanumeric())
+}
+
+#[inline(always)]
+fn strip_pos((_, w): (usize, &str)) -> &str {
+    w
 }
 
 #[inline]
-pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
-    use super::UnicodeSegmentation;
-
-    UnicodeWordIndices {
-        inner: s
-            .split_word_bound_indices()
-            .filter(|(_, c)| has_alphanumeric(c)),
-    }
+fn new_unicode_words_ascii<'a>(s: &'a str) -> AsciiWordsIter<'a> {
+    new_ascii_word_bound_indices(s)
+        .map(strip_pos as fn(_) -> _)
+        .filter(has_ascii_alphanumeric)
+}
+
+#[inline]
+fn new_unicode_words_general<'a>(s: &'a str) -> UnicodeWordsIter<'a> {
+    new_word_bounds(s).filter(has_alphanumeric)
 }
 
 #[cfg(test)]
 mod tests {
+    use crate::word::{
+        new_ascii_word_bound_indices, new_unicode_words_ascii, new_word_bound_indices,
+    };
+    use std::string::String;
+    use std::vec::Vec;
+    use std::{format, vec};
+
+    use proptest::prelude::*;
+
     #[test]
     fn test_syriac_abbr_mark() {
         use crate::tables::word as wd;
@@ -776,4 +1045,70 @@
         let (_, _, cat) = wd::word_category('\u{6dd}');
         assert_eq!(cat, wd::WC_Numeric);
     }
+
+    #[test]
+    fn test_ascii_word_bound_indices_various_cases() {
+        let s = "Hello, world!";
+        let words: Vec<(usize, &str)> = new_ascii_word_bound_indices(s).collect();
+        let expected = vec![
+            (0, "Hello"), // simple letters
+            (5, ","),
+            (6, " "),     // space after comma
+            (7, "world"), // stops at '!'
+            (12, "!"),    // punctuation at the end
+        ];
+        assert_eq!(words, expected);
+    }
+
+    #[test]
+    fn test_ascii_word_indices_various_cases() {
+        let s = "Hello, world! can't e.g. var1 123,456 foo_bar example.com 127.0.0.1:9090";
+        let words: Vec<&str> = new_unicode_words_ascii(s).collect();
+        let expected = vec![
+            "Hello",       // simple letters
+            "world",       // skip comma+space, stop at '!'
+            "can't",       // apostrophe joins letters
+            "e.g",         // trailing dot is not followed by a letter
+            "var1",
+            "123,456",     // digits+comma+digits
+            "foo_bar",
+            "example.com",
+            "127.0.0.1",
+            "9090",        // port number
+        ];
+        assert_eq!(words, expected);
+    }
+
+    /// Strategy that yields every code point from NUL (0) to DEL (127).
+    fn ascii_char() -> impl Strategy<Value = char> {
+        (0u8..=127).prop_map(|b| b as char)
+    }
+
+    proptest! {
+        #![proptest_config(ProptestConfig::with_cases(10000))]
+
+        /// Fast path must equal general path for any ASCII input.
+        #[test]
+        fn proptest_ascii_matches_unicode_word_indices(
+            // Vec<char> → String, length 0–99
+            s in proptest::collection::vec(ascii_char(), 0..100)
+                .prop_map(|v| v.into_iter().collect::<String>())
+        ) {
+            let fast: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).collect();
+            let uni: Vec<(usize, &str)> = new_word_bound_indices(&s).collect();
+
+            prop_assert_eq!(fast, uni);
+        }
+
+        /// Fast path must equal general path for any ASCII input, forwards and backwards.
+        #[test]
+        fn proptest_ascii_matches_unicode_word_indices_rev(
+            // Vec<char> → String, length 0–99
+            s in proptest::collection::vec(ascii_char(), 0..100)
+                .prop_map(|v| v.into_iter().collect::<String>())
+        ) {
+            let fast_rev: Vec<(usize, &str)> = new_ascii_word_bound_indices(&s).rev().collect();
+            let uni_rev: Vec<(usize, &str)> = new_word_bound_indices(&s).rev().collect();
+            prop_assert_eq!(fast_rev, uni_rev);
+        }
+    }
 }
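
Note (reviewer sketch, not part of the patch): the fast path is an internal dispatch only. `unicode_words` and `unicode_word_indices` keep their public signatures, and the proptests above check that both paths agree on all-ASCII input. A minimal illustration of the observable behavior, assuming the patch is applied; the input strings here are made up:

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        // All-ASCII input is routed through AsciiWordBoundIter internally.
        let ascii = "error: retrying endpoint 127.0.0.1:9090";
        let words: Vec<(usize, &str)> = ascii.unicode_word_indices().collect();
        assert_eq!(words[0], (0, "error"));

        // A single non-ASCII scalar sends the whole string down the general
        // UAX #29 path instead; callers cannot tell the difference.
        let mixed = "naïve path";
        assert_eq!(mixed.unicode_words().collect::<Vec<_>>(), ["naïve", "path"]);
    }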
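The `is_infix` rules can also be sanity-checked against the general algorithm through the existing public API, since `split_word_bounds` always takes the general UAX #29 path. A small sketch of the boundary behavior that `is_infix` mirrors (examples chosen for illustration, not taken from the test suite):

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        // WB11/WB12: a comma flanked by digits does not break a numeric run...
        assert_eq!("1,000".split_word_bounds().collect::<Vec<_>>(), ["1,000"]);
        // ...but a comma followed by a space does.
        assert_eq!(
            "1, 000".split_word_bounds().collect::<Vec<_>>(),
            ["1", ",", " ", "000"]
        );
        // WB6/WB7: ':' (MidLetter) and '.' (MidNumLetQ) join letters.
        assert_eq!(
            "a:b c.d".split_word_bounds().collect::<Vec<_>>(),
            ["a:b", " ", "c.d"]
        );
    }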