From 8bd6e3a2d66e32fc1cb3a351cbc3b6a6a0535734 Mon Sep 17 00:00:00 2001 From: Basile Henry Date: Sun, 7 Mar 2021 19:18:37 +0100 Subject: [PATCH] Add unicode_word_indices The iterator UnicodeWordIndices is similar to UnicodeWord but also provides byte offsets for each word --- src/lib.rs | 31 ++++++++++++++++++++++++++++++- src/word.rs | 46 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 72 insertions(+), 5 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b0ed2d1..ed74f8f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -66,7 +66,7 @@ extern crate quickcheck; pub use grapheme::{Graphemes, GraphemeIndices}; pub use grapheme::{GraphemeCursor, GraphemeIncomplete}; pub use tables::UNICODE_VERSION; -pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords}; +pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices}; pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences}; mod grapheme; @@ -146,6 +146,30 @@ pub trait UnicodeSegmentation { /// ``` fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>; + /// Returns an iterator over the words of `self`, separated on + /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their + /// offsets. + /// + /// Here, "words" are just those substrings which, after splitting on + /// UAX#29 word boundaries, contain any alphanumeric characters. That is, the + /// substring must contain at least one character with the + /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) + /// property, or with + /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). 
+ /// + /// # Example + /// + /// ``` + /// # use self::unicode_segmentation::UnicodeSegmentation; + /// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?"; + /// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>(); + /// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"), + /// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")]; + /// + /// assert_eq!(&uwi1[..], b); + /// ``` + fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>; + /// Returns an iterator over substrings of `self` separated on /// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). /// @@ -249,6 +273,11 @@ impl UnicodeSegmentation for str { word::new_unicode_words(self) } + #[inline] + fn unicode_word_indices(&self) -> UnicodeWordIndices { + word::new_unicode_word_indices(self) + } + #[inline] fn split_word_bounds(&self) -> UWordBounds { word::new_word_bounds(self) diff --git a/src/word.rs b/src/word.rs index 179d122..b9bd956 100644 --- a/src/word.rs +++ b/src/word.rs @@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> { fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() } } +/// An iterator over the substrings of a string which, after splitting the string on +/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), +/// contain any characters with the +/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) +/// property, or with +/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). +/// This iterator also provides the byte offsets for each substring. +/// +/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See +/// its documentation for more. 
+/// +/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices +/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html +pub struct UnicodeWordIndices<'a> { + inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>, +} + +impl<'a> Iterator for UnicodeWordIndices<'a> { + type Item = (usize, &'a str); + + #[inline] + fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() } +} +impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { + #[inline] + fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() } +} + /// External iterator for a string's /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). /// @@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> { } #[inline] -pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> { - use super::UnicodeSegmentation; +fn has_alphanumeric(s: &&str) -> bool { use tables::util::is_alphanumeric; - fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) } - let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer + s.chars().any(|c| is_alphanumeric(c)) +} + +#[inline] +pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> { + use super::UnicodeSegmentation; UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) } } + +#[inline] +pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> { + use super::UnicodeSegmentation; + + UnicodeWordIndices { inner: s.split_word_bound_indices().filter(|(_, c)| has_alphanumeric(c)) } +} pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy