Skip to content

Commit cea3ce6

Browse files
authored
Merge pull request #91 from basile-henry/basile/unicode-word-indices
Add unicode_word_indices
2 parents 247c0b1 + 8bd6e3a commit cea3ce6

File tree

2 files changed

+72
-5
lines changed

2 files changed

+72
-5
lines changed

src/lib.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ extern crate quickcheck;
6666
pub use grapheme::{Graphemes, GraphemeIndices};
6767
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
6868
pub use tables::UNICODE_VERSION;
69-
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
69+
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords, UnicodeWordIndices};
7070
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
7171

7272
mod grapheme;
@@ -146,6 +146,30 @@ pub trait UnicodeSegmentation {
146146
/// ```
147147
fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
148148

149+
/// Returns an iterator over the words of `self`, separated on
150+
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
151+
/// offsets.
152+
///
153+
/// Here, "words" are just those substrings which, after splitting on
154+
/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
155+
/// substring must contain at least one character with the
156+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
157+
/// property, or with
158+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
159+
///
160+
/// # Example
161+
///
162+
/// ```
163+
/// # use self::unicode_segmentation::UnicodeSegmentation;
164+
/// let uwis = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
165+
/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
166+
/// let b: &[_] = &[(0, "The"), (4, "quick"), (12, "brown"), (20, "fox"), (24, "can't"),
167+
/// (30, "jump"), (35, "32.3"), (40, "feet"), (46, "right")];
168+
///
169+
/// assert_eq!(&uwi1[..], b);
170+
/// ```
171+
fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
172+
149173
/// Returns an iterator over substrings of `self` separated on
150174
/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
151175
///
@@ -249,6 +273,11 @@ impl UnicodeSegmentation for str {
249273
word::new_unicode_words(self)
250274
}
251275

276+
#[inline]
277+
fn unicode_word_indices(&self) -> UnicodeWordIndices {
278+
word::new_unicode_word_indices(self)
279+
}
280+
252281
#[inline]
253282
fn split_word_bounds(&self) -> UWordBounds {
254283
word::new_word_bounds(self)

src/word.rs

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,34 @@ impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
4040
fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
4141
}
4242

43+
/// An iterator over the substrings of a string which, after splitting the string on
44+
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
45+
/// contain any characters with the
46+
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
47+
/// property, or with
48+
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
49+
/// This iterator also provides the byte offsets for each substring.
50+
///
51+
/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
52+
/// its documentation for more.
53+
///
54+
/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
55+
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
56+
pub struct UnicodeWordIndices<'a> {
57+
inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
58+
}
59+
60+
impl<'a> Iterator for UnicodeWordIndices<'a> {
61+
type Item = (usize, &'a str);
62+
63+
#[inline]
64+
fn next(&mut self) -> Option<(usize, &'a str)> { self.inner.next() }
65+
}
66+
impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
67+
#[inline]
68+
fn next_back(&mut self) -> Option<(usize, &'a str)> { self.inner.next_back() }
69+
}
70+
4371
/// External iterator for a string's
4472
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
4573
///
@@ -671,12 +699,22 @@ pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
671699
}
672700

673701
#[inline]
674-
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
675-
use super::UnicodeSegmentation;
702+
fn has_alphanumeric(s: &&str) -> bool {
676703
use tables::util::is_alphanumeric;
677704

678-
fn has_alphanumeric(s: &&str) -> bool { s.chars().any(|c| is_alphanumeric(c)) }
679-
let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
705+
s.chars().any(|c| is_alphanumeric(c))
706+
}
707+
708+
#[inline]
709+
pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
710+
use super::UnicodeSegmentation;
680711

681712
UnicodeWords { inner: s.split_word_bounds().filter(has_alphanumeric) }
682713
}
714+
715+
#[inline]
716+
pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
717+
use super::UnicodeSegmentation;
718+
719+
UnicodeWordIndices { inner: s.split_word_bound_indices().filter(|(_, c)| has_alphanumeric(c)) }
720+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy