Skip to content

Commit 93b7dff

Browse files
committed
Add WSegSpace support in word boundaries from Unicode 11
1 parent 7d5cc07 commit 93b7dff

File tree

2 files changed

+27
-2
lines changed

2 files changed

+27
-2
lines changed

src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
//!
3030
//! let s = "The quick (\"brown\")  fox";
3131
//! let w = s.split_word_bounds().collect::<Vec<&str>>();
32-
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
32+
//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
3333
//! assert_eq!(w, b);
3434
//! }
3535
//! ```
@@ -156,7 +156,7 @@ pub trait UnicodeSegmentation {
156156
/// ```
157157
/// # use self::unicode_segmentation::UnicodeSegmentation;
158158
/// let swu1 = "The quick (\"brown\")  fox".split_word_bounds().collect::<Vec<&str>>();
159-
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"];
159+
/// let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", "  ", "fox"];
160160
///
161161
/// assert_eq!(&swu1[..], b);
162162
/// ```

src/word.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ enum UWordBoundsState {
102102
FormatExtend(FormatExtendType),
103103
Zwj,
104104
Emoji,
105+
WSegSpace,
105106
}
106107

107108
// subtypes for FormatExtend state in UWordBoundsState
@@ -156,6 +157,8 @@ impl<'a> Iterator for UWordBounds<'a> {
156157
// Whether or not the previous category was ZWJ
157158
// ZWJs get collapsed, so this handles precedence of WB3c over WB4
158159
let mut prev_zwj;
160+
// Whether any Extend/Format/ZWJ characters were skipped. Handles precedence of WB3d over WB4
161+
let mut skipped_format_extend = false;
159162
for (curr, ch) in self.string.char_indices() {
160163
idx = curr;
161164
prev_zwj = cat == wd::WC_ZWJ;
@@ -177,6 +180,7 @@ impl<'a> Iterator for UWordBounds<'a> {
177180
if state != Start {
178181
match cat {
179182
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => {
183+
skipped_format_extend = true;
180184
continue
181185
}
182186
_ => {}
@@ -219,6 +223,7 @@ impl<'a> Iterator for UWordBounds<'a> {
219223
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
220224
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
221225
wd::WC_ZWJ => Zwj, // rule WB3c
226+
wd::WC_WSegSpace => WSegSpace, // rule WB3d
222227
_ => {
223228
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
224229
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
@@ -230,6 +235,13 @@ impl<'a> Iterator for UWordBounds<'a> {
230235
break; // rule WB999
231236
}
232237
},
238+
WSegSpace => match cat {
239+
wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
240+
_ => {
241+
take_curr = false;
242+
break;
243+
}
244+
},
233245
Zwj => {
234246
// We already handle WB3c above.
235247
take_curr = false;
@@ -371,6 +383,8 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
371383
let mut savestate = Start;
372384
let mut cat = wd::WC_Any;
373385

386+
let mut skipped_format_extend = false;
387+
374388
for (curr, ch) in self.string.char_indices().rev() {
375389
previdx = idx;
376390
idx = curr;
@@ -409,6 +423,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
409423
state = savestate;
410424
previdx = saveidx;
411425
take_cat = false;
426+
skipped_format_extend = true;
412427
}
413428

414429
// Don't use `continue` in this match without updating `catb`
@@ -427,6 +442,7 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
427442
saveidx = idx;
428443
FormatExtend(AcceptQLetter) // rule WB7a
429444
},
445+
wd::WC_WSegSpace => WSegSpace,
430446
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
431447
if state == Start {
432448
if cat == wd::WC_LF {
@@ -451,6 +467,15 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
451467
break;
452468
}
453469
},
470+
WSegSpace => match cat { // rule WB3d
471+
wd::WC_WSegSpace if !skipped_format_extend => {
472+
WSegSpace
473+
}
474+
_ => {
475+
take_curr = false;
476+
break;
477+
}
478+
},
454479
Letter | HLetter => match cat {
455480
wd::WC_ALetter => Letter, // rule WB5
456481
wd::WC_Hebrew_Letter => HLetter, // rule WB5

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy