Skip to content

Commit 98c9457

Browse files
committed
Update emoji rules in word boundaries to Unicode 11
1 parent 4d58f18 commit 98c9457

File tree

1 file changed

+19
-29
lines changed

1 file changed

+19
-29
lines changed

src/word.rs

Lines changed: 19 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,11 @@ enum RegionalState {
122122
Unknown,
123123
}
124124

125+
fn is_emoji(ch: char) -> bool {
126+
use tables::emoji;
127+
emoji::emoji_category(ch) == emoji::EmojiCat::EC_Extended_Pictographic
128+
}
129+
125130
impl<'a> Iterator for UWordBounds<'a> {
126131
type Item = &'a str;
127132

@@ -182,26 +187,18 @@ impl<'a> Iterator for UWordBounds<'a> {
182187
// WB4 makes all ZWJs collapse into the previous state
183188
// but you can still be in a Zwj state if you started with Zwj
184189
//
185-
// This means that Zwj + Extend will collapse into Zwj, which is wrong,
186-
// since Extend has a boundary with following EBG/GAZ chars but ZWJ doesn't,
187-
// and that rule (WB3c) has higher priority
188-
//
189-
// Additionally, Emoji_Base+ZWJ+(EBG/GAZ) will collapse into Emoji_Base+EBG/GAZ
190-
// which won't have a boundary even though EB+ZWJ+GAZ should have a boundary.
190+
// This means that an EP + Zwj will collapse into EP, which is wrong,
191+
// since EP+EP is a boundary but EP+ZWJ+EP is not
191192
//
192193
// Thus, we separately keep track of whether or not the last character
193194
// was a ZWJ. This is an additional bit of state tracked outside of the
194195
// state enum; the state enum represents the last non-zwj state encountered.
195196
// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
196197
// however we are in the previous state for the purposes of all other rules.
197198
if prev_zwj {
198-
match cat {
199-
wd::WC_Glue_After_Zwj => continue,
200-
wd::WC_E_Base_GAZ => {
201-
state = Emoji;
202-
continue;
203-
},
204-
_ => ()
199+
if is_emoji(ch) {
200+
state = Emoji;
201+
continue;
205202
}
206203
}
207204
// Don't use `continue` in this match without updating `cat`
@@ -222,7 +219,6 @@ impl<'a> Iterator for UWordBounds<'a> {
222219
wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
223220
wd::WC_LF | wd::WC_Newline => break, // rule WB3a
224221
wd::WC_ZWJ => Zwj, // rule WB3c
225-
wd::WC_E_Base | wd::WC_E_Base_GAZ => Emoji, // rule WB14
226222
_ => {
227223
if let Some(ncat) = self.get_next_cat(idx) { // rule WB4
228224
if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ {
@@ -235,9 +231,7 @@ impl<'a> Iterator for UWordBounds<'a> {
235231
}
236232
},
237233
Zwj => {
238-
// We already handle WB3c above. At this point,
239-
// the current category is not GAZ or EBG,
240-
// or the previous character was not actually a ZWJ
234+
// We already handle WB3c above.
241235
take_curr = false;
242236
break;
243237
}
@@ -313,12 +307,10 @@ impl<'a> Iterator for UWordBounds<'a> {
313307
}
314308
},
315309
Regional(_) => unreachable!("RegionalState::Unknown should not occur on forward iteration"),
316-
Emoji => match cat { // rule WB14
317-
wd::WC_E_Modifier => state,
318-
_ => {
319-
take_curr = false;
320-
break;
321-
}
310+
Emoji => {
311+
// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
312+
take_curr = false;
313+
break;
322314
},
323315
FormatExtend(t) => match t { // handle FormatExtends depending on what type
324316
RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
@@ -422,20 +414,19 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
422414
// Don't use `continue` in this match without updating `catb`
423415
state = match state {
424416
Start | FormatExtend(AcceptAny) => match cat {
417+
_ if is_emoji(ch) => Zwj,
425418
wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
426419
wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
427420
wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
428421
wd::WC_Katakana => Katakana, // rule WB13, WB13b
429422
wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
430423
wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
431-
wd::WC_Glue_After_Zwj | wd::WC_E_Base_GAZ => Zwj, // rule WB3c
432424
// rule WB4:
433425
wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny),
434426
wd::WC_Single_Quote => {
435427
saveidx = idx;
436428
FormatExtend(AcceptQLetter) // rule WB7a
437429
},
438-
wd::WC_E_Modifier => Emoji, // rule WB14
439430
wd::WC_CR | wd::WC_LF | wd::WC_Newline => {
440431
if state == Start {
441432
if cat == wd::WC_LF {
@@ -539,11 +530,10 @@ impl<'a> DoubleEndedIterator for UWordBounds<'a> {
539530
break;
540531
}
541532
},
542-
Emoji => match cat { // rule WB14
543-
wd::WC_E_Base | wd::WC_E_Base_GAZ => {
533+
Emoji => {
534+
if is_emoji(ch) { // rule WB3c
544535
Zwj
545-
},
546-
_ => {
536+
} else {
547537
take_curr = false;
548538
break;
549539
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy