Skip to content

Commit 4d58f18

Browse files
committed
Update grapheme segmentation to Unicode 11
1 parent 504ba99 commit 4d58f18

File tree

2 files changed

+14
-11
lines changed

2 files changed

+14
-11
lines changed

src/grapheme.rs

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,8 @@ enum GraphemeState {
147147
// The codepoint after is a Regional Indicator Symbol, so a boundary iff
148148
// it is preceded by an even number of RIS codepoints. (GB12, GB13)
149149
Regional,
150-
// The codepoint after is in the E_Modifier category, so whether it's a boundary
151-
// depends on pre-context according to GB10.
150+
// The codepoint after is Extended_Pictographic,
151+
// so whether it's a boundary depends on pre-context according to GB11.
152152
Emoji,
153153
}
154154

@@ -239,11 +239,7 @@ fn check_pair(before: GraphemeCat, after: GraphemeCat) -> PairResult {
239239
(_, GC_ZWJ) => NotBreak, // GB9
240240
(_, GC_SpacingMark) => Extended, // GB9a
241241
(GC_Prepend, _) => Extended, // GB9b
242-
(GC_E_Base, GC_E_Modifier) => NotBreak, // GB10
243-
(GC_E_Base_GAZ, GC_E_Modifier) => NotBreak, // GB10
244-
(GC_Extend, GC_E_Modifier) => Emoji, // GB10
245-
(GC_ZWJ, GC_Glue_After_Zwj) => NotBreak, // GB11
246-
(GC_ZWJ, GC_E_Base_GAZ) => NotBreak, // GB11
242+
(GC_ZWJ, GC_Extended_Pictographic) => Emoji, // GB11
247243
(GC_Regional_Indicator, GC_Regional_Indicator) => Regional, // GB12, GB13
248244
(_, _) => Break, // GB999
249245
}
@@ -415,10 +411,17 @@ impl GraphemeCursor {
415411

416412
fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
417413
use tables::grapheme as gr;
418-
for ch in chunk.chars().rev() {
414+
let mut iter = chunk.chars().rev();
415+
if let Some(ch) = iter.next() {
416+
if gr::grapheme_category(ch) != gr::GC_ZWJ {
417+
self.decide(true);
418+
return;
419+
}
420+
}
421+
for ch in iter {
419422
match gr::grapheme_category(ch) {
420423
gr::GC_Extend => (),
421-
gr::GC_E_Base | gr::GC_E_Base_GAZ => {
424+
gr::GC_Extended_Pictographic => {
422425
self.decide(false);
423426
return;
424427
}
@@ -484,7 +487,7 @@ impl GraphemeCursor {
484487
let mut need_pre_context = true;
485488
match self.cat_after.unwrap() {
486489
gr::GC_Regional_Indicator => self.state = GraphemeState::Regional,
487-
gr::GC_E_Modifier => self.state = GraphemeState::Emoji,
490+
gr::GC_Extended_Pictographic => self.state = GraphemeState::Emoji,
488491
_ => need_pre_context = self.cat_before.is_none(),
489492
}
490493
if need_pre_context {

src/test.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ fn test_graphemes() {
3737
// cartwheel emoji followed by two Fitzpatrick skin tone modifiers
3838
// (test case from issue #19)
3939
("\u{1F938}\u{1F3FE}\u{1F3FE}",
40-
&["\u{1F938}\u{1F3FE}", "\u{1F3FE}"]),
40+
&["\u{1F938}\u{1F3FE}\u{1F3FE}"]),
4141
];
4242

4343
for &(s, g) in TEST_SAME.iter().chain(EXTRA_SAME) {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy