Skip to content

Commit 8e40640

Browse files
Support Grapheme_Cluster_Break=Prepend (#62)
These characters act like combining marks, except they go before the base character instead of after it.
1 parent afab363 commit 8e40640

File tree

4 files changed

+42
-9
lines changed

4 files changed

+42
-9
lines changed

scripts/unicode.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# - ReadMe.txt
2121
# - Scripts.txt
2222
# - UnicodeData.txt
23+
# - auxiliary/GraphemeBreakProperty.txt
2324
# - emoji/emoji-data.txt
2425
# - emoji/emoji-variation-sequences.txt
2526
# - extracted/DerivedGeneralCategory.txt
@@ -526,6 +527,21 @@ def load_zero_widths() -> list[bool]:
526527
zw_map[0x0891] = True
527528
zw_map[0x08E2] = True
528529

530+
# `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]`
531+
gcb_prepend = set()
532+
load_property(
533+
"auxiliary/GraphemeBreakProperty.txt",
534+
"Prepend",
535+
lambda cp: gcb_prepend.add(cp),
536+
)
537+
load_property(
538+
"PropList.txt",
539+
"Prepended_Concatenation_Mark",
540+
lambda cp: gcb_prepend.remove(cp),
541+
)
542+
for cp in gcb_prepend:
543+
zw_map[cp] = True
544+
529545
# HANGUL CHOSEONG FILLER
530546
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
531547
# zero width. However, the expected usage is to combine it with vowel or trailing jamo

src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
//! - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
7474
//! - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
7575
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
76-
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
76+
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
7777
//! have width 0.
7878
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
7979
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
@@ -113,6 +113,8 @@
113113
//! - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890),
114114
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
115115
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
116+
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Cluster_Break%3DPrepend%7D-%5Cp%7BPrepended_Concatenation_Mark%7D)
117+
//! with the [`Grapheme_Cluster_Break=Prepend`](https://www.unicode.org/reports/tr29/#Prepend) property that are not also [`Prepended_Concatenation_Mark`]s.
116118
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
117119
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
118120
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
@@ -132,6 +134,7 @@
132134
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
133135
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
134136
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
137+
//! [`Grapheme_Extend=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
135138
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
136139
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
137140
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862

src/tables.rs

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1162,7 +1162,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
11621162
],
11631163
[
11641164
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15,
1165-
0x44, 0x01, 0x54, 0x55, 0x51, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
1165+
0x44, 0x01, 0x54, 0x55, 0x41, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55,
11661166
0x55, 0x55,
11671167
],
11681168
[
@@ -1532,7 +1532,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
15321532
],
15331533
[
15341534
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00,
1535-
0x40, 0x55, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1535+
0x40, 0x05, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
15361536
0x55, 0x55,
15371537
],
15381538
[
@@ -1587,7 +1587,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
15871587
],
15881588
[
15891589
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x15,
1590-
0x44, 0x15, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1590+
0x04, 0x11, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
15911591
0x55, 0x55,
15921592
],
15931593
[
@@ -1596,12 +1596,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
15961596
0x55, 0x55,
15971597
],
15981598
[
1599-
0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x14,
1599+
0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x04,
16001600
0x40, 0x55, 0x15, 0x55, 0x55, 0x01, 0x40, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16011601
0x55, 0x55,
16021602
],
16031603
[
1604-
0x55, 0x55, 0x05, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1604+
0x55, 0x00, 0x00, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16051605
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16061606
0x55, 0x55,
16071607
],
@@ -1617,7 +1617,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
16171617
],
16181618
[
16191619
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x40, 0x45,
1620-
0x10, 0x00, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1620+
0x10, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16211621
0x55, 0x55,
16221622
],
16231623
[
@@ -1631,7 +1631,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
16311631
0x55, 0x55,
16321632
],
16331633
[
1634-
0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
1634+
0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40,
16351635
0x55, 0x44, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
16361636
0x55, 0x55,
16371637
],
@@ -1994,7 +1994,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
19941994
/// Sorted list of codepoint ranges (inclusive)
19951995
/// that are zero-width but not `Joining_Type=Transparent`
19961996
/// FIXME: can we get better compression?
1997-
static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
1997+
static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 53] = [
19981998
([0x05, 0x06, 0x00], [0x05, 0x06, 0x00]),
19991999
([0x90, 0x08, 0x00], [0x91, 0x08, 0x00]),
20002000
([0xE2, 0x08, 0x00], [0xE2, 0x08, 0x00]),
@@ -2010,6 +2010,7 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
20102010
([0xCA, 0x0C, 0x00], [0xCB, 0x0C, 0x00]),
20112011
([0xD5, 0x0C, 0x00], [0xD6, 0x0C, 0x00]),
20122012
([0x3E, 0x0D, 0x00], [0x3E, 0x0D, 0x00]),
2013+
([0x4E, 0x0D, 0x00], [0x4E, 0x0D, 0x00]),
20132014
([0x57, 0x0D, 0x00], [0x57, 0x0D, 0x00]),
20142015
([0xCF, 0x0D, 0x00], [0xCF, 0x0D, 0x00]),
20152016
([0xDF, 0x0D, 0x00], [0xDF, 0x0D, 0x00]),
@@ -2028,12 +2029,19 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [
20282029
([0xCB, 0xD7, 0x00], [0xFB, 0xD7, 0x00]),
20292030
([0x9E, 0xFF, 0x00], [0xA0, 0xFF, 0x00]),
20302031
([0xF0, 0xFF, 0x00], [0xF8, 0xFF, 0x00]),
2032+
([0xC2, 0x11, 0x01], [0xC3, 0x11, 0x01]),
20312033
([0x3E, 0x13, 0x01], [0x3E, 0x13, 0x01]),
20322034
([0x57, 0x13, 0x01], [0x57, 0x13, 0x01]),
20332035
([0xB0, 0x14, 0x01], [0xB0, 0x14, 0x01]),
20342036
([0xBD, 0x14, 0x01], [0xBD, 0x14, 0x01]),
20352037
([0xAF, 0x15, 0x01], [0xAF, 0x15, 0x01]),
20362038
([0x30, 0x19, 0x01], [0x30, 0x19, 0x01]),
2039+
([0x3F, 0x19, 0x01], [0x3F, 0x19, 0x01]),
2040+
([0x41, 0x19, 0x01], [0x41, 0x19, 0x01]),
2041+
([0x3A, 0x1A, 0x01], [0x3A, 0x1A, 0x01]),
2042+
([0x84, 0x1A, 0x01], [0x89, 0x1A, 0x01]),
2043+
([0x46, 0x1D, 0x01], [0x46, 0x1D, 0x01]),
2044+
([0x02, 0x1F, 0x01], [0x02, 0x1F, 0x01]),
20372045
([0x65, 0xD1, 0x01], [0x65, 0xD1, 0x01]),
20382046
([0x6E, 0xD1, 0x01], [0x72, 0xD1, 0x01]),
20392047
([0x00, 0x00, 0x0E], [0x00, 0x00, 0x0E]),

tests/tests.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,12 @@ fn test_prepended_concatenation_marks() {
110110
}
111111
}
112112

113+
#[test]
114+
fn test_gcb_prepend() {
115+
assert_width!("ൎഉ", 1, 1);
116+
assert_width!("\u{11A89}", 0, 0);
117+
}
118+
113119
#[test]
114120
fn test_interlinear_annotation_chars() {
115121
assert_width!('\u{FFF9}', Some(1), Some(1));

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy