diff --git a/scripts/unicode.py b/scripts/unicode.py index 18e71d0..e3be355 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -20,6 +20,7 @@ # - ReadMe.txt # - Scripts.txt # - UnicodeData.txt +# - auxiliary/GraphemeBreakProperty.txt # - emoji/emoji-data.txt # - emoji/emoji-variation-sequences.txt # - extracted/DerivedGeneralCategory.txt @@ -526,6 +527,21 @@ def load_zero_widths() -> list[bool]: zw_map[0x0891] = True zw_map[0x08E2] = True + # `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]` + gcb_prepend = set() + load_property( + "auxiliary/GraphemeBreakProperty.txt", + "Prepend", + lambda cp: gcb_prepend.add(cp), + ) + load_property( + "PropList.txt", + "Prepended_Concatenation_Mark", + lambda cp: gcb_prepend.remove(cp), + ) + for cp in gcb_prepend: + zw_map[cp] = True + # HANGUL CHOSEONG FILLER # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have # zero width. However, the expected usage is to combine it with vowel or trailing jamo diff --git a/src/lib.rs b/src/lib.rs index d83a6c8..4297e11 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,7 +73,7 @@ //! - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` ( ya, `ᨕᨗ‍ᨐ`) has total width 1. //! - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1. //! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in -//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'` +//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'` //! have width 0. //! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'` //! 
followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ` @@ -113,6 +113,8 @@ //! - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890), //! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and //! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2). +//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Cluster_Break%3DPrepend%7D-%5Cp%7BPrepended_Concatenation_Mark%7D) +//! with the [`Grapheme_Cluster_Break=Prepend`] property that are not also [`Prepended_Concatenation_Mark`]s. //! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA). //! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) //! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2. @@ -132,6 +134,7 @@ //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1 //! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation //! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142 +//! [`Grapheme_Cluster_Break=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend //! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443 //! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593 //! 
[`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862 diff --git a/src/tables.rs b/src/tables.rs index f7a7a86..c8a4aba 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1162,7 +1162,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], [ 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, - 0x44, 0x01, 0x54, 0x55, 0x51, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x44, 0x01, 0x54, 0x55, 0x41, 0x55, 0x15, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -1532,7 +1532,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], [ 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00, - 0x40, 0x55, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x40, 0x05, 0x55, 0x01, 0x14, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -1587,7 +1587,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], [ 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x15, - 0x44, 0x15, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x04, 0x11, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -1596,12 +1596,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ 0x55, 0x55, ], [ - 0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x14, + 0x01, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15, 0x00, 0x04, 0x40, 0x55, 0x15, 0x55, 0x55, 0x01, 0x40, 0x01, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ - 0x55, 0x55, 0x05, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x00, 0x00, 0x00, 0x00, 0x40, 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 
0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], @@ -1617,7 +1617,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], [ 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x01, 0x40, 0x45, - 0x10, 0x00, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x10, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -1631,7 +1631,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ 0x55, 0x55, ], [ - 0x50, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40, + 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x40, 0x55, 0x44, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], @@ -1994,7 +1994,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ /// Sorted list of codepoint ranges (inclusive) /// that are zero-width but not `Joining_Type=Transparent` /// FIXME: can we get better compression? 
-static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [ +static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 53] = [ ([0x05, 0x06, 0x00], [0x05, 0x06, 0x00]), ([0x90, 0x08, 0x00], [0x91, 0x08, 0x00]), ([0xE2, 0x08, 0x00], [0xE2, 0x08, 0x00]), @@ -2010,6 +2010,7 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [ ([0xCA, 0x0C, 0x00], [0xCB, 0x0C, 0x00]), ([0xD5, 0x0C, 0x00], [0xD6, 0x0C, 0x00]), ([0x3E, 0x0D, 0x00], [0x3E, 0x0D, 0x00]), + ([0x4E, 0x0D, 0x00], [0x4E, 0x0D, 0x00]), ([0x57, 0x0D, 0x00], [0x57, 0x0D, 0x00]), ([0xCF, 0x0D, 0x00], [0xCF, 0x0D, 0x00]), ([0xDF, 0x0D, 0x00], [0xDF, 0x0D, 0x00]), @@ -2028,12 +2029,19 @@ static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); 45] = [ ([0xCB, 0xD7, 0x00], [0xFB, 0xD7, 0x00]), ([0x9E, 0xFF, 0x00], [0xA0, 0xFF, 0x00]), ([0xF0, 0xFF, 0x00], [0xF8, 0xFF, 0x00]), + ([0xC2, 0x11, 0x01], [0xC3, 0x11, 0x01]), ([0x3E, 0x13, 0x01], [0x3E, 0x13, 0x01]), ([0x57, 0x13, 0x01], [0x57, 0x13, 0x01]), ([0xB0, 0x14, 0x01], [0xB0, 0x14, 0x01]), ([0xBD, 0x14, 0x01], [0xBD, 0x14, 0x01]), ([0xAF, 0x15, 0x01], [0xAF, 0x15, 0x01]), ([0x30, 0x19, 0x01], [0x30, 0x19, 0x01]), + ([0x3F, 0x19, 0x01], [0x3F, 0x19, 0x01]), + ([0x41, 0x19, 0x01], [0x41, 0x19, 0x01]), + ([0x3A, 0x1A, 0x01], [0x3A, 0x1A, 0x01]), + ([0x84, 0x1A, 0x01], [0x89, 0x1A, 0x01]), + ([0x46, 0x1D, 0x01], [0x46, 0x1D, 0x01]), + ([0x02, 0x1F, 0x01], [0x02, 0x1F, 0x01]), ([0x65, 0xD1, 0x01], [0x65, 0xD1, 0x01]), ([0x6E, 0xD1, 0x01], [0x72, 0xD1, 0x01]), ([0x00, 0x00, 0x0E], [0x00, 0x00, 0x0E]), diff --git a/tests/tests.rs b/tests/tests.rs index 4f713e7..8ff0c6b 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -110,6 +110,12 @@ fn test_prepended_concatenation_marks() { } } +#[test] +fn test_gcb_prepend() { + assert_width!("ൎഉ", 1, 1); + assert_width!("\u{11A89}", 0, 0); +} + #[test] fn test_interlinear_annotation_chars() { assert_width!('\u{FFF9}', Some(1), Some(1)); pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy