Skip to content

Commit 6ab41d7

Browse files
Unicode 16: Initial support
Includes Kirat Rai normalization behavior.
1 parent 82d7136 commit 6ab41d7

File tree

3 files changed

+1116
-121
lines changed

3 files changed

+1116
-121
lines changed

scripts/unicode.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
from itertools import batched
4444
from typing import Callable, Iterable
4545

46-
UNICODE_VERSION = "15.1.0"
46+
UNICODE_VERSION = "16.0.0"
4747
"""The version of the Unicode data files to download."""
4848

4949
NUM_CODEPOINTS = 0x110000
@@ -264,6 +264,12 @@ class WidthState(enum.IntEnum):
264264
TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110
265265
"(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`"
266266

267+
# Kirat Rai
268+
KIRAT_RAI_VOWEL_SIGN_E = 0b0000_0000_0010_0000
269+
"\\u16D67 (\\u16D67 \\u16D67)+ and canonical equivalents"
270+
KIRAT_RAI_VOWEL_SIGN_AI = 0b0000_0000_0010_0001
271+
"(\\u16D68)+ and canonical equivalents"
272+
267273
# VARIATION SELECTORS
268274

269275
# Text presentation sequences (not CJK)
@@ -639,6 +645,8 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
639645
([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU),
640646
([0xFE0F], WidthState.VARIATION_SELECTOR_16),
641647
([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I),
648+
([0x16D67], WidthState.KIRAT_RAI_VOWEL_SIGN_E),
649+
([0x16D68], WidthState.KIRAT_RAI_VOWEL_SIGN_AI),
642650
(emoji_presentation, WidthState.EMOJI_PRESENTATION),
643651
(emoji_modifiers, WidthState.EMOJI_MODIFIER),
644652
(regional_indicators, WidthState.REGIONAL_INDICATOR),
@@ -1496,6 +1504,22 @@ def lookup_fns(
14961504
return (0, WidthInfo::EMOJI_PRESENTATION)
14971505
}}
14981506
1507+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D63}}') => {{
1508+
return (0, WidthInfo::DEFAULT);
1509+
}}
1510+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D67}}') => {{
1511+
return (0, WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI);
1512+
}}
1513+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D68}}') => {{
1514+
return (1, WidthInfo::KIRAT_RAI_VOWEL_SIGN_E);
1515+
}}
1516+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_E, '\\u{{16D69}}') => {{
1517+
return (0, WidthInfo::DEFAULT);
1518+
}}
1519+
(WidthInfo::KIRAT_RAI_VOWEL_SIGN_AI, '\\u{{16D63}}') => {{
1520+
return (0, WidthInfo::DEFAULT);
1521+
}}
1522+
14991523
// Fallback
15001524
_ => {{}}
15011525
}}

src/lib.rs

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
//! - Script-specific ligatures:
6666
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
6767
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic
68-
//! ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G23126)s
68+
//! ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G23126)s
6969
//! will not affect the width.
7070
//! - **[Arabic]**: A character sequence consisting of one character with [`Joining_Group`]`=Lam`,
7171
//! followed by any number of characters with [`Joining_Type`]`=Transparent`, followed by one character
@@ -75,6 +75,7 @@
7575
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
7676
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
7777
//! have width 0.
78+
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `'\u{16D68}'`, `'\u{16D69}'`, or `'\u{16D6A}'` has total width 1.
7879
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
7980
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
8081
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
@@ -130,18 +131,18 @@
130131
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
131132
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
132133
//!
133-
//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50313
134-
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
134+
//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G50313
135+
//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40095
135136
//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
136137
//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
137-
//! [`General_Category`]: https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G124142
138+
//! [`General_Category`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-4/#G124142
138139
//! [`Grapheme_Extend=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
139-
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
140-
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593
140+
//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G52443
141+
//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G45593
141142
//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
142-
//! [`Joining_Type`]: http://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009
143+
//! [`Joining_Type`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G50009
143144
//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5
144-
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908
145+
//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G37908
145146
//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
146147
//!
147148
//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
@@ -150,7 +151,7 @@
150151
//!
151152
//! [`AI`]: https://www.unicode.org/reports/tr14/#AI
152153
//!
153-
//! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602
154+
//! [combining marks]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G30602
154155
//!
155156
//! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence
156157
//! [Emoji modifier sequences]: https://www.unicode.org/reports/tr51/#def_emoji_modifier_sequence
@@ -159,13 +160,14 @@
159160
//!
160161
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
161162
//!
162-
//! [Arabic]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G7480
163-
//! [Buginese]: https://www.unicode.org/versions/Unicode15.0.0/ch17.pdf#G26743
164-
//! [Hebrew]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G6528
165-
//! [Khmer]: https://www.unicode.org/versions/Unicode15.0.0/ch16.pdf#G64642
166-
//! [Lisu]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G44587
167-
//! [Old Turkic]: https://www.unicode.org/versions/Unicode15.0.0/ch14.pdf#G41975
168-
//! [Tifinagh]: http://www.unicode.org/versions/Unicode15.0.0/ch19.pdf#G43184
163+
//! [Arabic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G7480
164+
//! [Buginese]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-17/#G26743
165+
//! [Hebrew]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G6528
166+
//! [Khmer]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-16/#G64642
167+
//! [Kirat Rai]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-13/#G746409
168+
//! [Lisu]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-18/#G44587
169+
//! [Old Turkic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-14/#G41975
170+
//! [Tifinagh]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-19/#G43184
169171
//!
170172
//!
171173
//! ## Canonical equivalence

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy