Skip to content

Commit d0e111e

Browse files
Support Unicode 17
Add support for Unicode 17, including VS3 variation sequences for Mongolian quotation marks (https://www.unicode.org/L2/L2025/25028-vs3-sibe-quotation-marks.pdf).
1 parent 7a7fcdc commit d0e111e

File tree

4 files changed

+196
-86
lines changed

4 files changed

+196
-86
lines changed

scripts/unicode.py

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
from itertools import batched
4444
from typing import Callable, Iterable
4545

46-
UNICODE_VERSION = "16.0.0"
46+
UNICODE_VERSION = "17.0.0"
4747
"""The version of the Unicode data files to download."""
4848

4949
NUM_CODEPOINTS = 0x110000
@@ -178,7 +178,9 @@ class WidthState(enum.IntEnum):
178178
(if set, should also set 3rd and 4th)
179179
- 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
180180
where no ZWJ has been encountered yet; encountering one flips this on
181-
- Seventh bit: is VS1 (if CJK) or is VS2 (not CJK)
181+
- Seventh bit:
182+
- CJK mode: is VS1 or VS3
183+
- Not CJK: is VS2
182184
"""
183185

184186
# BASIC WIDTHS
@@ -275,8 +277,8 @@ class WidthState(enum.IntEnum):
275277

276278
# VARIATION SELECTORS
277279

278-
VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
279-
"\\uFE00 if CJK, or \\uFE01 otherwise"
280+
VARIATION_SELECTOR_1_2_OR_3 = 0b0000_0010_0000_0000
281+
"\\uFE00 or \\uFE02 if CJK, or \\uFE01 otherwise"
280282

281283
# Text presentation sequences (not CJK)
282284
VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
@@ -373,7 +375,7 @@ def width_alone(self) -> int:
373375
| WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
374376
| WidthState.VARIATION_SELECTOR_15
375377
| WidthState.VARIATION_SELECTOR_16
376-
| WidthState.VARIATION_SELECTOR_1_OR_2
378+
| WidthState.VARIATION_SELECTOR_1_2_OR_3
377379
):
378380
return 0
379381
case (
@@ -657,11 +659,12 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
657659
ea[cp] = width
658660

659661
# East-Asian only
660-
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_OR_2
661662
ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
663+
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_2_OR_3
664+
ea[0xFE02] = WidthState.VARIATION_SELECTOR_1_2_OR_3
662665

663666
# Not East Asian only
664-
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_OR_2
667+
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_2_OR_3
665668
not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15
666669

667670
return (not_ea, ea)
@@ -759,7 +762,7 @@ def load_solidus_transparent(
759762
num_chars = len(ccc_above_1)
760763

761764
for cp in ccc_above_1:
762-
if cp not in [0xFE00, 0xFE0F]:
765+
if cp not in [0xFE00, 0xFE02, 0xFE0F]:
763766
assert (
764767
cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
765768
), f"U+{cp:X}"
@@ -1317,14 +1320,14 @@ def lookup_fns(
13171320

13181321
if is_cjk:
13191322
s += """
1320-
if c == '\\u{FE00}' {
1321-
return (0, next_info.set_vs1_2());
1323+
if matches!(c, '\\u{FE00}' | '\\u{FE02}') {
1324+
return (0, next_info.set_vs1_2_3());
13221325
}
13231326
"""
13241327
else:
13251328
s += """
13261329
if c == '\\u{FE01}' {
1327-
return (0, next_info.set_vs1_2());
1330+
return (0, next_info.set_vs1_2_3());
13281331
}
13291332
if c == '\\u{FE0E}' {
13301333
return (0, next_info.set_text_presentation());
@@ -1345,7 +1348,7 @@ def lookup_fns(
13451348

13461349
s += """, WidthInfo::DEFAULT);
13471350
} else {
1348-
next_info = next_info.unset_vs1_2();
1351+
next_info = next_info.unset_vs1_2_3();
13491352
}
13501353
}
13511354
if next_info.is_ligature_transparent() {
@@ -1655,7 +1658,7 @@ def emit_module(
16551658
self.0
16561659
| WidthInfo::VARIATION_SELECTOR_16.0
16571660
& !WidthInfo::VARIATION_SELECTOR_15.0
1658-
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1661+
& !WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0,
16591662
)
16601663
}} else {{
16611664
Self::VARIATION_SELECTOR_16
@@ -1683,7 +1686,7 @@ def emit_module(
16831686
self.0
16841687
| WidthInfo::VARIATION_SELECTOR_15.0
16851688
& !WidthInfo::VARIATION_SELECTOR_16.0
1686-
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1689+
& !WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0,
16871690
)
16881691
}} else {{
16891692
Self(WidthInfo::VARIATION_SELECTOR_15.0)
@@ -1697,26 +1700,26 @@ def emit_module(
16971700
16981701
/// Has 7th bit set
16991702
fn is_vs1_2(self) -> bool {{
1700-
(self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1703+
(self.0 & WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0) == WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0
17011704
}}
17021705
17031706
/// Set 7th bit
1704-
fn set_vs1_2(self) -> Self {{
1707+
fn set_vs1_2_3(self) -> Self {{
17051708
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
17061709
Self(
17071710
self.0
1708-
| WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1711+
| WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0
17091712
& !WidthInfo::VARIATION_SELECTOR_15.0
17101713
& !WidthInfo::VARIATION_SELECTOR_16.0,
17111714
)
17121715
}} else {{
1713-
Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1716+
Self(WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0)
17141717
}}
17151718
}}
17161719
17171720
/// Clear 7th bit
1718-
fn unset_vs1_2(self) -> Self {{
1719-
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1721+
fn unset_vs1_2_3(self) -> Self {{
1722+
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_2_OR_3.0)
17201723
}}
17211724
}}
17221725

src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@
6262
//! - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character:
6363
//! - Has the [`Emoji_Presentation`] property, and
6464
//! - Is not in the [Enclosed Ideographic Supplement] block.
65-
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1 when followed by '\u{FE00}',
66-
//! and width 2 when followed by '\u{FE01}'.
65+
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1
66+
//! when followed by '\u{FE00}' or '\u{FE02}', and width 2 when followed by '\u{FE01}'.
6767
//! - Script-specific ligatures:
6868
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
6969
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy