Skip to content

Commit f0c9f1b

Browse files
Support Unicode 16 variation seqs for quotation mark width
1 parent 6ab41d7 commit f0c9f1b

File tree

4 files changed

+196
-33
lines changed

4 files changed

+196
-33
lines changed

scripts/unicode.py

Lines changed: 80 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,11 @@ class WidthState(enum.IntEnum):
175175
- 4th bit: whether to set top bit on emoji presentation.
176176
If this is set but 3rd is not, the width mode is related to zwj sequences
177177
- 5th from top: whether this is unaffected by ligature-transparent
178+
(if set, should also set 3rd and 4th)
178179
- 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state
179-
where no ZWJ has been encountered yet; encountering one flips this on"""
180+
where no ZWJ has been encountered yet; encountering one flips this on
181+
- 7th bit: is VS1 (U+FE00, if CJK) or is VS2 (U+FE01, not CJK)
182+
"""
180183

181184
# BASIC WIDTHS
182185

@@ -272,6 +275,9 @@ class WidthState(enum.IntEnum):
272275

273276
# VARIATION SELECTORS
274277

278+
VARIATION_SELECTOR_1_OR_2 = 0b0000_0010_0000_0000
279+
"\\uFE00 if CJK, or \\uFE01 otherwise"
280+
275281
# Text presentation sequences (not CJK)
276282
VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
277283
"\\uFE0E (text presentation sequences)"
@@ -367,6 +373,7 @@ def width_alone(self) -> int:
367373
| WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
368374
| WidthState.VARIATION_SELECTOR_15
369375
| WidthState.VARIATION_SELECTOR_16
376+
| WidthState.VARIATION_SELECTOR_1_OR_2
370377
):
371378
return 0
372379
case (
@@ -656,9 +663,11 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
656663
ea[cp] = width
657664

658665
# East-Asian only
666+
ea[0xFE00] = WidthState.VARIATION_SELECTOR_1_OR_2
659667
ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
660668

661669
# Not East Asian only
670+
not_ea[0xFE01] = WidthState.VARIATION_SELECTOR_1_OR_2
662671
not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15
663672

664673
return (not_ea, ea)
@@ -724,7 +733,7 @@ def load_solidus_transparent(
724733
cjk_width_map: list[WidthState],
725734
) -> list[tuple[Codepoint, Codepoint]]:
726735
"""Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above.
727-
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also.
736+
Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to be checked also.
728737
"""
729738

730739
ccc_above_1 = set()
@@ -756,7 +765,7 @@ def load_solidus_transparent(
756765
num_chars = len(ccc_above_1)
757766

758767
for cp in ccc_above_1:
759-
if cp != 0xFE0F:
768+
if cp not in [0xFE00, 0xFE0F]:
760769
assert (
761770
cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
762771
), f"U+{cp:X}"
@@ -1312,8 +1321,17 @@ def lookup_fns(
13121321
return (0, next_info.set_emoji_presentation());
13131322
}"""
13141323

1315-
if not is_cjk:
1324+
if is_cjk:
1325+
s += """
1326+
if c == '\\u{FE00}' {
1327+
return (0, next_info.set_vs1_2());
1328+
}
1329+
"""
1330+
else:
13161331
s += """
1332+
if c == '\\u{FE01}' {
1333+
return (0, next_info.set_vs1_2());
1334+
}
13171335
if c == '\\u{FE0E}' {
13181336
return (0, next_info.set_text_presentation());
13191337
}
@@ -1323,9 +1341,19 @@ def lookup_fns(
13231341
} else {
13241342
next_info = next_info.unset_text_presentation();
13251343
}
1326-
}"""
1344+
} else """
13271345

1328-
s += """
1346+
s += """if next_info.is_vs1_2() {
1347+
if matches!(c, '\\u{2018}' | '\\u{2019}' | '\\u{201C}' | '\\u{201D}') {
1348+
return ("""
1349+
1350+
s += str(2 - is_cjk)
1351+
1352+
s += """, WidthInfo::DEFAULT);
1353+
} else {
1354+
next_info = next_info.unset_vs1_2();
1355+
}
1356+
}
13291357
if next_info.is_ligature_transparent() {
13301358
if c == '\\u{200D}' {
13311359
return (0, next_info.set_zwj_bit());
@@ -1586,6 +1614,8 @@ def emit_module(
15861614
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
15871615
struct WidthInfo(u16);
15881616
1617+
const LIGATURE_TRANSPARENT_MASK: u16 = 0b0010_0000_0000_0000;
1618+
15891619
impl WidthInfo {
15901620
/// No special handling necessary
15911621
const DEFAULT: Self = Self(0);
@@ -1615,51 +1645,84 @@ def emit_module(
16151645
16161646
/// Has top bit set
16171647
fn is_emoji_presentation(self) -> bool {{
1618-
(self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000
1648+
(self.0 & WidthInfo::VARIATION_SELECTOR_16.0) == WidthInfo::VARIATION_SELECTOR_16.0
16191649
}}
16201650
1621-
/// Has top bit set
16221651
fn is_zwj_emoji_presentation(self) -> bool {{
16231652
(self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
16241653
}}
16251654
16261655
/// Set top bit
16271656
fn set_emoji_presentation(self) -> Self {{
1628-
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000
1657+
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK
16291658
|| (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
16301659
{{
1631-
Self(self.0 | 0b1000_0000_0000_0000)
1660+
Self(
1661+
self.0
1662+
| WidthInfo::VARIATION_SELECTOR_16.0
1663+
& !WidthInfo::VARIATION_SELECTOR_15.0
1664+
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1665+
)
16321666
}} else {{
16331667
Self::VARIATION_SELECTOR_16
16341668
}}
16351669
}}
16361670
16371671
/// Clear top bit
16381672
fn unset_emoji_presentation(self) -> Self {{
1639-
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
1640-
Self(self.0 & 0b0111_1111_1111_1111)
1673+
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1674+
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_16.0)
16411675
}} else {{
16421676
Self::DEFAULT
16431677
}}
16441678
}}
16451679
16461680
/// Has 2nd bit set
16471681
fn is_text_presentation(self) -> bool {{
1648-
(self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000
1682+
(self.0 & WidthInfo::VARIATION_SELECTOR_15.0) == WidthInfo::VARIATION_SELECTOR_15.0
16491683
}}
16501684
16511685
/// Set 2nd bit
16521686
fn set_text_presentation(self) -> Self {{
1653-
if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
1654-
Self(self.0 | 0b0100_0000_0000_0000)
1687+
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1688+
Self(
1689+
self.0
1690+
| WidthInfo::VARIATION_SELECTOR_15.0
1691+
& !WidthInfo::VARIATION_SELECTOR_16.0
1692+
& !WidthInfo::VARIATION_SELECTOR_1_OR_2.0,
1693+
)
16551694
}} else {{
1656-
Self(0b0100_0000_0000_0000)
1695+
Self(WidthInfo::VARIATION_SELECTOR_15.0)
16571696
}}
16581697
}}
16591698
16601699
/// Clear 2nd bit
16611700
fn unset_text_presentation(self) -> Self {{
1662-
Self(self.0 & 0b1011_1111_1111_1111)
1701+
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_15.0)
1702+
}}
1703+
1704+
/// Has 7th bit set
1705+
fn is_vs1_2(self) -> bool {{
1706+
(self.0 & WidthInfo::VARIATION_SELECTOR_1_OR_2.0) == WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1707+
}}
1708+
1709+
/// Set 7th bit
1710+
fn set_vs1_2(self) -> Self {{
1711+
if (self.0 & LIGATURE_TRANSPARENT_MASK) == LIGATURE_TRANSPARENT_MASK {{
1712+
Self(
1713+
self.0
1714+
| WidthInfo::VARIATION_SELECTOR_1_OR_2.0
1715+
& !WidthInfo::VARIATION_SELECTOR_15.0
1716+
& !WidthInfo::VARIATION_SELECTOR_16.0,
1717+
)
1718+
}} else {{
1719+
Self(WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
1720+
}}
1721+
}}
1722+
1723+
/// Clear 7th bit
1724+
fn unset_vs1_2(self) -> Self {{
1725+
Self(self.0 & !WidthInfo::VARIATION_SELECTOR_1_OR_2.0)
16631726
}}
16641727
}}
16651728

src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@
6262
//! - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character:
6363
//! - Has the [`Emoji_Presentation`] property, and
6464
//! - Is not in the [Enclosed Ideographic Supplement] block.
65+
//! - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1 when followed by `'\u{FE00}'`,
66+
//! and width 2 when followed by `'\u{FE01}'`.
6567
//! - Script-specific ligatures:
6668
//! - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
6769
//! [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic
@@ -75,7 +77,7 @@
7577
//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
7678
//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
7779
//! have width 0.
78-
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `\u{16D68}`, `\u{16D69}`, or `\u{16D6A}` has total width 1.
80+
//! - **[Kirat Rai]**: Any sequence canonically equivalent to `'\u{16D68}'`, `'\u{16D69}'`, or `'\u{16D6A}'` has total width 1.
7981
//! - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
8082
//! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
8183
//! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
@@ -158,6 +160,7 @@
158160
//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
159161
//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
160162
//!
163+
//! [General Punctuation]: https://www.unicode.org/charts/PDF/Unicode-16.0/U160-2000.pdf
161164
//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
162165
//!
163166
//! [Arabic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G7480

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy