Skip to content

Commit 714ddc5

Browse files
Assign width 3 to KHMER SIGN BEYYAL
See https://unicode.org/charts/nameslist/n_1780.html
1 parent e6ba907 commit 714ddc5

File tree

4 files changed

+33
-16
lines changed

4 files changed

+33
-16
lines changed

scripts/unicode.py

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,9 @@ class WidthState(enum.IntEnum):
184184
WIDE = 0x1_0002
185185
"Two columns wide."
186186

187+
THREE = 0x1_0003
188+
"Three columns wide."
189+
187190
# \r\n
188191
LINE_FEED = 0b0000_0000_0000_0001
189192
"\\n (CRLF has width 1)"
@@ -341,6 +344,10 @@ def table_width(self) -> CharWidthInTable:
341344
case _:
342345
return CharWidthInTable.SPECIAL
343346

347+
def is_carried(self) -> bool:
348+
"Whether this corresponds to a non-default `WidthInfo`."
349+
return int(self) <= 0xFFFF
350+
344351
def width_alone(self) -> int:
345352
"The width of a character with this type when it appears alone."
346353
match self:
@@ -357,6 +364,8 @@ def width_alone(self) -> int:
357364
| WidthState.EMOJI_PRESENTATION
358365
):
359366
return 2
367+
case WidthState.THREE:
368+
return 3
360369
case _:
361370
return 1
362371

@@ -598,6 +607,7 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
598607
(alef_joining, WidthState.JOINING_GROUP_ALEF),
599608
(range(0x1780, 0x17A3), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
600609
([0x17A7, 0x17AB, 0x17AC, 0x17AF], WidthState.KHMER_COENG_ELIGIBLE_LETTER),
610+
([0x17D8], WidthState.THREE),
601611
([0x1A10], WidthState.BUGINESE_LETTER_YA),
602612
(range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT),
603613
([0x2D6F], WidthState.TIFINAGH_CONSONANT),
@@ -1196,7 +1206,11 @@ def lookup_fns(
11961206
s += f" '\\u{{{lo:X}}}'"
11971207
if hi != lo:
11981208
s += f"..='\\u{{{hi:X}}}'"
1199-
s += f" => ({width.width_alone()}, WidthInfo::{str(width.name)}),\n"
1209+
if width.is_carried():
1210+
width_info = width.name
1211+
else:
1212+
width_info = "DEFAULT"
1213+
s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n"
12001214

12011215
s += f""" _ => (2, WidthInfo::EMOJI_PRESENTATION),
12021216
}}
@@ -1531,7 +1545,7 @@ def emit_module(
15311545
)
15321546

15331547
for variant in WidthState:
1534-
if variant.table_width() == CharWidthInTable.SPECIAL:
1548+
if variant.is_carried():
15351549
if variant.is_cjk_only():
15361550
module.write(' #[cfg(feature = "cjk")]\n')
15371551
module.write(
@@ -1925,7 +1939,7 @@ def emit_module(
19251939
test_width_variants = []
19261940
test_width_variants_cjk = []
19271941
for variant in WidthState:
1928-
if variant.table_width() == CharWidthInTable.SPECIAL:
1942+
if variant.is_carried():
19291943
if not variant.is_cjk_only():
19301944
test_width_variants.append(variant)
19311945
if not variant.is_non_cjk_only():
@@ -2003,10 +2017,7 @@ def emit_module(
20032017
)
20042018

20052019
for variant in WidthState:
2006-
if (
2007-
variant.table_width() == CharWidthInTable.SPECIAL
2008-
and not variant.is_cjk_only()
2009-
):
2020+
if variant.is_carried() and not variant.is_cjk_only():
20102021
module.write(f" WidthInfo::{variant.name},\n")
20112022

20122023
module.write(
@@ -2018,10 +2029,7 @@ def emit_module(
20182029
)
20192030

20202031
for variant in WidthState:
2021-
if (
2022-
variant.table_width() == CharWidthInTable.SPECIAL
2023-
and not variant.is_non_cjk_only()
2024-
):
2032+
if variant.is_carried() and not variant.is_non_cjk_only():
20252033
module.write(f" WidthInfo::{variant.name},\n")
20262034

20272035
module.write(

src/lib.rs

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,8 @@
8888
//! 2. In all other cases, the width of the string equals the sum of its character widths:
8989
//! 1. [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] has width 1 (outside of the ligatures described previously).
9090
//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
91-
//! 3. The following have width 0:
91+
//! 3. [`'\u{17D8}'` KHMER SIGN BEYYAL](https://util.unicode.org/UnicodeJsps/character.jsp?a=17D8) has width 3.
92+
//! 4. The following have width 0:
9293
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
9394
//! with the [`Default_Ignorable_Code_Point`] property.
9495
//! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
@@ -111,15 +112,15 @@
111112
//! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
112113
//! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
113114
//! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
114-
//! 4. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
115+
//! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
115116
//! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
116-
//! 5. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
117+
//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
117118
//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or
118119
//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
119120
//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and
120121
//! - Does not have a [`General_Category`] of `Modifier_Symbol`, and
121122
//! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`.
122-
//! 6. All other characters have width 1.
123+
//! 7. All other characters have width 1.
123124
//!
124125
//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
125126
//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F

src/tables.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ fn lookup_width(c: char) -> (u8, WidthInfo) {
161161
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
162162
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
163163
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
164+
'\u{17D8}' => (3, WidthInfo::DEFAULT),
164165
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
165166
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
166167
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
@@ -444,6 +445,7 @@ fn lookup_width_cjk(c: char) -> (u8, WidthInfo) {
444445
'\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED),
445446
'\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF),
446447
'\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER),
448+
'\u{17D8}' => (3, WidthInfo::DEFAULT),
447449
'\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA),
448450
'\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT),
449451
'\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU),
@@ -1220,7 +1222,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([
12201222
],
12211223
[
12221224
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x7F, 0xD5, 0xD5, 0xD7, 0x55, 0x10, 0x00,
1223-
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
1225+
0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x57, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
12241226
0x55, 0x55,
12251227
],
12261228
[

tests/tests.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,12 @@ fn test_khmer_coeng() {
409409
}
410410
}
411411

412+
#[test]
413+
fn test_khmer_sign_beyyal() {
414+
assert_width!("៘", 3, 3);
415+
assert_width!("។ល។", 3, 3);
416+
}
417+
412418
#[test]
413419
fn test_emoji_modifier() {
414420
assert_width!("\u{1F46A}", 2, 2);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy