diff --git a/scripts/unicode.py b/scripts/unicode.py index aa0d86b..e7a0b71 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -15,14 +15,19 @@ # - DerivedCoreProperties.txt # - EastAsianWidth.txt # - HangulSyllableType.txt +# - LineBreak.txt # - NormalizationTest.txt (for tests only) # - PropList.txt # - ReadMe.txt # - UnicodeData.txt # - auxiliary/GraphemeBreakProperty.txt # - emoji/emoji-data.txt +# - emoji/emoji-test.txt (for tests only) # - emoji/emoji-variation-sequences.txt +# - extracted/DerivedCombiningClass.txt # - extracted/DerivedGeneralCategory.txt +# - extracted/DerivedJoiningGroup.txt +# - extracted/DerivedJoiningType.txt # # Since this should not require frequent updates, we just store this # out-of-line and check the generated module into git. @@ -429,6 +434,13 @@ def load_east_asian_widths() -> list[EastAsianWidth]: # Catch any leftover codepoints and assign them implicit Neutral/narrow width. width_map.append(EastAsianWidth.NARROW) + # Characters with ambiguous line breaking are ambiguous + load_property( + "LineBreak.txt", + "AI", + lambda cp: (operator.setitem(width_map, cp, EastAsianWidth.AMBIGUOUS)), + ) + # Ambiguous `Letter`s and `Modifier_Symbol`s are narrow load_property( "extracted/DerivedGeneralCategory.txt", diff --git a/src/lib.rs b/src/lib.rs index 71b5d70..2b1c4d1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -119,9 +119,11 @@ //! 5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) //! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2. //! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise: -//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or -//! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or -//! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and +//! - Fulfills one of the following conditions: +//! - Has an [`East_Asian_Width`] of [`Ambiguous`], or +//! - Has a [`Line_Break`] of [`AI`], or +//! - Has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or +//! - Is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387); and //! - Does not have a [`General_Category`] of `Letter` or `Modifier_Symbol`. //! 7. All other characters have width 1. //! @@ -138,6 +140,7 @@ //! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G45593 //! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862 //! [`Joining_Type`]: http://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G50009 +//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5 //! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode15.0.0/ch23.pdf#G37908 //! [`Script`]: https://www.unicode.org/reports/tr24/#Script //! @@ -145,6 +148,8 @@ //! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4 //! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6 //! +//! [`AI`]: https://www.unicode.org/reports/tr14/#AI +//! //! [combining marks]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G30602 //! //! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence diff --git a/src/tables.rs b/src/tables.rs index fa632d6..c9c017b 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -1030,8 +1030,8 @@ static WIDTH_MIDDLE: Align64<[[u8; 64]; WIDTH_MIDDLE_LEN]> = Align64([ ], #[cfg(feature = "cjk")] [ - 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, - 0x33, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38, + 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0x2E, 0xA7, 0x39, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, + 0xAD, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0xAE, 0x02, 0x02, 0x35, 0x36, 0x37, 0x02, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0xAF, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, @@ -1878,7 +1878,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ #[cfg(feature = "cjk")] [ 0x95, 0x59, 0x59, 0x55, 0x55, 0x65, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, - 0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, + 0x55, 0x55, 0x55, 0x55, 0x55, 0x95, 0x5A, 0x95, 0x6A, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0x5A, 0x55, ], #[cfg(feature = "cjk")] @@ -1914,13 +1914,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ #[cfg(feature = "cjk")] [ 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, - 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x9A, 0xAA, 0xAA, 0xAA, - 0xAA, 0xAA, - ], - #[cfg(feature = "cjk")] - [ - 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, - 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55, + 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x56, 0x55, 0x55, ], #[cfg(feature = "cjk")] @@ -1931,7 +1925,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], #[cfg(feature = "cjk")] [ - 0x55, 0x69, 0x59, 0xA5, 0x55, 0x5F, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x69, 0x59, 0xA5, 0x55, 0xAF, 0x55, 0x66, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x66, 0x55, 0xFF, 0xFF, 0xFF, 0x55, 0x55, 0x55, 0x9A, 0x9A, 0x6A, 0x9A, 0x55, 0x55, 0x55, 0xD5, ], @@ -1948,6 +1942,12 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ 0xAA, 0xAA, ], #[cfg(feature = "cjk")] + [ + 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xFD, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x55, 0x55, + 0xD5, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, + ], + #[cfg(feature = "cjk")] [ 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xD5, 0x57, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0xAD, 0x5A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, @@ -1973,7 +1973,7 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ ], #[cfg(feature = "cjk")] [ - 0xAA, 0xAA, 0x6A, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA, + 0xAA, 0xAA, 0xAA, 0x56, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, ], diff --git a/tests/tests.rs b/tests/tests.rs index 2940df2..7dc5b61 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -588,6 +588,13 @@ fn emoji_test_file() { } } +#[test] +fn ambiguous_line_break() { + assert_width!("\u{24EA}", 1, 2); + assert_width!("\u{2616}", 1, 2); + assert_width!("\u{2780}", 1, 2); +} + // Test traits are unsealed #[cfg(feature = "cjk")] pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy