unicode_width/
lib.rs

1// Copyright 2012-2025 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Determine displayed width of `char` and `str` types according to
12//! [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
13//! and other portions of the Unicode standard.
14//! See the [Rules for determining width](#rules-for-determining-width) section
15//! for the exact rules.
16//!
17//! This crate is `#![no_std]`.
18//!
19//! ```rust
20//! use unicode_width::UnicodeWidthStr;
21//!
22//! let teststr = "Hello, world!";
23//! let width = UnicodeWidthStr::width(teststr);
24//! println!("{}", teststr);
25//! println!("The above string is {} columns wide.", width);
26//! ```
27//!
28//! # `"cjk"` feature flag
29//!
30//! This crate has one Cargo feature flag, `"cjk"`
31//! (enabled by default).
//! It enables the [`UnicodeWidthChar::width_cjk`]
//! and [`UnicodeWidthStr::width_cjk`] methods,
34//! which perform an alternate width calculation
35//! more suited to CJK contexts. The flag also unseals the
36//! [`UnicodeWidthChar`] and [`UnicodeWidthStr`] traits.
37//!
//! Disabling the flag (with `default-features = false` in `Cargo.toml`)
39//! will reduce the amount of static data needed by the crate.
40//!
41//! ```rust
42//! use unicode_width::UnicodeWidthStr;
43//!
44//! let teststr = "“𘀀”";
45//! assert_eq!(teststr.width(), 4);
46//!
47//! #[cfg(feature = "cjk")]
48//! assert_eq!(teststr.width_cjk(), 6);
49//! ```
50//!
51//! # Rules for determining width
52//!
53//! This crate currently uses the following rules to determine the width of a
54//! character or string, in order of decreasing precedence. These may be tweaked in the future.
55//!
56//! 1. In the following cases, the width of a string differs from the sum of the widths of its constituent characters:
57//!    - The sequence `"\r\n"` has width 1.
58//!    - Emoji-specific ligatures:
59//!      - Well-formed, fully-qualified [emoji ZWJ sequences] have width 2.
60//!      - [Emoji modifier sequences] have width 2.
61//!      - [Emoji presentation sequences] have width 2.
62//!      - Outside of an East Asian context, [text presentation sequences] have width 1 if their base character:
63//!        - Has the [`Emoji_Presentation`] property, and
64//!        - Is not in the [Enclosed Ideographic Supplement] block.
//!    - [`'\u{2018}'`, `'\u{2019}'`, `'\u{201C}'`, and `'\u{201D}'`][General Punctuation] always have width 1 when followed by `'\u{FE00}'`,
//!      and width 2 when followed by `'\u{FE01}'`.
67//!    - Script-specific ligatures:
68//!      - For all the following ligatures, the insertion of any number of [default-ignorable][`Default_Ignorable_Code_Point`]
69//!        [combining marks] anywhere in the sequence will not change the total width. In addition, for all non-Arabic
70//!        ligatures, the insertion of any number of [`'\u{200D}'` ZERO WIDTH JOINER](https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G23126)s
71//!        will not affect the width.
72//!      - **[Arabic]**: A character sequence consisting of one character with [`Joining_Group`]`=Lam`,
73//!        followed by any number of characters with [`Joining_Type`]`=Transparent`, followed by one character
74//!        with [`Joining_Group`]`=Alef`, has total width 1. For example: `لا`‎, `لآ`‎, `ڸا`‎, `لٟٞأ`
75//!      - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` (<a, -i> ya, `ᨕᨗ‍ᨐ`) has total width 1.
76//!      - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1.
77//!      - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in
78//!        `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}'  | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'`
79//!        have width 0.
80//!      - **[Kirat Rai]**: Any sequence canonically equivalent to `'\u{16D68}'`, `'\u{16D69}'`, or `'\u{16D6A}'` has total width 1.
81//!      - **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'`
82//!        followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ`
83//!      - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1.
84//!      - **[Tifinagh]**: A sequence of a Tifinagh consonant in the range `'\u{2D31}'..='\u{2D65}' | '\u{2D6F}'`, followed by either
//!        [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] or `'\u{200D}'`, followed by another Tifinagh consonant, has total width 1.
86//!        For example: `ⵏ⵿ⴾ`
87//!    - In an East Asian context only, `<`, `=`, or `>` have width 2 when followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY].
88//!      The two characters may be separated by any number of characters whose canonical decompositions consist only of characters meeting
89//!      one of the following requirements:
90//!      - Has [`Canonical_Combining_Class`] greater than 1, or
91//!      - Is a [default-ignorable][`Default_Ignorable_Code_Point`] [combining mark][combining marks].
92//! 2. In all other cases, the width of the string equals the sum of its character widths:
93//!    1. [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] has width 1 (outside of the ligatures described previously).
94//!    2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) and
95//!       [`'\u{17A4}'` KHMER INDEPENDENT VOWEL QAA](https://util.unicode.org/UnicodeJsps/character.jsp?a=17A4) have width 2.
96//!    3. [`'\u{17D8}'` KHMER SIGN BEYYAL](https://util.unicode.org/UnicodeJsps/character.jsp?a=17D8) has width 3.
97//!    4. The following have width 0:
98//!       - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
99//!         with the [`Default_Ignorable_Code_Point`] property.
100//!       - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
101//!         with the [`Grapheme_Extend`] property.
102//!       - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BHangul_Syllable_Type%3DV%7D%5Cp%7BHangul_Syllable_Type%3DT%7D)
103//!         with a [`Hangul_Syllable_Type`] of `Vowel_Jamo` (`V`) or `Trailing_Jamo` (`T`).
104//!       - The following [`Prepended_Concatenation_Mark`]s:
105//!         - [`'\u{0605}'` NUMBER MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0605),
106//!         - [`'\u{070F}'` SYRIAC ABBREVIATION MARK](https://util.unicode.org/UnicodeJsps/character.jsp?a=070F),
107//!         - [`'\u{0890}'` POUND MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0890),
108//!         - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and
109//!         - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2).
110//!       - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Cluster_Break%3DPrepend%7D-%5Cp%7BPrepended_Concatenation_Mark%7D)
//!         with the [`Grapheme_Cluster_Break=Prepend`][`Grapheme_Extend=Prepend`] property, that are not also [`Prepended_Concatenation_Mark`]s.
112//!       - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA).
113//!    5. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D)
114//!       with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2.
115//!    6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise:
116//!       - Fulfills one of the following conditions:
117//!         - Has an [`East_Asian_Width`] of [`Ambiguous`], or
118//!         - Has a [`Line_Break`] of [`AI`], or
119//!         - Has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or
120//!         - Is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387); and
121//!       - Does not have a [`General_Category`] of `Letter` or `Modifier_Symbol`.
122//!    7. All other characters have width 1.
123//!
124//! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338
125//! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F
126//!
127//! [`Canonical_Combining_Class`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G50313
128//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40095
129//! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
130//! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
131//! [`General_Category`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-4/#G124142
132//! [`Grapheme_Extend=Prepend`]: https://www.unicode.org/reports/tr29/#Prepend
133//! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G52443
134//! [`Hangul_Syllable_Type`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G45593
135//! [`Joining_Group`]: https://www.unicode.org/versions/Unicode14.0.0/ch09.pdf#G36862
136//! [`Joining_Type`]: http://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G50009
137//! [`Line_Break`]: https://www.unicode.org/reports/tr14/#LD5
138//! [`Prepended_Concatenation_Mark`]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-23/#G37908
139//! [`Script`]: https://www.unicode.org/reports/tr24/#Script
140//!
141//! [`Fullwidth`]: https://www.unicode.org/reports/tr11/#ED2
142//! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
143//! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6
144//!
145//! [`AI`]: https://www.unicode.org/reports/tr14/#AI
146//!
147//! [combining marks]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G30602
148//!
149//! [emoji ZWJ sequences]: https://www.unicode.org/reports/tr51/#def_emoji_sequence
150//! [Emoji modifier sequences]: https://www.unicode.org/reports/tr51/#def_emoji_modifier_sequence
151//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
152//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
153//!
154//! [General Punctuation]: https://www.unicode.org/charts/PDF/Unicode-16.0/U160-2000.pdf
155//! [Enclosed Ideographic Supplement]: https://unicode.org/charts/nameslist/n_1F200.html
156//!
157//! [Arabic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G7480
158//! [Buginese]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-17/#G26743
159//! [Hebrew]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-9/#G6528
160//! [Khmer]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-16/#G64642
161//! [Kirat Rai]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-13/#G746409
162//! [Lisu]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-18/#G44587
163//! [Old Turkic]: https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-14/#G41975
164//! [Tifinagh]: http://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-19/#G43184
165//!
166//!
167//! ## Canonical equivalence
168//!
169//! Canonically equivalent strings are assigned the same width (CJK and non-CJK).
170
171#![forbid(unsafe_code)]
172#![deny(missing_docs)]
173#![doc(
174    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
175    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
176)]
177#![no_std]
178
179pub use tables::UNICODE_VERSION;
180
181mod tables;
182
mod private {
    /// Marker supertrait required by the public width traits.
    ///
    /// Because this module is private, downstream crates cannot name the
    /// trait; which types carry it is decided by the impls below.
    pub trait Sealed {}

    // `"cjk"` disabled: only the two types this crate itself implements the
    // width traits for carry the marker, so the traits stay sealed.
    #[cfg(not(feature = "cjk"))]
    impl Sealed for char {}
    #[cfg(not(feature = "cjk"))]
    impl Sealed for str {}

    // `"cjk"` enabled: a blanket impl over every type (sized or not) means
    // the marker no longer restricts anything, i.e. the traits are unsealed.
    #[cfg(feature = "cjk")]
    impl<T: ?Sized> Sealed for T {}
}
192
193/// Methods for determining displayed width of Unicode characters.
194pub trait UnicodeWidthChar: private::Sealed {
195    /// Returns the character's displayed width in columns, or `None` if the
196    /// character is a control character.
197    ///
198    /// This function treats characters in the Ambiguous category according
199    /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
200    /// as 1 column wide. This is consistent with the recommendations for non-CJK
201    /// contexts, or when the context cannot be reliably determined.
202    fn width(self) -> Option<usize>;
203
204    /// Returns the character's displayed width in columns, or `None` if the
205    /// character is a control character.
206    ///
207    /// This function treats characters in the Ambiguous category according
208    /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
209    /// as 2 columns wide. This is consistent with the recommendations for
210    /// CJK contexts.
211    #[cfg(feature = "cjk")]
212    fn width_cjk(self) -> Option<usize>;
213}
214
215impl UnicodeWidthChar for char {
216    #[inline]
217    fn width(self) -> Option<usize> {
218        tables::single_char_width(self)
219    }
220
221    #[cfg(feature = "cjk")]
222    #[inline]
223    fn width_cjk(self) -> Option<usize> {
224        tables::single_char_width_cjk(self)
225    }
226}
227
228/// Methods for determining displayed width of Unicode strings.
229pub trait UnicodeWidthStr: private::Sealed {
230    /// Returns the string's displayed width in columns.
231    ///
232    /// This function treats characters in the Ambiguous category according
233    /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
234    /// as 1 column wide. This is consistent with the recommendations for
235    /// non-CJK contexts, or when the context cannot be reliably determined.
236    fn width(&self) -> usize;
237
238    /// Returns the string's displayed width in columns.
239    ///
240    /// This function treats characters in the Ambiguous category according
241    /// to [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
242    /// as 2 column wide. This is consistent with the recommendations for
243    /// CJK contexts.
244    #[cfg(feature = "cjk")]
245    fn width_cjk(&self) -> usize;
246}
247
248impl UnicodeWidthStr for str {
249    #[inline]
250    fn width(&self) -> usize {
251        tables::str_width(self)
252    }
253
254    #[cfg(feature = "cjk")]
255    #[inline]
256    fn width_cjk(&self) -> usize {
257        tables::str_width_cjk(self)
258    }
259}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy