unicode_normalization/
lib.rs

1// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Unicode character composition and decomposition utilities
12//! as described in
13//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
14//!
15//! ```rust
16//! extern crate unicode_normalization;
17//!
18//! use unicode_normalization::char::compose;
19//! use unicode_normalization::UnicodeNormalization;
20//!
21//! fn main() {
22//!     assert_eq!(compose('A','\u{30a}'), Some('Å'));
23//!
24//!     let s = "ÅΩ";
25//!     let c = s.nfc().collect::<String>();
26//!     assert_eq!(c, "ÅΩ");
27//! }
28//! ```
29//!
30//! # crates.io
31//!
32//! You can use this package in your project by adding the following
33//! to your `Cargo.toml`:
34//!
35//! ```toml
36//! [dependencies]
37//! unicode-normalization = "0.1.20"
38//! ```
39
40#![deny(missing_docs, unsafe_code)]
41#![doc(
42    html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
43    html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
44)]
45#![cfg_attr(not(feature = "std"), no_std)]
46
47#[cfg(not(feature = "std"))]
48extern crate alloc;
49
50#[cfg(feature = "std")]
51extern crate core;
52
53extern crate tinyvec;
54
55pub use crate::decompose::Decompositions;
56pub use crate::quick_check::{
57    is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
58    is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
59    IsNormalized,
60};
61pub use crate::recompose::Recompositions;
62pub use crate::replace::Replacements;
63pub use crate::stream_safe::StreamSafe;
64pub use crate::tables::UNICODE_VERSION;
65use core::{option, str::Chars};
66
67mod decompose;
68mod lookups;
69mod normalize;
70mod perfect_hash;
71mod quick_check;
72mod recompose;
73mod replace;
74mod stream_safe;
75mod tables;
76
77#[doc(hidden)]
78pub mod __test_api;
79#[cfg(test)]
80mod test;
81
82/// Methods for composing and decomposing characters.
83pub mod char {
84    pub use crate::normalize::{
85        compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
86    };
87
88    pub use crate::lookups::{canonical_combining_class, is_combining_mark};
89
90    /// Return whether the given character is assigned (`General_Category` != `Unassigned`)
91    /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
92    /// of Unicode.
93    pub use crate::tables::is_public_assigned;
94}
95
96/// Methods for iterating over strings while applying Unicode normalizations
97/// as described in
98/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
99pub trait UnicodeNormalization<I: Iterator<Item = char>> {
100    /// Returns an iterator over the string in Unicode Normalization Form D
101    /// (canonical decomposition).
102    fn nfd(self) -> Decompositions<I>;
103
104    /// Returns an iterator over the string in Unicode Normalization Form KD
105    /// (compatibility decomposition).
106    fn nfkd(self) -> Decompositions<I>;
107
108    /// An Iterator over the string in Unicode Normalization Form C
109    /// (canonical decomposition followed by canonical composition).
110    fn nfc(self) -> Recompositions<I>;
111
112    /// An Iterator over the string in Unicode Normalization Form KC
113    /// (compatibility decomposition followed by canonical composition).
114    fn nfkc(self) -> Recompositions<I>;
115
116    /// A transformation which replaces CJK Compatibility Ideograph codepoints
117    /// with normal forms using Standardized Variation Sequences. This is not
118    /// part of the canonical or compatibility decomposition algorithms, but
119    /// performing it before those algorithms produces normalized output which
120    /// better preserves the intent of the original text.
121    ///
122    /// Note that many systems today ignore variation selectors, so these
123    /// may not immediately help text display as intended, but they at
124    /// least preserve the information in a standardized form, giving
125    /// implementations the option to recognize them.
126    fn cjk_compat_variants(self) -> Replacements<I>;
127
128    /// An Iterator over the string with Conjoining Grapheme Joiner characters
129    /// inserted according to the Stream-Safe Text Process (UAX15-D4)
130    fn stream_safe(self) -> StreamSafe<I>;
131}
132
133impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
134    #[inline]
135    fn nfd(self) -> Decompositions<Chars<'a>> {
136        Decompositions::new_canonical(self.chars())
137    }
138
139    #[inline]
140    fn nfkd(self) -> Decompositions<Chars<'a>> {
141        Decompositions::new_compatible(self.chars())
142    }
143
144    #[inline]
145    fn nfc(self) -> Recompositions<Chars<'a>> {
146        Recompositions::new_canonical(self.chars())
147    }
148
149    #[inline]
150    fn nfkc(self) -> Recompositions<Chars<'a>> {
151        Recompositions::new_compatible(self.chars())
152    }
153
154    #[inline]
155    fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
156        replace::new_cjk_compat_variants(self.chars())
157    }
158
159    #[inline]
160    fn stream_safe(self) -> StreamSafe<Chars<'a>> {
161        StreamSafe::new(self.chars())
162    }
163}
164
165impl UnicodeNormalization<option::IntoIter<char>> for char {
166    #[inline]
167    fn nfd(self) -> Decompositions<option::IntoIter<char>> {
168        Decompositions::new_canonical(Some(self).into_iter())
169    }
170
171    #[inline]
172    fn nfkd(self) -> Decompositions<option::IntoIter<char>> {
173        Decompositions::new_compatible(Some(self).into_iter())
174    }
175
176    #[inline]
177    fn nfc(self) -> Recompositions<option::IntoIter<char>> {
178        Recompositions::new_canonical(Some(self).into_iter())
179    }
180
181    #[inline]
182    fn nfkc(self) -> Recompositions<option::IntoIter<char>> {
183        Recompositions::new_compatible(Some(self).into_iter())
184    }
185
186    #[inline]
187    fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
188        replace::new_cjk_compat_variants(Some(self).into_iter())
189    }
190
191    #[inline]
192    fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
193        StreamSafe::new(Some(self).into_iter())
194    }
195}
196
197impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
198    #[inline]
199    fn nfd(self) -> Decompositions<I> {
200        Decompositions::new_canonical(self)
201    }
202
203    #[inline]
204    fn nfkd(self) -> Decompositions<I> {
205        Decompositions::new_compatible(self)
206    }
207
208    #[inline]
209    fn nfc(self) -> Recompositions<I> {
210        Recompositions::new_canonical(self)
211    }
212
213    #[inline]
214    fn nfkc(self) -> Recompositions<I> {
215        Recompositions::new_compatible(self)
216    }
217
218    #[inline]
219    fn cjk_compat_variants(self) -> Replacements<I> {
220        replace::new_cjk_compat_variants(self)
221    }
222
223    #[inline]
224    fn stream_safe(self) -> StreamSafe<I> {
225        StreamSafe::new(self)
226    }
227}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy