unicode_normalization/
stream_safe.rs

1use core::iter::FusedIterator;
2
3use crate::lookups::{
4    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
5    stream_safe_trailing_nonstarters,
6};
7use crate::normalize::{hangul_decomposition_length, is_hangul_syllable};
8use crate::tables::stream_safe_leading_nonstarters;
9
10pub(crate) const MAX_NONSTARTERS: usize = 30;
11const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
12
/// Iterator adapter implementing the Stream-Safe Text Process ([UAX15-D4]).
///
/// It keeps track of how many non-starters there have been since the last
/// starter in *NFKD* and emits a Combining Grapheme Joiner (U+034F) whenever
/// the count would otherwise exceed 30.
///
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
#[derive(Clone, Debug)]
pub struct StreamSafe<I> {
    /// Underlying source of characters.
    iter: I,
    /// Length of the current run of contiguous non-starters (counted in NFKD)
    /// that has already been emitted.
    nonstarter_count: usize,
    /// Character that was pulled from `iter` but held back because a CGJ was
    /// emitted in its place; it is yielded by the following call to `next`.
    buffer: Option<char>,
}
21
22impl<I> StreamSafe<I> {
23    pub(crate) fn new(iter: I) -> Self {
24        Self {
25            iter,
26            nonstarter_count: 0,
27            buffer: None,
28        }
29    }
30}
31
32impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
33    type Item = char;
34
35    #[inline]
36    fn next(&mut self) -> Option<char> {
37        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
38            None => return None,
39            Some(c) => c,
40        };
41        let d = classify_nonstarters(next_ch);
42        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
43            // Since we're emitting a CGJ, the suffix of the emitted string in NFKD has no trailing
44            // nonstarters, so we can reset the counter to zero. Put `next_ch` back into the
45            // iterator (via `self.buffer`), and we'll reclassify it next iteration.
46            self.nonstarter_count = 0;
47            self.buffer = Some(next_ch);
48            return Some(COMBINING_GRAPHEME_JOINER);
49        }
50
51        // Is the character all nonstarters in NFKD? If so, increment our counter of contiguous
52        // nonstarters in NKFD.
53        if d.leading_nonstarters == d.decomposition_len {
54            self.nonstarter_count += d.decomposition_len;
55        }
56        // Otherwise, reset the counter to the decomposition's number of trailing nonstarters.
57        else {
58            self.nonstarter_count = d.trailing_nonstarters;
59        }
60        Some(next_ch)
61    }
62}
63
64impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StreamSafe<I> {}
65
/// Summary of a character's full decomposition as needed for stream-safe
/// bookkeeping: how long it is and how many non-starters sit at each end.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct Decomposition {
    /// Number of non-starters at the beginning of the decomposition.
    pub(crate) leading_nonstarters: usize,
    /// Number of non-starters at the end of the decomposition.
    pub(crate) trailing_nonstarters: usize,
    /// Total number of characters in the decomposition.
    pub(crate) decomposition_len: usize,
}
72
73#[inline]
74pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
75    // As usual, fast path for ASCII (which is always a starter)
76    if c <= '\x7f' {
77        return Decomposition {
78            leading_nonstarters: 0,
79            trailing_nonstarters: 0,
80            decomposition_len: 1,
81        };
82    }
83    // Next, special case Hangul, since it's not handled by our tables.
84    if is_hangul_syllable(c) {
85        return Decomposition {
86            leading_nonstarters: 0,
87            trailing_nonstarters: 0,
88            decomposition_len: hangul_decomposition_length(c),
89        };
90    }
91    let decomp = compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c));
92    match decomp {
93        Some(decomp) => Decomposition {
94            leading_nonstarters: stream_safe_leading_nonstarters(c),
95            trailing_nonstarters: stream_safe_trailing_nonstarters(c),
96            decomposition_len: decomp.len(),
97        },
98        None => {
99            let is_nonstarter = canonical_combining_class(c) != 0;
100            let nonstarter = if is_nonstarter { 1 } else { 0 };
101            Decomposition {
102                leading_nonstarters: nonstarter,
103                trailing_nonstarters: nonstarter,
104                decomposition_len: 1,
105            }
106        }
107    }
108}
109
110#[cfg(test)]
111mod tests {
112    use super::{classify_nonstarters, StreamSafe};
113    use crate::lookups::canonical_combining_class;
114    use crate::normalize::decompose_compatible;
115
116    #[cfg(not(feature = "std"))]
117    use alloc::{string::String, vec::Vec};
118
119    use core::char;
120
121    fn stream_safe(s: &str) -> String {
122        StreamSafe::new(s.chars()).collect()
123    }
124
125    #[test]
126    fn test_simple() {
127        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
128        assert_eq!(stream_safe(technically_okay), technically_okay);
129
130        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
131        let fixed_it = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}ngerzone";
132        assert_eq!(stream_safe(too_much), fixed_it);
133
134        let woah_nelly = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
135        let its_cool = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{034f}\u{032e}\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{034f}\u{031d}\u{032e}ngerzone";
136        assert_eq!(stream_safe(woah_nelly), its_cool);
137    }
138
139    #[test]
140    fn test_all_nonstarters() {
141        let s = "\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}\u{0300}";
142        let expected = "\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{034F}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}\u{300}";
143        assert_eq!(stream_safe(s), expected);
144    }
145
146    #[test]
147    fn test_classify_nonstarters() {
148        // Highest character in the `compat_fully_decomp` table is 2FA1D
149        for ch in 0..0x2FA1E {
150            let ch = match char::from_u32(ch) {
151                Some(c) => c,
152                None => continue,
153            };
154            let c = classify_nonstarters(ch);
155            let mut s = Vec::new();
156            decompose_compatible(ch, |c| s.push(c));
157
158            assert_eq!(s.len(), c.decomposition_len);
159
160            let num_leading = s
161                .iter()
162                .take_while(|&c| canonical_combining_class(*c) != 0)
163                .count();
164            let num_trailing = s
165                .iter()
166                .rev()
167                .take_while(|&c| canonical_combining_class(*c) != 0)
168                .count();
169
170            assert_eq!(num_leading, c.leading_nonstarters);
171            assert_eq!(num_trailing, c.trailing_nonstarters);
172        }
173    }
174}
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy