Skip to content

Commit b726540

Browse files
authored
Merge branch 'master' into patch-1
2 parents 0772fc9 + 9b20974 commit b726540

File tree

7 files changed

+124
-18
lines changed

7 files changed

+124
-18
lines changed

fuzz/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,11 @@ path = "fuzz_targets/streaming.rs"
2323
test = false
2424
doc = false
2525

26+
[[bin]]
27+
name = "process"
28+
path = "fuzz_targets/process.rs"
29+
test = false
30+
doc = false
31+
2632
# Work around https://github.com/rust-lang/cargo/issues/8338
2733
[workspace]

fuzz/fuzz_targets/process.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// This fuzzing harness fuzz-tests some of the
2+
// unicode string normalization processing
3+
4+
#![no_main]
5+
6+
#[macro_use]
7+
extern crate libfuzzer_sys;
8+
extern crate unicode_normalization;
9+
10+
use unicode_normalization::{
11+
char::{
12+
canonical_combining_class, compose, decompose_canonical, decompose_compatible,
13+
is_combining_mark,
14+
},
15+
UnicodeNormalization,
16+
};
17+
18+
fuzz_target!(|data: (u8, String)| {
19+
let (function_index, string_data) = data;
20+
21+
// Create an iterator for characters
22+
let mut chars = string_data.chars();
23+
24+
// Randomly fuzz a target function
25+
match function_index % 10 {
26+
0 => {
27+
// Fuzz compose with the first two characters of the input (they may be equal)
28+
if let (Some(c1), Some(c2)) = (chars.next(), chars.next()) {
29+
let _ = compose(c1, c2);
30+
}
31+
}
32+
1 => {
33+
// Fuzz canonical_combining_class
34+
if let Some(c) = chars.next() {
35+
let _ = canonical_combining_class(c);
36+
}
37+
}
38+
2 => {
39+
// Fuzz is_combining_mark
40+
if let Some(c) = chars.next() {
41+
let _ = is_combining_mark(c);
42+
}
43+
}
44+
3 => {
45+
// Fuzz NFC
46+
let _ = string_data.nfc().collect::<String>();
47+
}
48+
4 => {
49+
// Fuzz NFKD
50+
let _ = string_data.nfkd().collect::<String>();
51+
}
52+
5 => {
53+
// Fuzz NFD
54+
let _ = string_data.nfd().collect::<String>();
55+
}
56+
6 => {
57+
// Fuzz NFKC
58+
let _ = string_data.nfkc().collect::<String>();
59+
}
60+
7 => {
61+
// Fuzz stream_safe
62+
let _ = string_data.stream_safe().collect::<String>();
63+
}
64+
8 => {
65+
// Fuzz decompose_canonical
66+
if let Some(c) = chars.next() {
67+
decompose_canonical(c, |_| {});
68+
}
69+
}
70+
9 => {
71+
// Fuzz decompose_compatible
72+
if let Some(c) = chars.next() {
73+
decompose_compatible(c, |_| {});
74+
}
75+
}
76+
_ => {}
77+
}
78+
});

src/decompose.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ impl<I: Iterator<Item = char>> Decompositions<I> {
5353

5454
/// Create a new decomposition iterator for compatibility decompositions (NFKD)
5555
///
56-
/// Note that this iterator can also be obtained by directly calling [`.nfd()`](crate::UnicodeNormalization::nfd)
56+
/// Note that this iterator can also be obtained by directly calling [`.nfkd()`](crate::UnicodeNormalization::nfkd)
5757
/// on the iterator.
5858
#[inline]
5959
pub fn new_compatible(iter: I) -> Decompositions<I> {

src/lib.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
113113
/// (compatibility decomposition followed by canonical composition).
114114
fn nfkc(self) -> Recompositions<I>;
115115

116-
/// A transformation which replaces CJK Compatibility Ideograph codepoints
117-
/// with normal forms using Standardized Variation Sequences. This is not
116+
/// A transformation which replaces [CJK Compatibility Ideograph] codepoints
117+
/// with normal forms using [Standardized Variation Sequences]. This is not
118118
/// part of the canonical or compatibility decomposition algorithms, but
119119
/// performing it before those algorithms produces normalized output which
120120
/// better preserves the intent of the original text.
@@ -123,10 +123,15 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
123123
/// may not immediately help text display as intended, but they at
124124
/// least preserve the information in a standardized form, giving
125125
/// implementations the option to recognize them.
126+
///
127+
/// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
128+
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
126129
fn cjk_compat_variants(self) -> Replacements<I>;
127130

128131
/// An Iterator over the string with Conjoining Grapheme Joiner characters
129-
/// inserted according to the Stream-Safe Text Process (UAX15-D4)
132+
/// inserted according to the Stream-Safe Text Process ([UAX15-D4]).
133+
///
134+
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
130135
fn stream_safe(self) -> StreamSafe<I>;
131136
}
132137

@@ -153,7 +158,7 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
153158

154159
#[inline]
155160
fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
156-
replace::new_cjk_compat_variants(self.chars())
161+
Replacements::new_cjk_compat_variants(self.chars())
157162
}
158163

159164
#[inline]
@@ -185,7 +190,7 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
185190

186191
#[inline]
187192
fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
188-
replace::new_cjk_compat_variants(Some(self).into_iter())
193+
Replacements::new_cjk_compat_variants(Some(self).into_iter())
189194
}
190195

191196
#[inline]
@@ -217,7 +222,7 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
217222

218223
#[inline]
219224
fn cjk_compat_variants(self) -> Replacements<I> {
220-
replace::new_cjk_compat_variants(self)
225+
Replacements::new_cjk_compat_variants(self)
221226
}
222227

223228
#[inline]

src/normalize.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,12 @@ pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
4141
///
4242
/// [Standardized Variation Sequences] are used instead of the standard canonical
4343
/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
44-
/// to avoid losing information. See the
45-
/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
46-
/// "Other Enhancements" section of the
47-
/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
48-
/// for more information.
44+
/// to avoid losing information. See the [Unicode Variation Sequence FAQ] and the
45+
/// "Other Enhancements" section of the [Unicode 6.3 Release Summary] for more information.
46+
///
47+
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
48+
/// [Unicode Variation Sequence FAQ]: http://unicode.org/faq/vs.html
49+
/// [Unicode 6.3 Release Summary]: https://www.unicode.org/versions/Unicode6.3.0/#Summary
4950
#[inline]
5051
pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
5152
where

src/replace.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,18 @@ pub struct Replacements<I> {
2222
buffer: Option<char>,
2323
}
2424

25-
#[inline]
26-
pub fn new_cjk_compat_variants<I: Iterator<Item = char>>(iter: I) -> Replacements<I> {
27-
Replacements { iter, buffer: None }
25+
impl<I: Iterator<Item = char>> Replacements<I> {
26+
/// Create a new iterator that replaces [CJK Compatibility Ideograph] codepoints with normal forms using [Standardized Variation Sequences].
27+
///
28+
/// Note that this iterator can also be obtained by directly calling [`.cjk_compat_variants()`] on the iterator.
29+
///
30+
/// [CJK Compatibility Ideograph]: https://www.unicode.org/glossary/#compatibility_ideograph
31+
/// [Standardized Variation Sequences]: https://www.unicode.org/glossary/#standardized_variation_sequence
32+
/// [`.cjk_compat_variants()`]: crate::UnicodeNormalization::cjk_compat_variants
33+
#[inline]
34+
pub fn new_cjk_compat_variants(iter: I) -> Replacements<I> {
35+
Replacements { iter, buffer: None }
36+
}
2837
}
2938

3039
impl<I: Iterator<Item = char>> Iterator for Replacements<I> {

src/stream_safe.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,24 @@ use crate::tables::stream_safe_leading_nonstarters;
1010
pub(crate) const MAX_NONSTARTERS: usize = 30;
1111
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
1212

13-
/// UAX15-D4: This iterator keeps track of how many non-starters there have been
13+
/// [UAX15-D4]: This iterator keeps track of how many non-starters there have been
1414
/// since the last starter in *NFKD* and will emit a Combining Grapheme Joiner
1515
/// (U+034F) if the count exceeds 30.
16+
///
17+
/// [UAX15-D4]: https://www.unicode.org/reports/tr15/#UAX15-D4
1618
pub struct StreamSafe<I> {
1719
iter: I,
1820
nonstarter_count: usize,
1921
buffer: Option<char>,
2022
}
2123

22-
impl<I> StreamSafe<I> {
23-
pub(crate) fn new(iter: I) -> Self {
24+
impl<I: Iterator<Item = char>> StreamSafe<I> {
25+
/// Create a new stream safe iterator.
26+
///
27+
/// Note that this iterator can also be obtained by directly calling [`.stream_safe()`](crate::UnicodeNormalization::stream_safe)
28+
/// on the iterator.
29+
#[inline]
30+
pub fn new(iter: I) -> Self {
2431
Self {
2532
iter,
2633
nonstarter_count: 0,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy