Skip to content

Commit 79687ca

Browse files
authored
Merge pull request #100 from unicode-rs/safety-comments
Add safety comments
2 parents 71a54fa + a97388a commit 79687ca

File tree

1 file changed

+26
-3
lines changed

1 file changed

+26
-3
lines changed

src/normalize.rs

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ where
7171
}
7272

7373
#[inline]
74+
#[allow(unsafe_code)]
7475
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
7576
where
7677
D: Fn(char) -> Option<&'static [char]>,
@@ -84,7 +85,10 @@ where
8485

8586
// Perform decomposition for Hangul
8687
if is_hangul_syllable(c) {
87-
decompose_hangul(c, emit_char);
88+
// Safety: Hangul Syllables invariant checked by is_hangul_syllable above
89+
unsafe {
90+
decompose_hangul(c, emit_char);
91+
}
8892
return;
8993
}
9094

@@ -127,27 +131,37 @@ const T_LAST: u32 = T_BASE + T_COUNT - 1;
127131
// i.e. `T_BASE + 1 ..= T_LAST`.
128132
const T_FIRST: u32 = T_BASE + 1;
129133

134+
// Safety-usable invariant: This ensures that c is a valid Hangul Syllable character (U+AC00..U+D7AF)
130135
pub(crate) fn is_hangul_syllable(c: char) -> bool {
136+
// Safety: This checks the range 0xAC00 (S_BASE) to 0xD7A4 (S_BASE + S_COUNT), upholding the safety-usable invariant
131137
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
132138
}
133139

134140
// Decompose a precomposed Hangul syllable
135-
#[allow(unsafe_code)]
141+
// Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF
142+
#[allow(unsafe_code, unused_unsafe)]
136143
#[inline(always)]
137-
fn decompose_hangul<F>(s: char, mut emit_char: F)
144+
unsafe fn decompose_hangul<F>(s: char, mut emit_char: F)
138145
where
139146
F: FnMut(char),
140147
{
148+
// This will be at most 0x2baf (0xD7AF - 0xAC00), the largest offset within the Hangul Syllables block
141149
let s_index = s as u32 - S_BASE;
150+
// This will be at most 0x2baf / (21 * 28), 19
142151
let l_index = s_index / N_COUNT;
143152
unsafe {
153+
// Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
144154
emit_char(char::from_u32_unchecked(L_BASE + l_index));
145155

156+
// Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21
146157
let v_index = (s_index % N_COUNT) / T_COUNT;
158+
// Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
147159
emit_char(char::from_u32_unchecked(V_BASE + v_index));
148160

161+
// Safety: This will be at most T_COUNT - 1 (27)
149162
let t_index = s_index % T_COUNT;
150163
if t_index > 0 {
164+
// Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
151165
emit_char(char::from_u32_unchecked(T_BASE + t_index));
152166
}
153167
}
@@ -173,14 +187,23 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
173187
match (a, b) {
174188
// Compose a leading consonant and a vowel together into an LV_Syllable
175189
(L_BASE..=L_LAST, V_BASE..=V_LAST) => {
190+
// Safety: based on the above bounds (assuming L_LAST and V_LAST are inclusive last values, like T_LAST), l_index will be
191+
// at most L_COUNT - 1 (18) and v_index will be at most V_COUNT - 1 (20)
176192
let l_index = a - L_BASE;
177193
let v_index = b - V_BASE;
194+
// Safety: With l_index <= 18 and v_index <= 20, this will be <= 18 * (21 * 28) + (20 * 28), which is 11144 (0x2B88).
178195
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
196+
// Safety: lv_index is at most 18 * 588 + 20 * 28 = 11144, so this is between 0xAC00 and 0xD788, which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range
197+
// for BMP unicode
179198
let s = S_BASE + lv_index;
199+
// Safety: We've verified this is in-range
180200
Some(unsafe { char::from_u32_unchecked(s) })
181201
}
182202
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
183203
(S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
204+
// Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28). b - T_BASE is between 1 and 27 (T_FIRST = T_BASE + 1, T_LAST = T_BASE + T_COUNT - 1).
205+
// Adding a number 1 to 27 to a number that is at largest 0xD7A4 will not go out of bounds to 0xD800 (where the
206+
// surrogates start), so this is safe.
184207
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
185208
}
186209
_ => None,

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy