diff --git a/src/normalize.rs b/src/normalize.rs index e878642..e59b667 100644 --- a/src/normalize.rs +++ b/src/normalize.rs @@ -71,6 +71,7 @@ where } #[inline] +#[allow(unsafe_code)] fn decompose(c: char, decompose_char: D, mut emit_char: F) where D: Fn(char) -> Option<&'static [char]>, @@ -84,7 +85,10 @@ where // Perform decomposition for Hangul if is_hangul_syllable(c) { - decompose_hangul(c, emit_char); + // Safety: Hangul Syllables invariant checked by is_hangul_syllable above + unsafe { + decompose_hangul(c, emit_char); + } return; } @@ -127,27 +131,37 @@ const T_LAST: u32 = T_BASE + T_COUNT - 1; // i.e. `T_BASE + 1 ..= T_LAST`. const T_FIRST: u32 = T_BASE + 1; +// Safety-usable invariant: This ensures that c is a valid Hangul Syllable character (U+AC00..U+D7AF) pub(crate) fn is_hangul_syllable(c: char) -> bool { + // Safety: This checks the range 0xAC00 (S_BASE) to 0xD7A4 (S_BASE + S_COUNT), upholding the safety-usable invariant (c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT) } // Decompose a precomposed Hangul syllable -#[allow(unsafe_code)] +// Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF +#[allow(unsafe_code, unused_unsafe)] #[inline(always)] -fn decompose_hangul(s: char, mut emit_char: F) +unsafe fn decompose_hangul(s: char, mut emit_char: F) where F: FnMut(char), { + // This will be at most 0x2baf, the size of the Hangul Syllables block let s_index = s as u32 - S_BASE; + // This will be at most 0x2baf / (21 * 28), 19 let l_index = s_index / N_COUNT; unsafe { + // Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) emit_char(char::from_u32_unchecked(L_BASE + l_index)); + // Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which gives us an upper bound of V_COUNT = 21 let v_index = (s_index % N_COUNT) / T_COUNT; + // Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) emit_char(char::from_u32_unchecked(V_BASE + v_index)); + // Safety: This will be at most T_COUNT - 1 (27) let t_index = s_index % T_COUNT; if t_index > 0 { + // Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800) emit_char(char::from_u32_unchecked(T_BASE + t_index)); } } @@ -173,14 +187,23 @@ fn compose_hangul(a: char, b: char) -> Option { match (a, b) { // Compose a leading consonant and a vowel together into an LV_Syllable (L_BASE..=L_LAST, V_BASE..=V_LAST) => { + // Safety: based on the above bounds, l_index will be less than or equal to L_COUNT (19) + // and v_index will be <= V_COUNT (21) let l_index = a - L_BASE; let v_index = b - V_BASE; + // Safety: This will be <= 19 * (20 * 21) + (21 * 20), which is 8400. let lv_index = l_index * N_COUNT + v_index * T_COUNT; + // Safety: This is between 0xAC00 and 0xCCD0, which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range + // for BMP unicode let s = S_BASE + lv_index; + // Safety: We've verified this is in-range Some(unsafe { char::from_u32_unchecked(s) }) } // Compose an LV_Syllable and a trailing consonant into an LVT_Syllable (S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => { + // Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28). b - T_BASE is between 0 and 19. + // Adding a number 0 to 19 to a number that is at largest 0xD7A4 will not go out of bounds to 0xD800 (where the + // surrogates start), so this is safe. Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) }) } _ => None, pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy