Skip to content

Commit 887d390

Browse files
Add API to transform into KS X 1026-1 standard Korean syllables
Gated behind the `ks_x_1026-1` feature.
1 parent c24ac7f commit 887d390

File tree

3 files changed

+262
-60
lines changed

3 files changed

+262
-60
lines changed

src/lib.rs

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,10 @@ pub use crate::quick_check::{
7373
};
7474
pub use crate::recompose::Recompositions;
7575
pub use crate::replace::Replacements;
76-
pub use crate::standardize_korean_syllables::StandardKoreanSyllables;
76+
pub use crate::standardize_korean_syllables::StandardizeKoreanSyllables;
77+
#[cfg(feature = "ks_x_1026-1")]
78+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
79+
pub use crate::standardize_korean_syllables::StandardizeKoreanSyllablesKsX1026_1;
7780
pub use crate::stream_safe::StreamSafe;
7881
pub use crate::tables::UNICODE_VERSION;
7982
use core::{option, str::Chars};
@@ -148,9 +151,9 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
148151
/// inserted according to the Stream-Safe Text Process ([UAX15-D4](https://unicode.org/reports/tr15/#UAX15-D4))
149152
fn stream_safe(self) -> StreamSafe<I>;
150153

151-
/// An iterator over the string with Hangul choseong and jugseong filler characters inserted
154+
/// An iterator over the string with Hangul choseong and jungseong filler characters inserted
152155
/// to ensure that all Korean syllable blocks are in standard form according to [UAX29](https://www.unicode.org/reports/tr29/#Transforming_Into_SKS).
153-
fn standard_korean_syllables(self) -> StandardKoreanSyllables<I>;
156+
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<I>;
154157

155158
/// An iterator over the string in the variant of Unicode Normalization Form KD
156159
/// defined by Korean Standard X 1026-1. This normalization differs from that defined by Unicode
@@ -183,6 +186,12 @@ pub trait UnicodeNormalization<I: Iterator<Item = char>> {
183186
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
184187

185188
fn nfkc_ks_x_1026_1(self) -> RecomposeHangul<Recompositions<NormalizeJamoKdkc<I>>>;
189+
190+
/// An iterator over the string with Hangul choseong and jungseong filler characters inserted
191+
/// to ensure that all Korean syllable blocks are in standard form according to KS X 1026-1 § 7.8.
192+
#[cfg(feature = "ks_x_1026-1")]
193+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
194+
fn standard_korean_syllables_ks_x_1026_1(self) -> StandardizeKoreanSyllablesKsX1026_1<I>;
186195
}
187196

188197
impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
@@ -217,8 +226,8 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
217226
}
218227

219228
#[inline]
220-
fn standard_korean_syllables(self) -> StandardKoreanSyllables<Chars<'a>> {
221-
StandardKoreanSyllables::new(self.chars())
229+
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<Chars<'a>> {
230+
StandardizeKoreanSyllables::new(self.chars())
222231
}
223232

224233
#[cfg(feature = "ks_x_1026-1")]
@@ -243,6 +252,14 @@ impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
243252
self.chars(),
244253
)))
245254
}
255+
256+
#[cfg(feature = "ks_x_1026-1")]
257+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
258+
fn standard_korean_syllables_ks_x_1026_1(
259+
self,
260+
) -> StandardizeKoreanSyllablesKsX1026_1<Chars<'a>> {
261+
StandardizeKoreanSyllablesKsX1026_1::new(self.chars())
262+
}
246263
}
247264

248265
impl UnicodeNormalization<option::IntoIter<char>> for char {
@@ -277,8 +294,8 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
277294
}
278295

279296
#[inline]
280-
fn standard_korean_syllables(self) -> StandardKoreanSyllables<option::IntoIter<char>> {
281-
StandardKoreanSyllables::new(Some(self).into_iter())
297+
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<option::IntoIter<char>> {
298+
StandardizeKoreanSyllables::new(Some(self).into_iter())
282299
}
283300

284301
#[cfg(feature = "ks_x_1026-1")]
@@ -305,6 +322,14 @@ impl UnicodeNormalization<option::IntoIter<char>> for char {
305322
Some(self).into_iter(),
306323
)))
307324
}
325+
326+
#[cfg(feature = "ks_x_1026-1")]
327+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
328+
fn standard_korean_syllables_ks_x_1026_1(
329+
self,
330+
) -> StandardizeKoreanSyllablesKsX1026_1<option::IntoIter<char>> {
331+
StandardizeKoreanSyllablesKsX1026_1::new(Some(self).into_iter())
332+
}
308333
}
309334

310335
impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
@@ -339,8 +364,8 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
339364
}
340365

341366
#[inline]
342-
fn standard_korean_syllables(self) -> StandardKoreanSyllables<I> {
343-
StandardKoreanSyllables::new(self)
367+
fn standard_korean_syllables(self) -> StandardizeKoreanSyllables<I> {
368+
StandardizeKoreanSyllables::new(self)
344369
}
345370

346371
#[cfg(feature = "ks_x_1026-1")]
@@ -363,4 +388,10 @@ impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
363388
fn nfkc_ks_x_1026_1(self) -> RecomposeHangul<Recompositions<NormalizeJamoKdkc<I>>> {
364389
RecomposeHangul::new(recompose::new_compatible(NormalizeJamoKdkc::new(self)))
365390
}
391+
392+
#[cfg(feature = "ks_x_1026-1")]
393+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
394+
fn standard_korean_syllables_ks_x_1026_1(self) -> StandardizeKoreanSyllablesKsX1026_1<I> {
395+
StandardizeKoreanSyllablesKsX1026_1::new(self)
396+
}
366397
}

src/standardize_korean_syllables.rs

Lines changed: 172 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use core::iter::FusedIterator;
1+
use core::{iter::FusedIterator, marker::PhantomData};
22

33
use tinyvec::ArrayVec;
44

@@ -42,17 +42,27 @@ impl JamoKind {
4242
}
4343
}
4444

45-
/// Iterator over a string's characters, with '\u{115F}' and '\u{1160}' inserted
46-
/// where needed to ensure all Korean syllable blocks are in standard form
47-
/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables).
45+
trait NormalizeKoreanSyllables {
46+
fn insert_fillers(
47+
next_c: Option<char>,
48+
prev_end_jamo_kind: Option<JamoKind>,
49+
next_start_jamo_kind: Option<JamoKind>,
50+
buf: &mut ArrayVec<[Option<char>; 3]>,
51+
) -> Option<char>;
52+
}
53+
54+
// Used to abstract over UAX29 and KS X 1026-1 rules
4855
#[derive(Clone, Debug)]
49-
pub struct StandardKoreanSyllables<I> {
56+
struct StandardizeKoreanSyllablesInner<I, N> {
5057
prev_end_jamo_kind: Option<JamoKind>,
5158
buf: ArrayVec<[Option<char>; 3]>,
5259
inner: I,
60+
normalizer: PhantomData<N>,
5361
}
5462

55-
impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
63+
impl<I: Iterator<Item = char>, N: NormalizeKoreanSyllables> Iterator
64+
for StandardizeKoreanSyllablesInner<I, N>
65+
{
5666
type Item = char;
5767

5868
fn next(&mut self) -> Option<Self::Item> {
@@ -65,7 +75,7 @@ impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
6575
next_c.map_or((None, None), JamoKind::of);
6676
self.prev_end_jamo_kind = next_end_jamo_kind;
6777

68-
insert_fillers(
78+
N::insert_fillers(
6979
next_c,
7080
prev_end_jamo_kind,
7181
next_start_jamo_kind,
@@ -87,50 +97,169 @@ impl<I: Iterator<Item = char>> Iterator for StandardKoreanSyllables<I> {
8797
}
8898
}
8999

90-
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StandardKoreanSyllables<I> {}
100+
impl<I: Iterator<Item = char> + FusedIterator, N: NormalizeKoreanSyllables> FusedIterator
101+
for StandardizeKoreanSyllablesInner<I, N>
102+
{
103+
}
91104

92-
#[inline]
93-
fn insert_fillers(
94-
next_c: Option<char>,
95-
prev_end_jamo_kind: Option<JamoKind>,
96-
next_start_jamo_kind: Option<JamoKind>,
97-
buf: &mut ArrayVec<[Option<char>; 3]>,
98-
) -> Option<char> {
99-
match (prev_end_jamo_kind, next_start_jamo_kind) {
100-
// Insert choseong filler before V not preceded by L or V
101-
(None, Some(JamoKind::V)) | (Some(JamoKind::T), Some(JamoKind::V)) => {
102-
buf.push(next_c);
103-
Some('\u{115F}')
104-
}
105-
// Insert choseong and jungseong fillers before T preceded non-jamo
106-
(None, Some(JamoKind::T)) => {
107-
buf.push(next_c);
108-
buf.push(Some('\u{1160}'));
109-
Some('\u{115F}')
110-
}
111-
// Insert V filler between L and non-jamo
112-
(Some(JamoKind::L), None) => {
113-
buf.push(next_c);
114-
Some('\u{1160}')
105+
impl<I, N> StandardizeKoreanSyllablesInner<I, N> {
106+
#[inline]
107+
fn new(iter: I) -> Self {
108+
Self {
109+
prev_end_jamo_kind: None,
110+
buf: ArrayVec::new(),
111+
inner: iter,
112+
normalizer: PhantomData,
115113
}
116-
// For L followed by T, insert V filler, L filler, then another V filler
117-
(Some(JamoKind::L), Some(JamoKind::T)) => {
118-
buf.push(next_c);
119-
buf.push(Some('\u{1160}'));
120-
buf.push(Some('\u{115F}'));
121-
Some('\u{1160}')
114+
}
115+
}
116+
117+
// UAX 29 normalization
118+
119+
#[derive(Clone, Debug)]
120+
struct Uax29;
121+
122+
impl NormalizeKoreanSyllables for Uax29 {
123+
#[inline]
124+
fn insert_fillers(
125+
next_c: Option<char>,
126+
prev_end_jamo_kind: Option<JamoKind>,
127+
next_start_jamo_kind: Option<JamoKind>,
128+
buf: &mut ArrayVec<[Option<char>; 3]>,
129+
) -> Option<char> {
130+
match (prev_end_jamo_kind, next_start_jamo_kind) {
131+
// Insert choseong filler before V not preceded by L or V
132+
(None, Some(JamoKind::V)) | (Some(JamoKind::T), Some(JamoKind::V)) => {
133+
buf.push(next_c);
134+
Some('\u{115F}')
135+
}
136+
// Insert choseong and jungseong fillers before T preceded by non-jamo
137+
(None, Some(JamoKind::T)) => {
138+
buf.push(next_c);
139+
buf.push(Some('\u{1160}'));
140+
Some('\u{115F}')
141+
}
142+
// Insert V filler between L and non-jamo
143+
(Some(JamoKind::L), None) => {
144+
buf.push(next_c);
145+
Some('\u{1160}')
146+
}
147+
// For L followed by T, insert V filler, L filler, then another V filler
148+
(Some(JamoKind::L), Some(JamoKind::T)) => {
149+
buf.push(next_c);
150+
buf.push(Some('\u{1160}'));
151+
buf.push(Some('\u{115F}'));
152+
Some('\u{1160}')
153+
}
154+
_ => next_c,
122155
}
123-
_ => next_c,
124156
}
125157
}
126158

127-
impl<I> StandardKoreanSyllables<I> {
159+
/// Iterator over a string's characters, with U+115F and U+1160 inserted
160+
/// where needed to ensure all Korean syllable blocks are in standard form
161+
/// by [UAX29 rules](https://www.unicode.org/reports/tr29/#Standard_Korean_Syllables).
162+
#[derive(Clone, Debug)]
163+
pub struct StandardizeKoreanSyllables<I>(StandardizeKoreanSyllablesInner<I, Uax29>);
164+
165+
impl<I> StandardizeKoreanSyllables<I> {
128166
#[inline]
129167
pub(crate) fn new(iter: I) -> Self {
130-
Self {
131-
prev_end_jamo_kind: None,
132-
buf: ArrayVec::new(),
133-
inner: iter,
168+
Self(StandardizeKoreanSyllablesInner::new(iter))
169+
}
170+
}
171+
172+
impl<I: Iterator<Item = char>> Iterator for StandardizeKoreanSyllables<I> {
173+
type Item = char;
174+
175+
fn next(&mut self) -> Option<Self::Item> {
176+
self.0.next()
177+
}
178+
179+
fn size_hint(&self) -> (usize, Option<usize>) {
180+
self.0.size_hint()
181+
}
182+
}
183+
184+
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator for StandardizeKoreanSyllables<I> {}
185+
186+
// KS X 1026-1 normalization
187+
188+
#[cfg(feature = "ks_x_1026-1")]
189+
#[derive(Clone, Debug)]
190+
struct KsX1026_1;
191+
192+
#[cfg(feature = "ks_x_1026-1")]
193+
impl NormalizeKoreanSyllables for KsX1026_1 {
194+
#[inline]
195+
fn insert_fillers(
196+
next_c: Option<char>,
197+
prev_end_jamo_kind: Option<JamoKind>,
198+
next_start_jamo_kind: Option<JamoKind>,
199+
buf: &mut ArrayVec<[Option<char>; 3]>,
200+
) -> Option<char> {
201+
match (prev_end_jamo_kind, next_start_jamo_kind) {
202+
// Insert choseong filler before V preceded by V, T or non-jamo
203+
(None, Some(JamoKind::V))
204+
| (Some(JamoKind::V), Some(JamoKind::V))
205+
| (Some(JamoKind::T), Some(JamoKind::V)) => {
206+
buf.push(next_c);
207+
Some('\u{115F}')
208+
}
209+
// Insert choseong and jungseong fillers before T preceded by T or non-jamo
210+
(None, Some(JamoKind::T)) | (Some(JamoKind::T), Some(JamoKind::T)) => {
211+
buf.push(next_c);
212+
buf.push(Some('\u{1160}'));
213+
Some('\u{115F}')
214+
}
215+
// Insert V filler between L and non-jamo or other L
216+
(Some(JamoKind::L), None) | (Some(JamoKind::L), Some(JamoKind::L)) => {
217+
buf.push(next_c);
218+
Some('\u{1160}')
219+
}
220+
// For L followed by T, insert V filler, L filler, then another V filler
221+
(Some(JamoKind::L), Some(JamoKind::T)) => {
222+
buf.push(next_c);
223+
buf.push(Some('\u{1160}'));
224+
buf.push(Some('\u{115F}'));
225+
Some('\u{1160}')
226+
}
227+
_ => next_c,
134228
}
135229
}
136230
}
231+
232+
/// Iterator over a string's characters, with U+115F and U+1160 inserted
233+
/// where needed to ensure all Korean syllable blocks are in standard form
234+
/// by [KS X 1026-1](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf) rules.
235+
#[cfg(feature = "ks_x_1026-1")]
236+
#[cfg_attr(docsrs, doc(cfg(feature = "ks_x_1026-1")))]
237+
#[derive(Clone, Debug)]
238+
pub struct StandardizeKoreanSyllablesKsX1026_1<I>(StandardizeKoreanSyllablesInner<I, KsX1026_1>);
239+
240+
#[cfg(feature = "ks_x_1026-1")]
241+
impl<I> StandardizeKoreanSyllablesKsX1026_1<I> {
242+
#[inline]
243+
pub(crate) fn new(iter: I) -> Self {
244+
Self(StandardizeKoreanSyllablesInner::new(iter))
245+
}
246+
}
247+
248+
#[cfg(feature = "ks_x_1026-1")]
249+
impl<I: Iterator<Item = char>> Iterator for StandardizeKoreanSyllablesKsX1026_1<I> {
250+
type Item = char;
251+
252+
fn next(&mut self) -> Option<Self::Item> {
253+
self.0.next()
254+
}
255+
256+
fn size_hint(&self) -> (usize, Option<usize>) {
257+
self.0.size_hint()
258+
}
259+
}
260+
261+
#[cfg(feature = "ks_x_1026-1")]
262+
impl<I: Iterator<Item = char> + FusedIterator> FusedIterator
263+
for StandardizeKoreanSyllablesKsX1026_1<I>
264+
{
265+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy