Skip to content

Commit 6b86cc2

Browse files
Support KS X 1026-1
1 parent a6a221a commit 6b86cc2

File tree

7 files changed

+503
-29
lines changed

7 files changed

+503
-29
lines changed

.github/workflows/rust.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ env:
1111
CARGO_TERM_COLOR: always
1212
RUST_BACKTRACE: 1
1313
RUSTFLAGS: -D warnings
14-
RUSTDOCFLAGS: -D warnings --cfg docsrs
14+
RUSTDOCFLAGS: -D warnings
1515

1616
jobs:
1717
build:
@@ -43,6 +43,8 @@ jobs:
4343
run: cd $(find target/package/ -maxdepth 1 -mindepth 1 -type d) && cargo test --no-default-features
4444
- name: Build docs
4545
if: matrix.rust == 'nightly'
46+
env:
47+
RUSTDOCFLAGS: -D warnings --cfg docsrs
4648
run: cargo doc --all-features --verbose
4749
- name: Check formatting
4850
if: matrix.rust == 'stable'

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,8 @@ features = ["alloc"]
4040

4141
[features]
4242
default = ["std"]
43+
ks_x_1026-1 = []
4344
std = []
45+
46+
[package.metadata.docs.rs]
47+
rustc-args = ["--cfg", "feature=\"ks_x_1026-1\""]

README.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ fn main() {
2626

2727
## crates.io
2828

29-
You can use this package in your project by adding the following
30-
to your `Cargo.toml`:
29+
You can use this package in your project by adding the following to your
30+
`Cargo.toml`:
3131

3232
```toml
3333
[dependencies]
@@ -36,4 +36,15 @@ unicode-normalization = "0.1.23"
3636

3737
## `no_std` + `alloc` support
3838

39-
This crate is completely `no_std` + `alloc` compatible. This can be enabled by disabling the `std` feature, i.e. specifying `default-features = false` for this crate on your `Cargo.toml`.
39+
This crate is completely `no_std` + `alloc` compatible. This can be enabled by
40+
disabling the `std` feature, i.e. specifying `default-features = false` for this
41+
crate on your `Cargo.toml`.
42+
43+
## KS X 1026-1
44+
45+
Korean Standard KS X 1026-1 ([Korean](https://standard.go.kr/KSCI/standardIntro/getStandardSearchView.do?ksNo=KSX1026-1),
46+
[English](http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf)) is an ROK government
47+
standard that corrects some defects and makes some changes to the Unicode NFC,
48+
NFKC, and NFKD normalization forms for certain Korean characters. The
49+
`ks_x_1026-1` crate feature (disabled by default) adds methods to support these
50+
alternate normalizations.

src/ks_x_1026_1.rs

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
//! <http://std.dkuug.dk/jtc1/sc2/wg2/docs/n3422.pdf> Annex B
2+
3+
use core::{
4+
convert::{TryFrom, TryInto},
5+
iter::FusedIterator,
6+
};
7+
8+
use tinyvec::ArrayVec;
9+
10+
// § B.1.1
11+
12+
use crate::normalize::hangul_constants::{
13+
L_BASE, L_LAST, N_COUNT, S_BASE, S_COUNT, T_BASE, T_COUNT, T_LAST, V_BASE, V_LAST,
14+
};
15+
16+
// § B.1.2
17+
18+
/// Returns `true` when `t` lies in one of the two code point ranges this module
/// treats as old Hangul jongseong (trailing consonant) jamo:
/// U+11C3..=U+11FF or U+D7CB..=U+D7FB.
fn is_old_jongseong(t: char) -> bool {
    let in_first_range = ('\u{11C3}'..='\u{11FF}').contains(&t);
    let in_second_range = ('\u{D7CB}'..='\u{D7FB}').contains(&t);
    in_first_range || in_second_range
}
24+
25+
/// Iterator that decomposes modern Hangul LV syllables immediately followed by old Hangul T jamo
26+
/// into 3-character L V T sequences, as specified in KS X 1026-1 annex B.1.5.
27+
#[derive(Clone, Debug)]
28+
pub struct RecomposeHangul<I> {
29+
/// Medial vowel of a decomposed LV syllable, queued to be yielded by the following call to `next()`
30+
v: Option<char>,
31+
/// Character yielded by inner iterator in last call to its `next()`; held back as one character of lookahead
32+
last: Option<char>,
33+
inner: I,
34+
}
35+
36+
impl<I: Iterator<Item = char>> Iterator for RecomposeHangul<I> {
37+
type Item = char;
38+
39+
fn next(&mut self) -> Option<Self::Item> {
40+
if let Some(v) = self.v {
41+
// If an LV syllable was decomposed in the last call to `next`,
42+
// yield its medial vowel.
43+
self.v = None;
44+
Some(v)
45+
} else {
46+
let prev = self.last;
47+
self.last = self.inner.next();
48+
49+
if let (Some(prev), Some(next)) = (prev, self.last) {
50+
let s_index = u32::from(prev).wrapping_sub(S_BASE);
51+
if s_index < S_COUNT && s_index % T_COUNT == 0 && is_old_jongseong(next) {
52+
// We have an LV syllable followed by an old jongseong, decompose into L V
53+
let l: char = (L_BASE + s_index / N_COUNT).try_into().unwrap();
54+
self.v = Some((V_BASE + (s_index % N_COUNT) / T_COUNT).try_into().unwrap());
55+
return Some(l);
56+
}
57+
}
58+
59+
prev
60+
}
61+
}
62+
63+
#[inline]
64+
fn size_hint(&self) -> (usize, Option<usize>) {
65+
let (inner_lo, inner_hi) = self.inner.size_hint();
66+
let add_factor: usize = self.v.map_or(0, |_| 1) + self.last.map_or(0, |_| 1);
67+
(
68+
inner_lo.saturating_add(add_factor),
69+
inner_hi
70+
.and_then(|h| h.checked_mul(2))
71+
.and_then(|h| h.checked_add(add_factor)),
72+
)
73+
}
74+
}
75+
76+
// Marked fused whenever the wrapped iterator is itself fused.
impl<I> FusedIterator for RecomposeHangul<I> where I: Iterator<Item = char> + FusedIterator {}
77+
78+
impl<I: Iterator<Item = char>> RecomposeHangul<I> {
79+
#[inline]
80+
pub(crate) fn new(mut iter: I) -> Self {
81+
RecomposeHangul {
82+
v: None,
83+
last: iter.next(),
84+
inner: iter,
85+
}
86+
}
87+
}
88+
89+
// § B.2.1
90+
91+
// Lookup table for the "Hangul compatibility letter" range U+3131..=U+318E (94 code points).
// Indexed by `code point - 0x3131`; each entry is the `char` that `NormalizeJamoKdkc::next`
// substitutes for that compatibility letter before inserting fillers.
static CP_JAMO: [char; 94] = [
92+
'\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}', '\u{1104}',
93+
'\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}', '\u{111A}',
94+
'\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}', '\u{110C}',
95+
'\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{1161}', '\u{1162}',
96+
'\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}',
97+
'\u{116B}', '\u{116C}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}',
98+
'\u{1173}', '\u{1174}', '\u{1175}', '\u{1160}', '\u{1114}', '\u{1115}', '\u{11C7}', '\u{11C8}',
99+
'\u{11CC}', '\u{11CE}', '\u{11D3}', '\u{11D7}', '\u{11D9}', '\u{111C}', '\u{11DD}', '\u{11DF}',
100+
'\u{111D}', '\u{111E}', '\u{1120}', '\u{1122}', '\u{1123}', '\u{1127}', '\u{1129}', '\u{112B}',
101+
'\u{112C}', '\u{112D}', '\u{112E}', '\u{112F}', '\u{1132}', '\u{1136}', '\u{1140}', '\u{1147}',
102+
'\u{114C}', '\u{11F1}', '\u{11F2}', '\u{1157}', '\u{1158}', '\u{1159}', '\u{1184}', '\u{1185}',
103+
'\u{1188}', '\u{1191}', '\u{1192}', '\u{1194}', '\u{119E}', '\u{11A1}',
104+
];
105+
106+
// § B.2.2
107+
108+
// Lookup table for the "Halfwidth Hangul letter" range U+FFA0..=U+FFDF (64 code points).
// Indexed by `code point - 0xFFA0`. Several positions hold their own code point
// (U+FFBF, U+FFC0, U+FFC1, U+FFC8, U+FFC9, U+FFD0, U+FFD1, U+FFD8, U+FFD9, U+FFDD..=U+FFDF),
// i.e. those inputs map to themselves rather than to a U+11xx jamo.
// NOTE(review): these appear to be the unassigned gaps in the halfwidth block — confirm against the Unicode chart.
static HW_JAMO: [char; 64] = [
109+
'\u{1160}', '\u{1100}', '\u{1101}', '\u{11AA}', '\u{1102}', '\u{11AC}', '\u{11AD}', '\u{1103}',
110+
'\u{1104}', '\u{1105}', '\u{11B0}', '\u{11B1}', '\u{11B2}', '\u{11B3}', '\u{11B4}', '\u{11B5}',
111+
'\u{111A}', '\u{1106}', '\u{1107}', '\u{1108}', '\u{1121}', '\u{1109}', '\u{110A}', '\u{110B}',
112+
'\u{110C}', '\u{110D}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}', '\u{FFBF}',
113+
'\u{FFC0}', '\u{FFC1}', '\u{1161}', '\u{1162}', '\u{1163}', '\u{1164}', '\u{1165}', '\u{1166}',
114+
'\u{FFC8}', '\u{FFC9}', '\u{1167}', '\u{1168}', '\u{1169}', '\u{116A}', '\u{116B}', '\u{116C}',
115+
'\u{FFD0}', '\u{FFD1}', '\u{116D}', '\u{116E}', '\u{116F}', '\u{1170}', '\u{1171}', '\u{1172}',
116+
'\u{FFD8}', '\u{FFD9}', '\u{1173}', '\u{1174}', '\u{1175}', '\u{FFDD}', '\u{FFDE}', '\u{FFDF}',
117+
];
118+
119+
// § B.2.3
120+
121+
// Lookup table shared by the "Parenthesized Hangul letter" range U+3200..=U+320D
// (indexed by `code point - 0x3200`) and the "Circled Hangul letter" range
// U+3260..=U+326D (indexed by `code point - 0x3260`); 14 entries.
static PC_JAMO: [char; 14] = [
122+
'\u{1100}', '\u{1102}', '\u{1103}', '\u{1105}', '\u{1106}', '\u{1107}', '\u{1109}', '\u{110B}',
123+
'\u{110C}', '\u{110E}', '\u{110F}', '\u{1110}', '\u{1111}', '\u{1112}',
124+
];
125+
126+
// § B.2.4
127+
128+
/// Iterator that decomposes compatibility characters containing Hangul jamo
129+
/// in a manner that avoids introducing new nonstandard jamo sequences,
130+
/// as specified in KS X 1026-1 annex B.2.4.
131+
#[derive(Clone, Debug)]
132+
pub struct NormalizeJamoKdkc<I> {
133+
inner: I,
134+
// Buffer for when a character normalizes into multiple.
135+
// Characters are pushed to and popped from the end (LIFO), so they are pushed in reverse output order.
136+
// Capacity 3 is sufficient: the longest possible expansion
137+
// is for a parenthesized choseong like U+3200,
138+
// which expands into ['(', <choseong>, '\u{1160}', ')'] (length 4),
139+
// and the first character is returned directly, never buffered. (There are no parenthesized jungseong or jongseong.)
140+
buf: ArrayVec<[char; 3]>,
141+
}
142+
143+
impl<I: Iterator<Item = char>> Iterator for NormalizeJamoKdkc<I> {
144+
type Item = char;
145+
146+
fn next(&mut self) -> Option<Self::Item> {
147+
if let Some(c) = self.buf.pop() {
148+
// Empty buffer before yielding from underlying iterator.
149+
Some(c)
150+
} else {
151+
let ch = self.inner.next()?;
152+
// Whether ch is a parenthesized Hangul letter
153+
let mut pf = false;
154+
155+
let uch: u32 = ch.into();
156+
let base_jamo: char = match uch {
157+
// Hangul compatibility letter
158+
0x3131..=0x318E => CP_JAMO[usize::try_from(uch - 0x3131).unwrap()],
159+
160+
// Parenthesized Hangul letter
161+
0x3200..=0x320D => {
162+
pf = true;
163+
self.buf.push(')');
164+
PC_JAMO[usize::try_from(uch - 0x3200).unwrap()]
165+
}
166+
167+
// Circled Hangul letter
168+
0x3260..=0x326D => PC_JAMO[usize::try_from(uch - 0x3260).unwrap()],
169+
170+
// Halfwidth Hangul letter
171+
0xFFA0..=0xFFDF => HW_JAMO[usize::try_from(uch - 0xFFA0).unwrap()],
172+
173+
_ => return Some(ch),
174+
};
175+
176+
// Insert fillers
177+
let first_ret: char = match base_jamo.into() {
178+
// `base_jamo` is choseong, yield a jungseong filler after
179+
L_BASE..=L_LAST => {
180+
self.buf.push('\u{1160}');
181+
base_jamo
182+
}
183+
184+
// `base_jamo` is jungseong, yield a choseong filler before
185+
V_BASE..=V_LAST => {
186+
self.buf.push(base_jamo);
187+
'\u{115F}'
188+
}
189+
190+
// `base_jamo` is jongseong, yield a choseong and a jungseong filler before
191+
T_BASE..=T_LAST => {
192+
self.buf.push(base_jamo);
193+
self.buf.push('\u{1160}');
194+
'\u{115F}'
195+
}
196+
197+
_ => unreachable!("`base_jamo` shluld be a jamo, but is not"),
198+
};
199+
200+
if pf {
201+
// Parenthesized Hangul letter, yield open paren before
202+
self.buf.push(first_ret);
203+
Some('(')
204+
} else {
205+
Some(first_ret)
206+
}
207+
}
208+
}
209+
210+
#[inline]
211+
fn size_hint(&self) -> (usize, Option<usize>) {
212+
let (inner_lo, inner_hi) = self.inner.size_hint();
213+
let add_factor: usize = self.buf.len();
214+
(
215+
inner_lo.saturating_add(add_factor),
216+
inner_hi
217+
.and_then(|h| h.checked_mul(4)) // Why 4? See comment on `buf` field
218+
.and_then(|h| h.checked_add(add_factor)),
219+
)
220+
}
221+
}
222+
223+
// Marked fused whenever the wrapped iterator is itself fused.
impl<I> FusedIterator for NormalizeJamoKdkc<I> where I: Iterator<Item = char> + FusedIterator {}
224+
225+
impl<I: Iterator<Item = char>> NormalizeJamoKdkc<I> {
226+
#[inline]
227+
pub(crate) fn new(iter: I) -> Self {
228+
NormalizeJamoKdkc {
229+
inner: iter,
230+
buf: ArrayVec::new(),
231+
}
232+
}
233+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy