Skip to content

Commit f551255

Browse files
committed
Add support for jamo and reserved code points.
1 parent e07b437 commit f551255

File tree

7 files changed

+365
-67
lines changed

7 files changed

+365
-67
lines changed

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
[package]
22
name = "unicode-charname"
33
version = "0.1.0"
4-
authors = ["CrLF0710"]
4+
authors = ["Charles Lew <crlf0710@gmail.com>"]
55
edition = "2018"
6-
6+
license = "MIT/Apache-2.0"
77
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
88

99
[dependencies]

scripts/unicode.py

Lines changed: 80 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
# Since this should not require frequent updates, we just store this
1717
# out-of-line and check the unicode.rs file into git.
1818

19-
import fileinput, re, os, sys
19+
import fileinput
20+
import re
21+
import os
22+
import sys
2023

2124
preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
2225
// file at the top-level directory of this distribution and at
@@ -36,11 +39,13 @@
3639

3740
UNICODE_VERSION = (13, 0, 0)
3841

39-
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
42+
UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
43+
4044

4145
def escape_char(c):
    r"""Return the integer code point *c* as Rust char-literal source text.

    The result is a single-quoted Rust ``\u{...}`` escape with the code
    point in lowercase hexadecimal, e.g. ``0x41`` becomes ``'\u{41}'``.

    Raises:
        TypeError: if *c* is not an integer.
    """
    # Wrap in a 1-tuple so a tuple argument fails loudly instead of being
    # unpacked as several format arguments by the % operator.
    return "'\\u{%x}'" % (c,)
4347

48+
4449
def fetch(f):
4550
if not os.path.exists(os.path.basename(f)):
4651
if "emoji" in f:
@@ -55,6 +60,8 @@ def fetch(f):
5560
exit(1)
5661

5762
# Implementation from unicode-segmentation
63+
64+
5865
def load_names(f, interestingprops):
5966
fetch(f)
6067
normal_names = {}
@@ -76,17 +83,21 @@ def load_names(f, interestingprops):
7683
normal_names[d_ch] = d_name
7784
return (normal_names, special_names)
7885

86+
7987
SPACE_SYMBOL = ' '
CODEPOINT_SYMBOL = '@'
SPECIAL_SYMBOLS = ['-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

# Translation table that surrounds every special symbol with a space on each
# side. Every entry of SPECIAL_SYMBOLS is a single character, which is what
# str.maketrans requires for dict keys. Built once at import time.
_SPECIAL_SYMBOL_PADDING = str.maketrans(
    {symbol: SPACE_SYMBOL + symbol + SPACE_SYMBOL for symbol in SPECIAL_SYMBOLS})


def tokenize(str, codepoint):
    """Prepare a character name for splitting on SPACE_SYMBOL.

    Every occurrence of *codepoint* in the name is first replaced by
    CODEPOINT_SYMBOL; afterwards each hyphen and digit listed in
    SPECIAL_SYMBOLS is padded with SPACE_SYMBOL on both sides so that it
    becomes a token of its own.

    Args:
        str: the character name (the parameter name shadows the builtin and
            is kept only so existing callers keep working).
        codepoint: text to substitute with CODEPOINT_SYMBOL; an empty value
            means "nothing to substitute".

    Returns:
        The padded name as a single string.
    """
    name_sep = str
    # str.replace("", "@") would insert "@" between every character, so an
    # empty codepoint is skipped rather than substituted.
    if codepoint:
        name_sep = name_sep.replace(codepoint, CODEPOINT_SYMBOL)
    # One pass over the string; equivalent to replacing the symbols one at a
    # time because no replacement text contains another special symbol.
    return name_sep.translate(_SPECIAL_SYMBOL_PADDING)
8999

100+
90101
def make_wordset(names):
91102
word_set = {}
92103
word_set[SPACE_SYMBOL] = SPACE_SYMBOL
@@ -97,6 +108,7 @@ def make_wordset(names):
97108
word_set[word] = word
98109
return word_set
99110

111+
100112
class WordIndex:
101113
def __init__(self, normal_names):
102114
word_set = make_wordset(normal_names)
@@ -123,7 +135,7 @@ def __init__(self, normal_names):
123135
self.word_map = word_map
124136
self.special_map = special_map
125137
self.special_list = special_list
126-
138+
127139
def encode(self, name, codepoint):
128140
name_sep = tokenize(name, codepoint)
129141
word_list = name_sep.split(SPACE_SYMBOL)
@@ -143,16 +155,18 @@ def encode(self, name, codepoint):
143155
encoded_sequence.append(self.word_map[SPACE_SYMBOL])
144156
name_build += SPACE_SYMBOL
145157
if not name[len(name_build):].startswith(word):
146-
raise Exception("Divergence! [%s] - [%s] vs [%s]" % (name, name[len(name_build):], word))
158+
raise Exception(
159+
"Divergence! [%s] - [%s] vs [%s]" % (name, name[len(name_build):], word))
147160
name_build += word
148161
encoded_sequence.append(word_idx)
149162
# encoded_sequence.append(word)
150163
last_is_special = word_is_special
151164
if name_build != name:
152165
raise Exception("Different! [%s] vs [%s]" % (name, name_build))
153-
166+
154167
return encoded_sequence
155168

169+
156170
def create_intervals(list):
157171
list.sort()
158172
in_group = False
@@ -167,8 +181,10 @@ def create_intervals(list):
167181
in_group = False
168182
return result
169183

184+
170185
def create_normal_groups(normal_names):
171-
normal_intervals = create_intervals([int(key, 16) for key in normal_names.keys()])
186+
normal_intervals = create_intervals(
187+
[int(key, 16) for key in normal_names.keys()])
172188
encoded_groups = []
173189
for first, last in normal_intervals:
174190
group_buffer = []
@@ -184,6 +200,7 @@ def create_normal_groups(normal_names):
184200
encoded_groups.append((first, last, group_buffer, pos_buffer))
185201
return encoded_groups
186202

203+
187204
def create_special_groups(special_names):
188205
item_idx = 0
189206
item_count = len(special_names)
@@ -198,9 +215,11 @@ def create_special_groups(special_names):
198215
label = m1.group(1)
199216
m2 = re2.match(special_names[item_idx + 1][1])
200217
if not m2 or m2.group(1) != label:
201-
raise Exception("Pair mismatch! [%s] vs [%s]" % (special_names[item_idx], special_names[item_idx + 1]))
202-
203-
result.append((int(special_names[item_idx][0], 16), int(special_names[item_idx + 1][0], 16), label))
218+
raise Exception("Pair mismatch! [%s] vs [%s]" % (
219+
special_names[item_idx], special_names[item_idx + 1]))
220+
221+
result.append((int(special_names[item_idx][0], 16), int(
222+
special_names[item_idx + 1][0], 16), label))
204223
item_idx += 2
205224
continue
206225
m3 = re3.match(item_text)
@@ -211,25 +230,30 @@ def create_special_groups(special_names):
211230
while try_item_idx < item_count and special_names[try_item_idx][1] == item_text:
212231
last_repeat_item_idx = try_item_idx
213232
try_item_idx += 1
214-
result.append((int(special_names[item_idx][0], 16), int(special_names[last_repeat_item_idx][0], 16), label))
233+
result.append((int(special_names[item_idx][0], 16), int(
234+
special_names[last_repeat_item_idx][0], 16), label))
215235
item_idx = last_repeat_item_idx + 1
216236
continue
217-
237+
218238
raise Exception("Unexpected item: %s" % item_text)
219239
return result
220240

241+
221242
def write_enumeration_char_names(rf, encoded_groups):
    """Write the ENUMERATION_CHAR_NAMES Rust table to *rf*.

    Args:
        rf: writable text stream; only its ``write`` method is used.
        encoded_groups: iterable of ``(first, last, group_buffer,
            pos_buffer)`` tuples, where the two buffers are sequences of
            integers. One Rust tuple is written per entry, in order.
    """
    def rust_slice(values):
        # Format the integers explicitly instead of relying on repr(list)
        # happening to look like a Rust array literal; this also accepts
        # tuples and other iterables.
        return "&[" + ", ".join("%d" % value for value in values) + "]"

    rf.write("""
pub const ENUMERATION_CHAR_NAMES: &'static [(u32, u32, &'static [u16], &'static [u32])] = &[
""")
    for (first, last, group_buffer, pos_buffer) in encoded_groups:
        rf.write("\t(%d, %d, %s, %s),\n" %
                 (first, last, rust_slice(group_buffer), rust_slice(pos_buffer)))
    rf.write("""];
""")
229251

252+
230253
def write_special_groups(rf, special_groups):
231254
rf.write("""
232255
#[allow(non_camel_case_types)]
256+
#[derive(Copy, Clone, PartialEq, Eq)]
233257
pub enum SpecialGroup {
234258
""")
235259
for ((_, _, groupname)) in special_groups:
@@ -249,8 +273,27 @@ def write_special_groups(rf, special_groups):
249273
if (idx + 1) % 2 == 0:
250274
rf.write('\n')
251275
rf.write("""];
276+
277+
pub fn find_in_special_groups(ch: u32) -> Option<SpecialGroup> {
278+
let record_idx = SPECIAL_GROUPS
279+
.binary_search_by(|record| {
280+
use std::cmp::Ordering;
281+
if record.1 < ch {
282+
Ordering::Less
283+
} else if record.0 > ch {
284+
Ordering::Greater
285+
} else {
286+
Ordering::Equal
287+
}
288+
})
289+
.ok()?;
290+
let group = SPECIAL_GROUPS[record_idx].2;
291+
Some(group)
292+
}
293+
252294
""")
253295

296+
254297
def write_word_table(rf, word_table):
255298
rf.write("""
256299
pub const ENUMERATION_WORD_TABLE: &'static [&'static str] = &[
@@ -262,8 +305,29 @@ def write_word_table(rf, word_table):
262305
if (idx + 1) % 8 == 0:
263306
rf.write('\n')
264307
rf.write("""];
308+
309+
pub fn find_in_enumerate_names(ch: u32) -> Option<&'static [u16]> {
310+
let record_idx = ENUMERATION_CHAR_NAMES
311+
.binary_search_by(|record| {
312+
use std::cmp::Ordering;
313+
if record.1 < ch {
314+
Ordering::Less
315+
} else if record.0 > ch {
316+
Ordering::Greater
317+
} else {
318+
Ordering::Equal
319+
}
320+
})
321+
.ok()?;
322+
let offset = (ch - ENUMERATION_CHAR_NAMES[record_idx].0) as usize;
323+
let index_slice = ENUMERATION_CHAR_NAMES[record_idx].2;
324+
let offset_slice = ENUMERATION_CHAR_NAMES[record_idx].3;
325+
let range = (offset_slice[offset] as usize)..(offset_slice[offset + 1] as usize);
326+
Some(&index_slice[range])
327+
}
265328
""")
266329

330+
267331
def write_special_symbols(rf, word_index):
268332
rf.write("""
269333
pub const WORD_TABLE_INDEX_SPACE: u16 = %d;
@@ -279,12 +343,12 @@ def write_special_symbols(rf, word_index):
279343
""")
280344
for (first, last) in special_intervals:
281345
rf.write("\t\t%d..=%d => true,\n" % (first, last))
282-
rf.write("""
283-
_ => false,
346+
rf.write("""\t\t_ => false,
284347
}
285348
}
286349
""")
287350

351+
288352
if __name__ == "__main__":
289353
r = "tables.rs"
290354
if os.path.exists(r):
@@ -302,9 +366,8 @@ def write_special_symbols(rf, word_index):
302366
word_index = WordIndex(normal_names)
303367
normal_encoded_groups = create_normal_groups(normal_names)
304368
special_groups = create_special_groups(special_names)
305-
369+
306370
write_enumeration_char_names(rf, normal_encoded_groups)
307371
write_special_groups(rf, special_groups)
308372
write_word_table(rf, word_index.word_list)
309373
write_special_symbols(rf, word_index)
310-

src/jamo.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#![allow(dead_code)]
2+
// This is adapted from Unicode 13.0, 3.12.
3+
4+
// Constants for Hangul syllable decomposition (Unicode 13.0, section 3.12).
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT; // 588
const S_COUNT: u32 = L_COUNT * N_COUNT; // 11172

/// Short names of the leading consonants, indexed by L index.
const JAMO_L_TABLE: &[&str] = &[
    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", "J", "JJ", "C", "K", "T", "P",
    "H",
];

/// Short names of the vowels, indexed by V index.
const JAMO_V_TABLE: &[&str] = &[
    "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", "OE", "YO", "U", "WEO", "WE",
    "WI", "YU", "EU", "YI", "I",
];

/// Short names of the trailing consonants, indexed by T index
/// (index 0 is the empty string).
const JAMO_T_TABLE: &[&str] = &[
    "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", "LS", "LT", "LP", "LH", "M",
    "B", "BS", "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
];

/// Builds the name `HANGUL SYLLABLE <L><V><T>` for the code point `s`.
///
/// # Panics
///
/// Panics if `s` is outside `S_BASE..S_BASE + S_COUNT`.
pub(crate) fn hangul_name(s: u32) -> String {
    // `s - S_BASE` would underflow for values below S_BASE and abort with an
    // arithmetic-overflow panic in debug builds before the range check ran;
    // checked_sub keeps the failure mode identical in every build profile.
    let s_index = match s.checked_sub(S_BASE) {
        Some(index) if index < S_COUNT => index,
        _ => panic!("U+{:04X} is not a precomposed Hangul syllable", s),
    };
    let l_index = s_index / N_COUNT;
    let v_index = (s_index % N_COUNT) / T_COUNT;
    let t_index = s_index % T_COUNT;
    format!(
        "HANGUL SYLLABLE {}{}{}",
        JAMO_L_TABLE[l_index as usize],
        JAMO_V_TABLE[v_index as usize],
        JAMO_T_TABLE[t_index as usize]
    )
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy