Skip to content

Commit 8a26b3e

Browse files
Fix #125
1 parent 596e886 commit 8a26b3e

File tree

6 files changed

+352
-188
lines changed

6 files changed

+352
-188
lines changed

scripts/unicode.py

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
155155
line = " "*indent + chunk
156156
f.write(line)
157157

158-
def load_properties(f, interestingprops):
158+
def load_properties(f, interestingprops: "list[str | tuple[str, str]] | None" = None):
159159
fetch(f)
160160
props = {}
161-
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
162-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
161+
re1 = re.compile(r"^\s*([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
162+
re2 = re.compile(r"^\s*([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
163163

164164
for line in fileinput.input(os.path.basename(f)):
165165
prop = None
@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
168168
m = re1.match(line)
169169
if m:
170170
d_lo = m.group(1)
171-
d_hi = m.group(1)
171+
d_hi = d_lo
172172
prop = m.group(2)
173+
value = m.group(3)
173174
else:
174175
m = re2.match(line)
175176
if m:
176177
d_lo = m.group(1)
177178
d_hi = m.group(2)
178179
prop = m.group(3)
180+
value = m.group(4)
179181
else:
180182
continue
181-
if interestingprops and prop not in interestingprops:
183+
if value is not None:
184+
prop = (prop, value)
185+
if interestingprops is not None and prop not in interestingprops:
182186
continue
183187
d_lo = int(d_lo, 16)
184188
d_hi = int(d_hi, 16)
@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
195199
def escape_char(c):
196200
return "'\\u{%x}'" % c
197201

198-
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
202+
def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
199203
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
200204
pub_string = "const"
201205
if not is_const:
@@ -217,7 +221,7 @@ def emit_util_mod(f):
217221
f.write("""
218222
pub mod util {
219223
#[inline]
220-
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
224+
pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
221225
use core::cmp::Ordering::{Equal, Less, Greater};
222226
r.binary_search_by(|&(lo,hi)| {
223227
if lo <= c && c <= hi { Equal }
@@ -252,13 +256,22 @@ def emit_util_mod(f):
252256
253257
""")
254258

255-
def emit_property_module(f, mod, tbl, emit):
256-
f.write("mod %s {\n" % mod)
257-
for cat in sorted(emit):
258-
emit_table(f, "%s_table" % cat, tbl[cat], is_pub=False)
259+
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
260+
f.write("pub mod %s {\n" % mod)
261+
262+
cats = []
263+
for cat in emit:
264+
if type(cat) is tuple:
265+
cats.append((f"{cat[0]}_{cat[1]}", cat))
266+
else:
267+
cats.append((cat, cat))
268+
cats.sort(key=lambda x: x[0])
269+
270+
for cat_str, cat in cats:
271+
emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
259272
f.write(" #[inline]\n")
260-
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
261-
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat)
273+
f.write(" pub fn %s(c: char) -> bool {\n" % cat_str)
274+
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat_str)
262275
f.write(" }\n\n")
263276
f.write("}\n\n")
264277

@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
303316
f.write((" %sC_" % Name[0]) + cat + ",\n")
304317
f.write(""" }
305318
306-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
319+
fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
307320
use core::cmp::Ordering::{Equal, Less, Greater};
308321
match r.binary_search_by(|&(lo, hi, _)| {
309322
if lo <= c && c <= hi { Equal }
@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
355368
else:
356369
lookup_type = "u32"
357370

358-
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
371+
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&[%s]" % lookup_type,
359372
pfun=lambda x: "%d" % x,
360373
is_pub=False, is_const=True)
361374

362-
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
375+
emit_table(f, "%s_cat_table" % name, break_table, "&[(char, char, %sCat)]" % Name,
363376
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
364377
is_pub=False, is_const=True)
365378
f.write("}\n")
@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):
379392

380393
# download and parse all the data
381394
gencats = load_gencats("UnicodeData.txt")
382-
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
395+
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
383396

384397
emit_util_mod(rf)
385398
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
386-
("derived_property", derived, ["Alphabetic"]):
399+
("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
387400
emit_property_module(rf, name, cat, pfuns)
388401

402+
rf.write("""pub fn is_incb_linker(c: char) -> bool {
403+
matches!(c,""")
404+
405+
for (lo, hi) in derived[("InCB", "Linker")]:
406+
rf.write(f" | '\\u{{{lo:X}}}'")
407+
if lo != hi:
408+
# Table entries are inclusive (lo, hi) pairs, so emit an inclusive Rust range ending at `hi`.
rf.write(f"..='\\u{{{hi:X}}}'")
409+
410+
rf.write(")\n}\n\n")
411+
389412
### grapheme cluster module
390413
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
391-
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
392-
414+
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt")
393415
# Control
394416
# Note:
395417
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
398420
grapheme_cats["Control"] = group_cat(list(
399421
set(ungroup_cat(grapheme_cats["Control"]))
400422
- set(ungroup_cat([surrogate_codepoints]))))
401-
423+
grapheme_cats["InCB_Consonant"] = derived[("InCB", "Consonant")]
424+
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
425+
grapheme_cats["Extended_Pictographic"] = emoji_props["Extended_Pictographic"]
402426
grapheme_table = []
403427
for cat in grapheme_cats:
404428
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
405-
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
406-
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
407429
grapheme_table.sort(key=lambda w: w[0])
408430
last = -1
409431
for chars in grapheme_table:
410432
if chars[0] <= last:
411433
# Python 3 cannot raise a bare string (it becomes a TypeError that hides the message); raise a real exception.
raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
412434
last = chars[1]
413-
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
435+
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
414436
rf.write("\n")
415437

416-
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
438+
word_cats = load_properties("auxiliary/WordBreakProperty.txt")
417439
word_table = []
418440
for cat in word_cats:
419441
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
425447
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
426448
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
427449

428-
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
450+
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt")
429451
sentence_table = []
430452
for cat in sentence_cats:
431453
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])

scripts/unicode_gen_breaktests.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@ def showfun(x):
140140
return outstr
141141

142142
def create_grapheme_data(f):
143-
# rules 9.1 and 9.2 are for extended graphemes only
144-
optsplits = ['9.1','9.2']
143+
# rules 9.1, 9.2, and 9.3 are for extended graphemes only
144+
optsplits = ['9.1', '9.2', '9.3']
145145
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
146146

147147
test_same = []

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy