Skip to content

Commit 592ce00

Browse files
authored
Merge pull request #134 from Jules-Bertholet/fix
Fix #125
2 parents 3ff9de6 + dce3a34 commit 592ce00

File tree

11 files changed

+1271
-2069
lines changed

11 files changed

+1271
-2069
lines changed

.github/workflows/rust.yml

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,31 @@ on:
77
branches: [ master ]
88

99
env:
10+
CARGO_INCREMENTAL: 0
1011
CARGO_TERM_COLOR: always
12+
RUST_BACKTRACE: 1
13+
RUSTFLAGS: -D warnings
14+
RUSTDOCFLAGS: -D warnings
1115

1216
jobs:
1317
build:
14-
1518
runs-on: ubuntu-latest
16-
1719
steps:
1820
- uses: actions/checkout@v2
1921
- name: Build
2022
run: cargo build --verbose
2123
- name: Run tests
2224
run: cargo test --verbose
23-
fmt:
25+
- name: Run clippy
26+
run: cargo clippy --all-targets --all --verbose
2427

28+
fmt:
2529
runs-on: ubuntu-latest
26-
2730
steps:
2831
- uses: actions/checkout@v2
2932
- name: Rustfmt
30-
run: cargo fmt --check
33+
run: cargo fmt --all --check
3134
- name: Verify regenerated files
3235
run: ./scripts/unicode.py && diff tables.rs src/tables.rs
3336
- name: Verify regenerated tests
34-
run: ./scripts/unicode_gen_breaktests.py && rustfmt testdata.rs && diff testdata.rs src/testdata.rs
37+
run: ./scripts/unicode_gen_breaktests.py && diff testdata.rs tests/testdata/mod.rs

benches/chars.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
//! is how much slower full unicode handling is.
77
88
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
9-
use unicode_segmentation;
109

1110
use std::fs;
1211
use unicode_segmentation::UnicodeSegmentation;
@@ -24,14 +23,14 @@ const FILES: &[&str] = &[
2423

2524
#[inline(always)]
2625
fn grapheme(text: &str) {
27-
for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
26+
for c in UnicodeSegmentation::graphemes(black_box(text), true) {
2827
black_box(c);
2928
}
3029
}
3130

3231
#[inline(always)]
3332
fn scalar(text: &str) {
34-
for c in black_box(&*text).chars() {
33+
for c in black_box(text).chars() {
3534
black_box(c);
3635
}
3736
}

scripts/unicode.py

Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,11 @@ def format_table_content(f, content, indent):
155155
line = " "*indent + chunk
156156
f.write(line)
157157

158-
def load_properties(f, interestingprops):
158+
def load_properties(f, interestingprops: "list[str | tuple[str, str]] | None" = None):
159159
fetch(f)
160160
props = {}
161-
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
162-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
161+
re1 = re.compile(r"^\s*([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
162+
re2 = re.compile(r"^\s*([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+)(?:\s*;\s*(\w+))?")
163163

164164
for line in fileinput.input(os.path.basename(f)):
165165
prop = None
@@ -168,17 +168,21 @@ def load_properties(f, interestingprops):
168168
m = re1.match(line)
169169
if m:
170170
d_lo = m.group(1)
171-
d_hi = m.group(1)
171+
d_hi = d_lo
172172
prop = m.group(2)
173+
value = m.group(3)
173174
else:
174175
m = re2.match(line)
175176
if m:
176177
d_lo = m.group(1)
177178
d_hi = m.group(2)
178179
prop = m.group(3)
180+
value = m.group(4)
179181
else:
180182
continue
181-
if interestingprops and prop not in interestingprops:
183+
if value is not None:
184+
prop = (prop, value)
185+
if interestingprops is not None and prop not in interestingprops:
182186
continue
183187
d_lo = int(d_lo, 16)
184188
d_hi = int(d_hi, 16)
@@ -195,7 +199,7 @@ def load_properties(f, interestingprops):
195199
def escape_char(c):
196200
return "'\\u{%x}'" % c
197201

198-
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
202+
def emit_table(f, name, t_data, t_type = "&[(char, char)]", is_pub=True,
199203
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
200204
pub_string = "const"
201205
if not is_const:
@@ -217,7 +221,7 @@ def emit_util_mod(f):
217221
f.write("""
218222
pub mod util {
219223
#[inline]
220-
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
224+
pub fn bsearch_range_table(c: char, r: &[(char,char)]) -> bool {
221225
use core::cmp::Ordering::{Equal, Less, Greater};
222226
r.binary_search_by(|&(lo,hi)| {
223227
if lo <= c && c <= hi { Equal }
@@ -252,13 +256,22 @@ def emit_util_mod(f):
252256
253257
""")
254258

255-
def emit_property_module(f, mod, tbl, emit):
256-
f.write("mod %s {\n" % mod)
257-
for cat in sorted(emit):
258-
emit_table(f, "%s_table" % cat, tbl[cat], is_pub=False)
259+
def emit_property_module(f, mod, tbl, emit: "list[str | tuple[str, str]]"):
260+
f.write("pub mod %s {\n" % mod)
261+
262+
cats = []
263+
for cat in emit:
264+
if type(cat) is tuple:
265+
cats.append((f"{cat[0]}_{cat[1]}", cat))
266+
else:
267+
cats.append((cat, cat))
268+
cats.sort(key=lambda x: x[0])
269+
270+
for cat_str, cat in cats:
271+
emit_table(f, "%s_table" % cat_str, tbl[cat], is_pub=False)
259272
f.write(" #[inline]\n")
260-
f.write(" pub fn %s(c: char) -> bool {\n" % cat)
261-
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat)
273+
f.write(" pub fn %s(c: char) -> bool {\n" % cat_str)
274+
f.write(" super::util::bsearch_range_table(c, %s_table)\n" % cat_str)
262275
f.write(" }\n\n")
263276
f.write("}\n\n")
264277

@@ -303,7 +316,7 @@ def emit_break_module(f, break_table, break_cats, name):
303316
f.write((" %sC_" % Name[0]) + cat + ",\n")
304317
f.write(""" }
305318
306-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
319+
fn bsearch_range_value_table(c: char, r: &[(char, char, %sCat)], default_lower: u32, default_upper: u32) -> (u32, u32, %sCat) {
307320
use core::cmp::Ordering::{Equal, Less, Greater};
308321
match r.binary_search_by(|&(lo, hi, _)| {
309322
if lo <= c && c <= hi { Equal }
@@ -355,11 +368,11 @@ def emit_break_module(f, break_table, break_cats, name):
355368
else:
356369
lookup_type = "u32"
357370

358-
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&'static [%s]" % lookup_type,
371+
emit_table(f, "%s_cat_lookup" % name, lookup_table, "&[%s]" % lookup_type,
359372
pfun=lambda x: "%d" % x,
360373
is_pub=False, is_const=True)
361374

362-
emit_table(f, "%s_cat_table" % name, break_table, "&'static [(char, char, %sCat)]" % Name,
375+
emit_table(f, "%s_cat_table" % name, break_table, "&[(char, char, %sCat)]" % Name,
363376
pfun=lambda x: "(%s,%s,%sC_%s)" % (escape_char(x[0]), escape_char(x[1]), Name[0], x[2]),
364377
is_pub=False, is_const=True)
365378
f.write("}\n")
@@ -379,17 +392,26 @@ def emit_break_module(f, break_table, break_cats, name):
379392

380393
# download and parse all the data
381394
gencats = load_gencats("UnicodeData.txt")
382-
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic"])
395+
derived = load_properties("DerivedCoreProperties.txt", ["Alphabetic", ("InCB", "Consonant"), ("InCB", "Extend"), ("InCB", "Linker")])
383396

384397
emit_util_mod(rf)
385398
for (name, cat, pfuns) in ("general_category", gencats, ["N"]), \
386-
("derived_property", derived, ["Alphabetic"]):
399+
("derived_property", derived, ["Alphabetic", ("InCB", "Extend")]):
387400
emit_property_module(rf, name, cat, pfuns)
388401

402+
rf.write("""pub fn is_incb_linker(c: char) -> bool {
403+
matches!(c,""")
404+
405+
for (lo, hi) in derived[("InCB", "Linker")]:
406+
rf.write(f" | '\\u{{{lo:X}}}'")
407+
if lo != hi:
408+
# Close a multi-code-point range with its *upper* bound (`hi`, not `lo`), using Rust's inclusive `..=` since (lo, hi) pairs are inclusive.
rf.write(f"..='\\u{{{hi:X}}}'")
409+
410+
rf.write(")\n}\n\n")
411+
389412
### grapheme cluster module
390413
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
391-
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt", [])
392-
414+
grapheme_cats = load_properties("auxiliary/GraphemeBreakProperty.txt")
393415
# Control
394416
# Note:
395417
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
@@ -398,22 +420,22 @@ def emit_break_module(f, break_table, break_cats, name):
398420
grapheme_cats["Control"] = group_cat(list(
399421
set(ungroup_cat(grapheme_cats["Control"]))
400422
- set(ungroup_cat([surrogate_codepoints]))))
401-
423+
grapheme_cats["InCB_Consonant"] = derived[("InCB", "Consonant")]
424+
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
425+
grapheme_cats["Extended_Pictographic"] = emoji_props["Extended_Pictographic"]
402426
grapheme_table = []
403427
for cat in grapheme_cats:
404428
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
405-
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
406-
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
407429
grapheme_table.sort(key=lambda w: w[0])
408430
last = -1
409431
for chars in grapheme_table:
410432
if chars[0] <= last:
411433
# Python 3 cannot raise a bare string (that yields an unrelated TypeError); raise a real exception so this diagnostic is what the user sees.
raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
412434
last = chars[1]
413-
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
435+
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
414436
rf.write("\n")
415437

416-
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
438+
word_cats = load_properties("auxiliary/WordBreakProperty.txt")
417439
word_table = []
418440
for cat in word_cats:
419441
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
@@ -425,7 +447,7 @@ def emit_break_module(f, break_table, break_cats, name):
425447
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
426448
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
427449

428-
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
450+
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt")
429451
sentence_table = []
430452
for cat in sentence_cats:
431453
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])

scripts/unicode_gen_breaktests.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,8 @@ def showfun(x):
140140
return outstr
141141

142142
def create_grapheme_data(f):
143-
# rules 9.1 and 9.2 are for extended graphemes only
144-
optsplits = ['9.1','9.2']
143+
# rules 9.1, 9.2, and 9.3 are for extended graphemes only
144+
optsplits = ['9.1', '9.2', '9.3']
145145
d = load_test_data("auxiliary/GraphemeBreakTest.txt", optsplits)
146146

147147
test_same = []
@@ -169,8 +169,8 @@ def create_grapheme_data(f):
169169
else:
170170
test_diff.append((allchars, extgraphs, c))
171171

172-
stype = "&'static [(&'static str, &'static [&'static str])]"
173-
dtype = "&'static [(&'static str, &'static [&'static str], &'static [&'static str])]"
172+
stype = "&[(&str, &[&str])]"
173+
dtype = "&[(&str, &[&str], &[&str])]"
174174
f.write(" // official Unicode test data\n")
175175
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/GraphemeBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
176176
unicode.emit_table(f, "TEST_SAME", test_same, stype, True, showfun, True)
@@ -185,7 +185,7 @@ def create_words_data(f):
185185
allchars = [cn for s in c for cn in s]
186186
test.append((allchars, c))
187187

188-
wtype = "&'static [(&'static str, &'static [&'static str])]"
188+
wtype = "&[(&str, &[&str])]"
189189
f.write(" // official Unicode test data\n")
190190
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/WordBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
191191
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
@@ -199,7 +199,7 @@ def create_sentence_data(f):
199199
allchars = [cn for s in c for cn in s]
200200
test.append((allchars, c))
201201

202-
wtype = "&'static [(&'static str, &'static [&'static str])]"
202+
wtype = "&[(&str, &[&str])]"
203203
f.write(" // official Unicode test data\n")
204204
f.write(" // http://www.unicode.org/Public/%s/ucd/auxiliary/SentenceBreakTest.txt\n" % unicode.UNICODE_VERSION_NUMBER)
205205
unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy