Skip to content

Commit eb9d304

Browse files
authored
Merge pull request #31 from ohhithere/fix-internal-skeleton
Fix internalSkeleton
2 parents 22d684a + 78707a7 commit eb9d304

File tree

4 files changed

+744
-902
lines changed

4 files changed

+744
-902
lines changed

scripts/unicode.py

Lines changed: 35 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# - confusables.txt
1818
# - ReadMe.txt
1919
# This script also uses the following Unicode UCD data:
20+
# - DerivedCoreProperties.txt
2021
# - Scripts.txt
2122
#
2223
# Since this should not require frequent updates, we just store this
@@ -53,6 +54,8 @@ def fetch(f):
5354
sys.stderr.write("cannot load %s\n" % f)
5455
exit(1)
5556

57+
return f
58+
5659
# Download a UCD table file
5760
def fetch_unidata(f):
5861
if not os.path.exists(os.path.basename(f)):
@@ -63,14 +66,14 @@ def fetch_unidata(f):
6366
sys.stderr.write("cannot load %s" % f)
6467
exit(1)
6568

66-
# Loads code point data from IdentifierStatus.txt and
67-
# IdentifierType.txt
68-
# Implementation from unicode-segmentation
69+
return f
70+
71+
# Loads code point data from provided filename f
72+
# Implementation adapted from unicode-segmentation
6973
def load_properties(f, interestingprops = None):
70-
fetch(f)
7174
props = {}
72-
re1 = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
73-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)")
75+
re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#\s]+) *#")
76+
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#\s]+) *#")
7477

7578
for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
7679
prop = None
@@ -99,42 +102,6 @@ def load_properties(f, interestingprops = None):
99102

100103
return props
101104

102-
# Loads script data from Scripts.txt
103-
def load_script_properties(f, interestingprops):
104-
fetch_unidata(f)
105-
props = {}
106-
# Note: these regexes are different from those in unicode-segmentation,
107-
# because we need to handle spaces here
108-
re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#")
109-
re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#")
110-
111-
for line in fileinput.input(os.path.basename(f)):
112-
prop = None
113-
d_lo = 0
114-
d_hi = 0
115-
m = re1.match(line)
116-
if m:
117-
d_lo = m.group(1)
118-
d_hi = m.group(1)
119-
prop = m.group(2).strip()
120-
else:
121-
m = re2.match(line)
122-
if m:
123-
d_lo = m.group(1)
124-
d_hi = m.group(2)
125-
prop = m.group(3).strip()
126-
else:
127-
continue
128-
if interestingprops and prop not in interestingprops:
129-
continue
130-
d_lo = int(d_lo, 16)
131-
d_hi = int(d_hi, 16)
132-
if prop not in props:
133-
props[prop] = []
134-
props[prop].append((d_lo, d_hi))
135-
136-
return props
137-
138105
# Loads confusables data from confusables.txt
139106
def load_confusables(f):
140107
fetch(f)
@@ -189,7 +156,7 @@ def load_scripts(f):
189156
# changes are introduced, update accordingly.
190157

191158
(longforms, shortforms) = aliases()
192-
scripts = load_script_properties(f, [])
159+
scripts = load_properties(fetch_unidata(f), [])
193160

194161
script_table = []
195162
script_list = []
@@ -546,10 +513,10 @@ def emit_identifier_module(f):
546513
""")
547514

548515
f.write(" // Identifier status table:\n")
549-
identifier_status_table = load_properties("IdentifierStatus.txt")
516+
identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
550517
emit_table(f, "IDENTIFIER_STATUS", identifier_status_table['Allowed'], "&'static [(char, char)]", is_pub=False,
551518
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
552-
identifier_type = load_properties("IdentifierType.txt")
519+
identifier_type = load_properties(fetch("IdentifierType.txt"))
553520
type_table = []
554521
for ty in identifier_type:
555522
type_table.extend([(x, y, ty) for (x, y) in identifier_type[ty]])
@@ -560,6 +527,26 @@ def emit_identifier_module(f):
560527
pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
561528
f.write("}\n\n")
562529

530+
def emit_default_ignorable_detection_module(f):
531+
f.write("pub mod default_ignorable_code_point {")
532+
f.write("""
533+
534+
#[inline]
535+
pub fn default_ignorable_code_point(c: char) -> bool {
536+
match c as usize {
537+
_ => super::util::bsearch_range_table(c, DEFAULT_IGNORABLE)
538+
}
539+
}
540+
541+
""")
542+
543+
f.write(" // Default ignorable code point table:\n")
544+
default_ignorable_table = load_properties(fetch_unidata("DerivedCoreProperties.txt"), ["Default_Ignorable_Code_Point"])
545+
emit_table(f, "DEFAULT_IGNORABLE", default_ignorable_table["Default_Ignorable_Code_Point"], "&'static [(char, char)]", is_pub=False,
546+
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])))
547+
548+
f.write("}\n\n")
549+
563550
def emit_confusable_detection_module(f):
564551
f.write("pub mod confusable_detection {")
565552
f.write("""
@@ -601,7 +588,7 @@ def emit_potiential_mixed_script_confusable(f):
601588
}
602589
}
603590
""")
604-
identifier_status_table = load_properties("IdentifierStatus.txt")
591+
identifier_status_table = load_properties(fetch("IdentifierStatus.txt"))
605592
_, scripts = load_scripts("Scripts.txt")
606593
identifier_allowed = identifier_status_table['Allowed']
607594
(mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
@@ -688,6 +675,8 @@ def emit_util_mod(f):
688675
emit_util_mod(rf)
689676
### identifier module
690677
emit_identifier_module(rf)
678+
### default_ignorable_detection module
679+
emit_default_ignorable_detection_module(rf)
691680
### confusable_detection module
692681
emit_confusable_detection_module(rf)
693682
### mixed_script_confusable_detection module

src/confusable_detection.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {
3434

3535
/// Calculate skeleton for string, as defined by UTS 39
3636
pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
37+
use crate::tables::default_ignorable_code_point::default_ignorable_code_point;
3738
use unicode_normalization::UnicodeNormalization;
38-
s.chars().nfd().flat_map(char_prototype).nfd()
39+
40+
s.chars()
41+
.nfd()
42+
.filter(|c| !default_ignorable_code_point(*c))
43+
.flat_map(char_prototype)
44+
.nfd()
3945
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy