Skip to content

Commit 7786cb6

Browse files
committed
Implement confusable detection.
1 parent aab7137 commit 7786cb6

File tree

6 files changed

+2660
-0
lines changed

6 files changed

+2660
-0
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ exclude = [ "target/*", "Cargo.lock" ]
1717

1818
[dependencies]
1919
unicode-script = { version = "0.4.0", default-features = false }
20+
unicode-normalization = { version = "0.1.12", default-features = false }
2021
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
2122
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
2223
compiler_builtins = { version = "0.1", optional = true }

scripts/unicode.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):
8181

8282
return props
8383

84+
def load_confusables(f):
    """Parse the confusables data file ``f`` into ``(source, targets)`` pairs.

    Calls ``fetch(f)`` first, then reads the file named by the basename of
    ``f`` as UTF-8.  Each line that matches the pattern below contributes one
    tuple: ``source`` is the single code point of the first column (an int)
    and ``targets`` is the list of code points of the second column (ints).
    Lines that do not match the pattern are skipped.

    Raises ``Exception`` when the first column holds more than one code point.
    """
    fetch(f)
    confusables = []
    # Two semicolon/tab separated columns of space-terminated hex code points,
    # followed by a word-like third column.
    re1 = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")

    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        m = re1.match(line)
        if not m:
            continue
        d_inputs = m.group(1).split()
        if len(d_inputs) != 1:
            raise Exception('More than one code point in first column')
        d_input = int(d_inputs[0], 16)
        d_outputs = [int(d_output, 16) for d_output in m.group(2).split()]
        confusables.append((d_input, d_outputs))

    return confusables
105+
84106
def format_table_content(f, content, indent):
85107
line = " "*indent
86108
first = True
@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
99121
def escape_char(c):
    # Format the integer code point `c` as a Rust char literal with a
    # lowercase-hex unicode escape (a quote, backslash-u, {hex}, a quote).
    return "'\\u{%x}'" % c

def escape_char_list(l):
    # Format the code points in `l` as a Rust array literal whose items are
    # the char literals produced by escape_char, separated by ", ".
    # An empty `l` yields "[]".
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135+
102136
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
103137
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
104138
pub_string = "const"
@@ -173,10 +207,51 @@ def emit_identifier_module(f):
173207
pfun=lambda x: "(%s,%s, IdentifierType::%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
174208
f.write("}\n\n")
175209

210+
# Write the Rust module `confusable_detection` to `f`: the opening `pub mod`
# line, the text of the lookup function `char_confusable_prototype`, the
# `CONFUSABLES` table built from "confusables.txt", and the closing brace.
# Raises Exception if the loaded table contains the same key twice.
def emit_confusable_detection_module(f):
211+
f.write("pub mod confusable_detection {")
212+
f.write("""
213+
214+
#[inline]
215+
pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
216+
// FIXME: do we want to special case ASCII here?
217+
match c as usize {
218+
_ => super::util::bsearch_value_table(c, CONFUSABLES)
219+
}
220+
}
221+
222+
""")
223+
224+
f.write(" // Confusable table:\n")
225+
# Load the (code point, prototype code points) pairs and order them by code
# point; the emitted lookup goes through `bsearch_value_table`, which
# binary-searches the table by key.
confusable_table = load_confusables("confusables.txt")
226+
confusable_table.sort(key=lambda w: w[0])
227+
228+
# After sorting, equal keys are adjacent, so comparing each key with the
# previous one detects every duplicate.
last_key = None
229+
for (k, v) in confusable_table:
230+
if k == last_key:
231+
raise Exception("duplicate keys in confusables table: %s" % k)
232+
last_key = k
233+
234+
# Emit the table with is_pub=False; each entry is rendered as
# "(<char literal>, &[<char literals>])".
emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, &'static [char])]", is_pub=False,
235+
pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1])))
236+
f.write("}\n\n")
237+
238+
176239
def emit_util_mod(f):
177240
f.write("""
178241
pub mod util {
179242
use core::result::Result::{Ok, Err};
243+
244+
#[inline]
245+
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
246+
match r.binary_search_by_key(&c, |&(k, _)| k) {
247+
Ok(idx) => {
248+
let (_, v) = r[idx];
249+
Some(v)
250+
}
251+
Err(_) => None
252+
}
253+
}
254+
180255
#[inline]
181256
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182257
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -224,3 +299,5 @@ def emit_util_mod(f):
224299
emit_util_mod(rf)
225300
### identifier module
226301
emit_identifier_module(rf)
302+
### confusable_detection module
303+
emit_confusable_detection_module(rf)

src/confusable_detection.rs

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
//! [Confusable detection](https://www.unicode.org/reports/tr39/#Confusable_Detection)
2+
3+
use core::iter;
4+
5+
/// Iterator that yields either exactly one item or every item of an inner iterator.
///
/// Both variants produce items of type `T`, so a function can return a single
/// value or a whole sequence through one concrete iterator type.
enum OnceOrMore<T, I> {
    /// A single item, wrapped in the iterator returned by `iter::once`.
    Once(iter::Once<T>),
    /// All items of the wrapped iterator.
    More(I),
}

impl<T, I> Iterator for OnceOrMore<T, I>
where
    I: Iterator<Item = T>,
{
    type Item = T;

    /// Advances whichever inner iterator is active.
    fn next(&mut self) -> Option<T> {
        match self {
            OnceOrMore::Once(single) => single.next(),
            OnceOrMore::More(inner) => inner.next(),
        }
    }

    /// Forwards the active inner iterator's bounds instead of the default
    /// `(0, None)`, so adaptors and `collect` see the real length estimate.
    fn size_hint(&self) -> (usize, Option<usize>) {
        match self {
            OnceOrMore::Once(single) => single.size_hint(),
            OnceOrMore::More(inner) => inner.size_hint(),
        }
    }
}
24+
25+
type StaticSliceIterCloned = core::iter::Cloned<core::slice::Iter<'static, char>>;
26+
27+
fn char_prototype(c: char) -> OnceOrMore<char, StaticSliceIterCloned> {
28+
use crate::tables::confusable_detection::char_confusable_prototype;
29+
match char_confusable_prototype(c) {
30+
None => OnceOrMore::Once(iter::once(c)),
31+
Some(l) => OnceOrMore::More(l.iter().cloned()),
32+
}
33+
}
34+
35+
/// Calculate skeleton for string, as defined by UTS 39
36+
pub fn skeleton(s: &str) -> impl Iterator<Item = char> + '_ {
37+
use unicode_normalization::UnicodeNormalization;
38+
s.chars().nfd().flat_map(char_prototype).nfd()
39+
}

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,12 @@ extern crate test;
5858

5959
pub use tables::UNICODE_VERSION;
6060

61+
pub mod confusable_detection;
6162
pub mod general_security_profile;
6263
pub mod mixed_script;
6364
pub mod restriction_level;
6465

66+
pub use confusable_detection::skeleton;
6567
pub use general_security_profile::GeneralSecurityProfile;
6668
pub use mixed_script::MixedScript;
6769
pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection};

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy