Skip to content

Commit a18eb9c

Browse files
committed
Clean up the interface, add simple tests.
1 parent 741303d commit a18eb9c

File tree

6 files changed

+107
-197
lines changed

6 files changed

+107
-197
lines changed

scripts/unicode.py

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
239239
return True
240240
return False
241241

242-
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
242+
def load_potential_mixedscript_confusables(f, identifier_allowed, scripts):
243243
# First, load all confusables data from confusables.txt
244244
confusables = load_confusables(f)
245245

@@ -248,15 +248,6 @@ def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
248248
# seen as substitutes to itself. So if the confusables.txt says A -> C, B -> C,
249249
# and implicitly C -> C, it means A <-> B, A <-> C, B <-> C are confusable.
250250

251-
# here we first make a dict that contains all As and Bs whose corresponding C is single code point.
252-
seekup_map = {}
253-
for item in confusables:
254-
d_proto_list = item[1]
255-
d_source = item[0]
256-
assert(len(d_proto_list) > 0)
257-
if len(d_proto_list) == 1:
258-
seekup_map[escape_char(d_source)] = d_proto_list
259-
260251
# Here we're dividing all confusable lhs and rhs(prototype) operands of the substitution into equivalence classes.
261252
# Principally we'll be using the rhs operands as the representative elements of their equivalence classes.
262253
# However some rhs operands are single code point, while some others are not.
@@ -275,9 +266,8 @@ def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
275266
if d_proto not in codepoint_map:
276267
codepoint_map[d_proto] = []
277268
# When we create a new equivalence class, we'll check whether the representative element should be collected.
278-
# i.e. if it is not subject to substituion, and not restricted from identifier usage,
279-
# we collect it into the equivalence class.
280-
if d_proto not in seekup_map and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
269+
# i.e. if it is not restricted from identifier usage, we collect it into the equivalence class.
270+
if is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
281271
codepoint_map[d_proto].append(d_proto_list[0])
282272
# we collect the original code point to be substituted into this list.
283273
codepoint_map[d_proto].append(d_source)
@@ -562,23 +552,20 @@ def emit_confusable_detection_module(f):
562552
def escape_script_constant(name, longforms):
563553
return "Script::" + longforms[name].strip()
564554

565-
def emit_rustc_mixed_script_confusable_detection(f):
566-
f.write("pub mod rustc_mixed_script_confusable_detection {")
555+
def emit_potiential_mixed_script_confusable(f):
556+
f.write("pub mod potential_mixed_script_confusable {")
567557
f.write("""
568-
use unicode_script::Script;
569-
570558
#[inline]
571-
pub fn is_rustc_mixed_script_confusable(c: char) -> Option<Script> {
559+
pub fn potential_mixed_script_confusable(c: char) -> bool {
572560
match c as usize {
573-
_ => super::util::bsearch_value_table(c, CONFUSABLES)
561+
_ => super::util::bsearch_table(c, CONFUSABLES)
574562
}
575563
}
576-
577564
""")
578565
identifier_status_table = load_properties("IdentifierStatus.txt")
579-
longforms, scripts = load_scripts("Scripts.txt")
566+
_, scripts = load_scripts("Scripts.txt")
580567
identifier_allowed = identifier_status_table['Allowed']
581-
(mixedscript_confusable, mixedscript_confusable_unresolved) = load_rustc_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
568+
(mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts)
582569
debug = False
583570
if debug == True:
584571
debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts)
@@ -589,16 +576,21 @@ def emit_rustc_mixed_script_confusable_detection(f):
589576
source = pair[0]
590577
confusable_table.append((source, script))
591578
confusable_table.sort(key=lambda w: w[0])
592-
emit_table(f, "CONFUSABLES", confusable_table, "&'static [(char, Script)]", is_pub=False,
593-
pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_script_constant(x[1], longforms)))
579+
emit_table(f, "CONFUSABLES", confusable_table, "&'static [char]", is_pub=False,
580+
pfun=lambda x: "%s" % escape_char(x[0]))
594581
f.write("}\n\n")
595582

596583

597584
def emit_util_mod(f):
598585
f.write("""
599586
pub mod util {
600587
use core::result::Result::{Ok, Err};
601-
588+
589+
#[inline]
590+
pub fn bsearch_table(c: char, r: &'static [char]) -> bool {
591+
r.binary_search(&c).is_ok()
592+
}
593+
602594
#[inline]
603595
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
604596
match r.binary_search_by_key(&c, |&(k, _)| k) {
@@ -609,7 +601,7 @@ def emit_util_mod(f):
609601
Err(_) => None
610602
}
611603
}
612-
604+
613605
#[inline]
614606
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
615607
use core::cmp::Ordering::{Equal, Less, Greater};
@@ -619,7 +611,7 @@ def emit_util_mod(f):
619611
else { Greater }
620612
}).is_ok()
621613
}
622-
614+
623615
pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
624616
use core::cmp::Ordering::{Equal, Less, Greater};
625617
match r.binary_search_by(|&(lo, hi, _)| {
@@ -660,4 +652,4 @@ def emit_util_mod(f):
660652
### confusable_detection module
661653
emit_confusable_detection_module(rf)
662654
### potential_mixed_script_confusable module
663-
emit_rustc_mixed_script_confusable_detection(rf)
655+
emit_potiential_mixed_script_confusable(rf)

src/lib.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,12 @@ pub mod confusable_detection;
6262
pub mod general_security_profile;
6363
pub mod mixed_script;
6464
pub mod restriction_level;
65-
pub mod rustc_mixed_script_confusable_detection;
6665

6766
pub use confusable_detection::skeleton;
6867
pub use general_security_profile::GeneralSecurityProfile;
68+
pub use mixed_script::is_potential_mixed_script_confusable_char;
6969
pub use mixed_script::MixedScript;
7070
pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection};
71-
pub use rustc_mixed_script_confusable_detection::rustc_mixed_script_confusable_codepoint;
7271

7372
#[rustfmt::skip]
7473
pub(crate) mod tables;

src/mixed_script.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,3 +130,17 @@ impl MixedScript for &'_ str {
130130
self.into()
131131
}
132132
}
133+
134+
/// Checks whether a character is considered a potential mixed-script confusable.
135+
///
136+
/// If the specified character is not restricted from use for identifiers,
137+
/// this function returns whether it is considered mixed script confusable
138+
/// with another character that is not restricted from use for identifiers.
139+
///
140+
/// If the specified character is restricted from use for identifiers,
141+
/// the return value is unspecified.
142+
pub fn is_potential_mixed_script_confusable_char(c: char) -> bool {
143+
use crate::tables::potential_mixed_script_confusable::potential_mixed_script_confusable;
144+
145+
potential_mixed_script_confusable(c)
146+
}

src/rustc_mixed_script_confusable_detection.rs

Lines changed: 0 additions & 17 deletions
This file was deleted.

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy