Skip to content

Commit f35d6b6

Browse files
authored
Merge pull request #6 from unicode-rs/mixed
Add mixed-script detection
2 parents 03c709c + 633ee3e commit f35d6b6

File tree

5 files changed

+166
-21
lines changed

5 files changed

+166
-21
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ according to Unicode Technical Standard #39 rules.
1616
exclude = [ "target/*", "Cargo.lock" ]
1717

1818
[dependencies]
19+
unicode-script = { git = "https://github.com/unicode-rs/unicode-script", default-features = false }
1920
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
2021
core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
2122
compiler_builtins = { version = "0.1", optional = true }

src/general_security_profile.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
//! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile)
2+
//! for identifiers
3+
4+
use crate::tables::identifier_status as is;
5+
6+
/// Extension trait for checking whether a value is free of characters that
/// are restricted from use in identifiers.
pub trait GeneralSecurityProfile {
    /// Returns `true` when the value is not restricted from use for identifiers.
    fn identifier_allowed(self) -> bool;
}
11+
12+
impl GeneralSecurityProfile for char {
13+
#[inline]
14+
fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
15+
}
16+
17+
impl GeneralSecurityProfile for &'_ str {
18+
#[inline]
19+
fn identifier_allowed(self) -> bool { self.chars().all(is::identifier_status_allowed) }
20+
}

src/lib.rs

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
//! ```rust
1616
//! extern crate unicode_security;
1717
//!
18-
//! use unicode_security::IdentifierStatusChar;
18+
//! use unicode_security::GeneralSecurityProfile;
1919
//!
2020
//! fn main() {
2121
//! let ch = 'µ'; // U+00B5 MICRO SIGN
@@ -55,21 +55,16 @@ extern crate std;
5555
#[cfg(feature = "bench")]
5656
extern crate test;
5757

58-
use tables::identifier_status as is;
5958
pub use tables::UNICODE_VERSION;
6059

61-
mod tables;
60+
pub mod mixed_script;
61+
pub mod general_security_profile;
6262

63-
#[cfg(test)]
64-
mod tests;
63+
pub use mixed_script::MixedScript;
64+
pub use general_security_profile::GeneralSecurityProfile;
6565

66-
/// Methods for determining characters not restricted from use for identifiers.
67-
pub trait UnicodeIdentifierStatus {
68-
/// Returns whether the character is not restricted from use for identifiers.
69-
fn identifier_allowed(self) -> bool;
70-
}
66+
#[rustfmt::skip]
67+
pub(crate) mod tables;
7168

72-
impl UnicodeIdentifierStatus for char {
73-
#[inline]
74-
fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
75-
}
69+
#[cfg(test)]
70+
mod tests;

src/mixed_script.rs

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
//! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
2+
3+
use unicode_script::{Script, ScriptExtension};
4+
5+
/// An Augmented script set, as defined by UTS 39
6+
///
7+
/// https://www.unicode.org/reports/tr39/#def-augmented-script-set
8+
pub struct AugmentedScriptSet {
9+
/// The base ScriptExtension value
10+
pub base: ScriptExtension,
11+
/// Han With Bopomofo
12+
pub hanb: bool,
13+
/// Japanese
14+
pub jpan: bool,
15+
/// Korean
16+
pub kore: bool,
17+
}
18+
19+
impl From<ScriptExtension> for AugmentedScriptSet {
20+
fn from(ext: ScriptExtension) -> Self {
21+
let mut hanb = false;
22+
let mut jpan = false;
23+
let mut kore = false;
24+
25+
if ext == ScriptExtension::Single(Script::Common) ||
26+
ext == ScriptExtension::Single(Script::Inherited) ||
27+
ext.contains_script(Script::Han) {
28+
hanb = true;
29+
jpan = true;
30+
kore = true;
31+
} else {
32+
if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) {
33+
jpan = true;
34+
}
35+
36+
if ext.contains_script(Script::Hangul) {
37+
kore = true;
38+
}
39+
40+
if ext.contains_script(Script::Bopomofo) {
41+
hanb = true;
42+
}
43+
}
44+
Self {
45+
base: ext,
46+
hanb, jpan, kore
47+
}
48+
}
49+
}
50+
51+
impl From<char> for AugmentedScriptSet {
52+
fn from(c: char) -> Self {
53+
AugmentedScriptSet::for_char(c)
54+
}
55+
}
56+
57+
impl From<&'_ str> for AugmentedScriptSet {
58+
fn from(s: &'_ str) -> Self {
59+
AugmentedScriptSet::for_str(s)
60+
}
61+
}
62+
63+
impl Default for AugmentedScriptSet {
64+
fn default() -> Self {
65+
AugmentedScriptSet {
66+
base: ScriptExtension::Single(Script::Common),
67+
hanb: true,
68+
jpan: true,
69+
kore: true,
70+
}
71+
}
72+
}
73+
74+
impl AugmentedScriptSet {
75+
/// Intersect this set with another
76+
pub fn intersect(mut self, other: Self) -> Self {
77+
self.base = self.base.intersect(other.base);
78+
self.hanb = self.hanb && other.hanb;
79+
self.jpan = self.jpan && other.jpan;
80+
self.kore = self.kore && other.kore;
81+
self
82+
}
83+
84+
/// Check if the set is empty
85+
pub fn is_empty(&self) -> bool {
86+
self.base.is_empty() && ! self.hanb && !self.jpan && !self.kore
87+
}
88+
89+
/// Check if the set is "All" (Common or Inherited)
90+
pub fn is_all(&self) -> bool {
91+
self.base == ScriptExtension::Single(Script::Common) ||
92+
self.base == ScriptExtension::Single(Script::Inherited)
93+
}
94+
95+
/// Construct an AugmentedScriptSet for a given character
96+
pub fn for_char(c: char) -> Self {
97+
ScriptExtension::from(c).into()
98+
}
99+
100+
/// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
101+
pub fn for_str(s: &str) -> Self {
102+
let mut set = AugmentedScriptSet::default();
103+
for ch in s.chars() {
104+
set = set.intersect(ch.into())
105+
}
106+
set
107+
}
108+
}
109+
110+
/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
111+
pub trait MixedScript {
112+
/// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script)
113+
///
114+
/// Note that a single-script string may still contain multiple Script properties!
115+
fn is_single_script(self) -> bool;
116+
117+
/// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
118+
fn resolve_script_set(self) -> AugmentedScriptSet;
119+
}
120+
121+
impl MixedScript for &'_ str {
122+
fn is_single_script(self) -> bool {
123+
!AugmentedScriptSet::for_str(self).is_empty()
124+
}
125+
126+
fn resolve_script_set(self) -> AugmentedScriptSet {
127+
self.into()
128+
}
129+
}

src/tests.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,19 +10,19 @@
1010

1111
#[test]
1212
fn test_char() {
13-
use super::IdentifierStatusChar;
14-
assert_eq!(IdentifierStatusChar::identifier_allowed('A'), true);
13+
use crate::GeneralSecurityProfile;
14+
assert_eq!(GeneralSecurityProfile::identifier_allowed('A'), true);
1515
assert_eq!('A'.identifier_allowed(), true);
16-
assert_eq!(IdentifierStatusChar::identifier_allowed('0'), true);
16+
assert_eq!(GeneralSecurityProfile::identifier_allowed('0'), true);
1717
assert_eq!('0'.identifier_allowed(), true);
18-
assert_eq!(IdentifierStatusChar::identifier_allowed('_'), true);
18+
assert_eq!(GeneralSecurityProfile::identifier_allowed('_'), true);
1919
assert_eq!('_'.identifier_allowed(), true);
20-
assert_eq!(IdentifierStatusChar::identifier_allowed('\x00'), false);
20+
assert_eq!(GeneralSecurityProfile::identifier_allowed('\x00'), false);
2121
assert_eq!('\x00'.identifier_allowed(), false);
2222
// U+00B5 MICRO SIGN
23-
assert_eq!(IdentifierStatusChar::identifier_allowed('µ'), false);
23+
assert_eq!(GeneralSecurityProfile::identifier_allowed('µ'), false);
2424
assert_eq!('µ'.identifier_allowed(), false);
2525
// U+2160 ROMAN NUMERAL ONE
26-
assert_eq!(IdentifierStatusChar::identifier_allowed('Ⅰ'), false);
26+
assert_eq!(GeneralSecurityProfile::identifier_allowed('Ⅰ'), false);
2727
assert_eq!('Ⅰ'.identifier_allowed(), false);
2828
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy