From b56f3a11f6c22d8ce7b3964bec13a7a40d170e2f Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 1 Jan 2020 15:43:51 +0530 Subject: [PATCH 1/3] Add mixed-script detection --- Cargo.toml | 1 + src/lib.rs | 2 + src/mixed_script.rs | 121 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+) create mode 100644 src/mixed_script.rs diff --git a/Cargo.toml b/Cargo.toml index 6876866..5817881 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ according to Unicode Technical Standard #39 rules. exclude = [ "target/*", "Cargo.lock" ] [dependencies] +unicode-script = { git = "https://github.com/unicode-rs/unicode-script", default-features = false } std = { version = "1.0", package = "rustc-std-workspace-std", optional = true } core = { version = "1.0", package = "rustc-std-workspace-core", optional = true } compiler_builtins = { version = "0.1", optional = true } diff --git a/src/lib.rs b/src/lib.rs index 75cf4bc..f04d812 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -58,6 +58,8 @@ extern crate test; use tables::identifier_status as is; pub use tables::UNICODE_VERSION; +pub mod mixed_script; + mod tables; #[cfg(test)] diff --git a/src/mixed_script.rs b/src/mixed_script.rs new file mode 100644 index 0000000..904f9ab --- /dev/null +++ b/src/mixed_script.rs @@ -0,0 +1,121 @@ +//! 
[Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection) + +use unicode_script::{Script, ScriptExtension}; + +/// An Augmented script set, as defined by UTS 39 +/// +/// https://www.unicode.org/reports/tr39/#def-augmented-script-set +pub struct AugmentedScriptSet { + /// The base ScriptExtension value + pub base: ScriptExtension, + /// Han With Bopomofo + pub hanb: bool, + /// Japanese + pub jpan: bool, + /// Korean + pub kore: bool, +} + +impl From<ScriptExtension> for AugmentedScriptSet { + fn from(ext: ScriptExtension) -> Self { + let mut hanb = false; + let mut jpan = false; + let mut kore = false; + + if ext.contains_script(Script::Han) { + hanb = true; + jpan = true; + kore = true; + } else { + if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) { + jpan = true; + } + + if ext.contains_script(Script::Hangul) { + kore = true; + } + + if ext.contains_script(Script::Bopomofo) { + hanb = true; + } + } + Self { + base: ext, + hanb, jpan, kore + } + } +} + +impl From<char> for AugmentedScriptSet { + fn from(c: char) -> Self { + AugmentedScriptSet::for_char(c) + } +} + +impl From<&'_ str> for AugmentedScriptSet { + fn from(s: &'_ str) -> Self { + AugmentedScriptSet::for_str(s) + } +} + +impl Default for AugmentedScriptSet { + fn default() -> Self { + AugmentedScriptSet { + base: ScriptExtension::Single(Script::Common), + hanb: true, + jpan: true, + kore: true, + } + } +} + +impl AugmentedScriptSet { + /// Intersect this set with another + pub fn intersect(mut self, other: Self) -> Self { + self.base = self.base.intersect(other.base); + self.hanb = self.hanb && other.hanb; + self.jpan = self.jpan && other.jpan; + self.kore = self.kore && other.kore; + self + } + + /// Check if the set is empty + pub fn is_empty(&self) -> bool { + self.base.is_empty() && ! 
self.hanb && !self.jpan && !self.kore + } + + /// Construct an AugmentedScriptSet for a given character + pub fn for_char(c: char) -> Self { + ScriptExtension::from(c).into() + } + + /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string + pub fn for_str(s: &str) -> Self { + let mut set = AugmentedScriptSet::default(); + for ch in s.chars() { + set = set.intersect(ch.into()) + } + set + } +} + +/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection) +pub trait MixedScript { + /// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script) + /// + /// Note that a single-script string may still contain multiple Script properties! + fn is_single_script(self) -> bool; + + /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string + fn resolve_script_set(self) -> AugmentedScriptSet; +} + +impl MixedScript for &'_ str { + fn is_single_script(self) -> bool { + !AugmentedScriptSet::for_str(self).is_empty() + } + + fn resolve_script_set(self) -> AugmentedScriptSet { + self.into() + } +} From d16286531058991a956ee4bf314786cddfc6ab59 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 1 Jan 2020 15:57:33 +0530 Subject: [PATCH 2/3] Move IdentifierStatus into its own module --- src/general_security_profile.rs | 20 ++++++++++++++++++++ src/lib.rs | 21 +++++++-------------- src/tests.rs | 14 +++++++------- 3 files changed, 34 insertions(+), 21 deletions(-) create mode 100644 src/general_security_profile.rs diff --git a/src/general_security_profile.rs b/src/general_security_profile.rs new file mode 100644 index 0000000..7db242a --- /dev/null +++ b/src/general_security_profile.rs @@ -0,0 +1,20 @@ +//! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile) +//! 
for identifiers + +use crate::tables::identifier_status as is; + +/// Methods for determining characters not restricted from use for identifiers. +pub trait GeneralSecurityProfile { + /// Returns whether the character is not restricted from use for identifiers. + fn identifier_allowed(self) -> bool; +} + +impl GeneralSecurityProfile for char { + #[inline] + fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) } +} + +impl GeneralSecurityProfile for &'_ str { + #[inline] + fn identifier_allowed(self) -> bool { self.chars().all(is::identifier_status_allowed) } +} diff --git a/src/lib.rs b/src/lib.rs index f04d812..8a2b5b4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,7 +15,7 @@ //! ```rust //! extern crate unicode_security; //! -//! use unicode_security::IdentifierStatusChar; +//! use unicode_security::GeneralSecurityProfile; //! //! fn main() { //! let ch = 'µ'; // U+00B5 MICRO SIGN @@ -55,23 +55,16 @@ extern crate std; #[cfg(feature = "bench")] extern crate test; -use tables::identifier_status as is; pub use tables::UNICODE_VERSION; pub mod mixed_script; +pub mod general_security_profile; -mod tables; +pub use mixed_script::MixedScript; +pub use general_security_profile::GeneralSecurityProfile; + +#[rustfmt::skip] +pub(crate) mod tables; #[cfg(test)] mod tests; - -/// Methods for determining characters not restricted from use for identifiers. -pub trait UnicodeIdentifierStatus { - /// Returns whether the character is not restricted from use for identifiers. 
- fn identifier_allowed(self) -> bool; -} - -impl UnicodeIdentifierStatus for char { - #[inline] - fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) } -} diff --git a/src/tests.rs b/src/tests.rs index b1f074b..ed32eae 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -10,19 +10,19 @@ #[test] fn test_char() { - use super::IdentifierStatusChar; - assert_eq!(IdentifierStatusChar::identifier_allowed('A'), true); + use crate::GeneralSecurityProfile; + assert_eq!(GeneralSecurityProfile::identifier_allowed('A'), true); assert_eq!('A'.identifier_allowed(), true); - assert_eq!(IdentifierStatusChar::identifier_allowed('0'), true); + assert_eq!(GeneralSecurityProfile::identifier_allowed('0'), true); assert_eq!('0'.identifier_allowed(), true); - assert_eq!(IdentifierStatusChar::identifier_allowed('_'), true); + assert_eq!(GeneralSecurityProfile::identifier_allowed('_'), true); assert_eq!('_'.identifier_allowed(), true); - assert_eq!(IdentifierStatusChar::identifier_allowed('\x00'), false); + assert_eq!(GeneralSecurityProfile::identifier_allowed('\x00'), false); assert_eq!('\x00'.identifier_allowed(), false); // U+00B5 MICRO SIGN - assert_eq!(IdentifierStatusChar::identifier_allowed('µ'), false); + assert_eq!(GeneralSecurityProfile::identifier_allowed('µ'), false); assert_eq!('µ'.identifier_allowed(), false); // U+2160 ROMAN NUMERAL ONE - assert_eq!(IdentifierStatusChar::identifier_allowed('Ⅰ'), false); + assert_eq!(GeneralSecurityProfile::identifier_allowed('Ⅰ'), false); assert_eq!('Ⅰ'.identifier_allowed(), false); } From 633ee3eb5241306979f536fefa4ffb0223df6fea Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Wed, 1 Jan 2020 16:05:29 +0530 Subject: [PATCH 3/3] Handle All correctly --- src/mixed_script.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/mixed_script.rs b/src/mixed_script.rs index 904f9ab..0cc12bf 100644 --- a/src/mixed_script.rs +++ b/src/mixed_script.rs @@ -22,7 +22,9 @@ impl From<ScriptExtension> for 
AugmentedScriptSet { let mut jpan = false; let mut kore = false; - if ext.contains_script(Script::Han) { + if ext == ScriptExtension::Single(Script::Common) || + ext == ScriptExtension::Single(Script::Inherited) || + ext.contains_script(Script::Han) { hanb = true; jpan = true; kore = true; @@ -84,6 +86,12 @@ impl AugmentedScriptSet { self.base.is_empty() && ! self.hanb && !self.jpan && !self.kore } + /// Check if the set is "All" (Common or Inherited) + pub fn is_all(&self) -> bool { + self.base == ScriptExtension::Single(Script::Common) || + self.base == ScriptExtension::Single(Script::Inherited) + } + /// Construct an AugmentedScriptSet for a given character pub fn for_char(c: char) -> Self { ScriptExtension::from(c).into()
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy