Patch summary (free text before the first "diff --git" header):
  * Cargo.toml: adds the `unicode-script` git dependency.
  * src/general_security_profile.rs (new): trait `GeneralSecurityProfile`, implemented for `char` and `&str`.
  * src/mixed_script.rs (new): `AugmentedScriptSet` and the `MixedScript` extension trait for `&str`.
  * src/lib.rs, src/tests.rs: the identifier-status trait moves out of lib.rs; the doc example and tests use `GeneralSecurityProfile`.

diff --git a/Cargo.toml b/Cargo.toml
index 6876866..5817881 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ according to Unicode Technical Standard #39 rules.
 exclude = [ "target/*", "Cargo.lock" ]
 
 [dependencies]
+unicode-script = { git = "https://github.com/unicode-rs/unicode-script", default-features = false }
 std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
 core = { version = "1.0", package = "rustc-std-workspace-core", optional = true }
 compiler_builtins = { version = "0.1", optional = true }
diff --git a/src/general_security_profile.rs b/src/general_security_profile.rs
new file mode 100644
index 0000000..7db242a
--- /dev/null
+++ b/src/general_security_profile.rs
@@ -0,0 +1,20 @@
+//! Utilities for working with the [General Security Profile](https://www.unicode.org/reports/tr39/#General_Security_Profile)
+//! for identifiers
+
+use crate::tables::identifier_status as is;
+
+/// Methods for determining characters not restricted from use for identifiers.
+pub trait GeneralSecurityProfile {
+    /// Returns whether the character is not restricted from use for identifiers.
+    fn identifier_allowed(self) -> bool;
+}
+
+impl GeneralSecurityProfile for char {
+    #[inline]
+    fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
+}
+
+impl GeneralSecurityProfile for &'_ str {
+    #[inline]
+    fn identifier_allowed(self) -> bool { self.chars().all(is::identifier_status_allowed) }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 75cf4bc..8a2b5b4 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -15,7 +15,7 @@
 //! ```rust
 //! extern crate unicode_security;
 //!
-//! use unicode_security::IdentifierStatusChar;
+//! use unicode_security::GeneralSecurityProfile;
 //!
 //! fn main() {
 //!     let ch = 'µ'; // U+00B5 MICRO SIGN
@@ -55,21 +55,16 @@ extern crate std;
 #[cfg(feature = "bench")]
 extern crate test;
 
-use tables::identifier_status as is;
 pub use tables::UNICODE_VERSION;
 
-mod tables;
+pub mod mixed_script;
+pub mod general_security_profile;
 
-#[cfg(test)]
-mod tests;
+pub use mixed_script::MixedScript;
+pub use general_security_profile::GeneralSecurityProfile;
 
-/// Methods for determining characters not restricted from use for identifiers.
-pub trait UnicodeIdentifierStatus {
-    /// Returns whether the character is not restricted from use for identifiers.
-    fn identifier_allowed(self) -> bool;
-}
+#[rustfmt::skip]
+pub(crate) mod tables;
 
-impl UnicodeIdentifierStatus for char {
-    #[inline]
-    fn identifier_allowed(self) -> bool { is::identifier_status_allowed(self) }
-}
+#[cfg(test)]
+mod tests;
diff --git a/src/mixed_script.rs b/src/mixed_script.rs
new file mode 100644
index 0000000..0cc12bf
--- /dev/null
+++ b/src/mixed_script.rs
@@ -0,0 +1,129 @@
+//! [Mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
+
+use unicode_script::{Script, ScriptExtension};
+
+/// An Augmented script set, as defined by UTS 39
+///
+/// https://www.unicode.org/reports/tr39/#def-augmented-script-set
+pub struct AugmentedScriptSet {
+    /// The base ScriptExtension value
+    pub base: ScriptExtension,
+    /// Han With Bopomofo
+    pub hanb: bool,
+    /// Japanese
+    pub jpan: bool,
+    /// Korean
+    pub kore: bool,
+}
+
+impl From<ScriptExtension> for AugmentedScriptSet {
+    fn from(ext: ScriptExtension) -> Self {
+        let mut hanb = false;
+        let mut jpan = false;
+        let mut kore = false;
+
+        if ext == ScriptExtension::Single(Script::Common) ||
+            ext == ScriptExtension::Single(Script::Inherited) ||
+            ext.contains_script(Script::Han) {
+            hanb = true;
+            jpan = true;
+            kore = true;
+        } else {
+            if ext.contains_script(Script::Hiragana) || ext.contains_script(Script::Katakana) {
+                jpan = true;
+            }
+
+            if ext.contains_script(Script::Hangul) {
+                kore = true;
+            }
+
+            if ext.contains_script(Script::Bopomofo) {
+                hanb = true;
+            }
+        }
+        Self {
+            base: ext,
+            hanb, jpan, kore
+        }
+    }
+}
+
+impl From<char> for AugmentedScriptSet {
+    fn from(c: char) -> Self {
+        AugmentedScriptSet::for_char(c)
+    }
+}
+
+impl From<&'_ str> for AugmentedScriptSet {
+    fn from(s: &'_ str) -> Self {
+        AugmentedScriptSet::for_str(s)
+    }
+}
+
+impl Default for AugmentedScriptSet {
+    fn default() -> Self {
+        AugmentedScriptSet {
+            base: ScriptExtension::Single(Script::Common),
+            hanb: true,
+            jpan: true,
+            kore: true,
+        }
+    }
+}
+
+impl AugmentedScriptSet {
+    /// Intersect this set with another
+    pub fn intersect(mut self, other: Self) -> Self {
+        self.base = self.base.intersect(other.base);
+        self.hanb = self.hanb && other.hanb;
+        self.jpan = self.jpan && other.jpan;
+        self.kore = self.kore && other.kore;
+        self
+    }
+
+    /// Check if the set is empty
+    pub fn is_empty(&self) -> bool {
+        self.base.is_empty() && ! self.hanb && !self.jpan && !self.kore
+    }
+
+    /// Check if the set is "All" (Common or Inherited)
+    pub fn is_all(&self) -> bool {
+        self.base == ScriptExtension::Single(Script::Common) ||
+            self.base == ScriptExtension::Single(Script::Inherited)
+    }
+
+    /// Construct an AugmentedScriptSet for a given character
+    pub fn for_char(c: char) -> Self {
+        ScriptExtension::from(c).into()
+    }
+
+    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
+    pub fn for_str(s: &str) -> Self {
+        let mut set = AugmentedScriptSet::default();
+        for ch in s.chars() {
+            set = set.intersect(ch.into())
+        }
+        set
+    }
+}
+
+/// Extension trait for [mixed-script detection](https://www.unicode.org/reports/tr39/#Mixed_Script_Detection)
+pub trait MixedScript {
+    /// Check if a string is [single-script](https://www.unicode.org/reports/tr39/#def-single-script)
+    ///
+    /// Note that a single-script string may still contain multiple Script properties!
+    fn is_single_script(self) -> bool;
+
+    /// Find the [resolved script set](https://www.unicode.org/reports/tr39/#def-resolved-script-set) of a given string
+    fn resolve_script_set(self) -> AugmentedScriptSet;
+}
+
+impl MixedScript for &'_ str {
+    fn is_single_script(self) -> bool {
+        !AugmentedScriptSet::for_str(self).is_empty()
+    }
+
+    fn resolve_script_set(self) -> AugmentedScriptSet {
+        self.into()
+    }
+}
diff --git a/src/tests.rs b/src/tests.rs
index b1f074b..ed32eae 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -10,19 +10,19 @@
 
 #[test]
 fn test_char() {
-    use super::IdentifierStatusChar;
-    assert_eq!(IdentifierStatusChar::identifier_allowed('A'), true);
+    use crate::GeneralSecurityProfile;
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('A'), true);
     assert_eq!('A'.identifier_allowed(), true);
-    assert_eq!(IdentifierStatusChar::identifier_allowed('0'), true);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('0'), true);
     assert_eq!('0'.identifier_allowed(), true);
-    assert_eq!(IdentifierStatusChar::identifier_allowed('_'), true);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('_'), true);
     assert_eq!('_'.identifier_allowed(), true);
-    assert_eq!(IdentifierStatusChar::identifier_allowed('\x00'), false);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('\x00'), false);
     assert_eq!('\x00'.identifier_allowed(), false);
     // U+00B5 MICRO SIGN
-    assert_eq!(IdentifierStatusChar::identifier_allowed('µ'), false);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('µ'), false);
     assert_eq!('µ'.identifier_allowed(), false);
     // U+2160 ROMAN NUMERAL ONE
-    assert_eq!(IdentifierStatusChar::identifier_allowed('Ⅰ'), false);
+    assert_eq!(GeneralSecurityProfile::identifier_allowed('Ⅰ'), false);
     assert_eq!('Ⅰ'.identifier_allowed(), false);
 }
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy