Skip to content

Move to using bit sets for ScriptExtension #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,24 @@ on: [push]

jobs:
build:

runs-on: ubuntu-latest

strategy:
matrix:
rust:
- beta
- nightly
steps:
- uses: actions/checkout@v1
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: beta
toolchain: ${{ matrix.rust }}
override: true
components: rustfmt
- name: Build
run: cargo build --verbose
- name: Run tests
run: cargo test
- name: Run benchmarks
run: cargo bench --features bench
if: startsWith(matrix.rust, 'nightly')
5 changes: 2 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "unicode-script"
version = "0.4.0"
version = "0.5.0"
authors = ["Manish Goregaokar <manishsmail@gmail.com>"]
edition = "2018"

Expand All @@ -20,9 +20,8 @@ exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[features]
with_std = []
default_features = ["with_std"]
rustc-dep-of-std = ['std', 'core', 'compiler_builtins']
bench = []

[dependencies]
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }
Expand Down
199 changes: 61 additions & 138 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]

use super::ScriptExtension;
'''

UNICODE_VERSION = (12, 0, 0)
Expand Down Expand Up @@ -183,182 +185,102 @@ def emit_search(f):
}
""")

def emit_enums(f, script_list, extension_list, longforms, intersections):
def emit_enums(f, script_list, extension_list, longforms):
"""
Emit the Script and ScriptExtension enums as well as any related utility functions
"""

f.write("""
use core::convert::TryFrom;
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
#[non_exhaustive]
#[allow(non_camel_case_types)]
/// A value of the Script property
#[repr(u8)]
/// A value of the `Script` property
pub enum Script {
/// Unknown script
Unknown,
Unknown = 0xFF,
/// Zyyy
Common = 0xFE,
/// Zinh,
Inherited = 0xFD,
""")
for script in script_list:
f.write(" /// %s\n %s,\n" % (script, longforms[script]))
f.write("""}
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
#[non_exhaustive]
/// A value for the Script_Extension property
///
/// Script_Extension is one or more Script
///
/// This is essentially an optimized version of Vec<Script>,
/// optimized by script sets and intersections actually present in Unicode.
pub enum ScriptExtension {
/// A single script
Single(Script),
for (i, script) in enumerate(script_list):
f.write(" /// %s\n %s = %s,\n" % (script, longforms[script], i))
f.write("}\n")
f.write("pub const NEXT_SCRIPT: u8 = %s;" % len(script_list))
f.write("""

pub mod script_extensions {
use crate::ScriptExtension;
pub const COMMON: ScriptExtension = ScriptExtension::new_common();
pub const INHERITED: ScriptExtension = ScriptExtension::new_inherited();
pub const UNKNOWN: ScriptExtension = ScriptExtension::new_unknown();
""")
for (i, script) in enumerate(script_list):
first = 0
second = 0
third = 0
# need to replace L because `hex()` will spit out an L suffix for larger numbers
if i < 64:
first = hex(1 << i).replace("L", "")
elif i < 128:
second = hex(1 << (i - 64)).replace("L", "")
else:
third = hex(1 << (i - 128)).replace("L", "")
f.write(" /// %s\n pub const %s: ScriptExtension = ScriptExtension::new(%s, %s, %s);\n" %
(longforms[script], longforms[script].upper(), first, second, third))
if script != longforms[script]:
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" %
(longforms[script], script.upper(), longforms[script].upper()))
for ext in extension_list:
longform = ", ".join([longforms[s] for s in ext])
f.write(" /// %s\n %s,\n" % (longform, "".join(ext)))
name = "_".join([s.upper() for s in ext])
expr = ext[0].upper()
for e in ext[1:]:
expr = "%s.union(%s)" % (expr, e.upper())
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" % (longform, name, expr))
f.write("""}

impl From<Script> for ScriptExtension {
fn from(script: Script) -> Self {
ScriptExtension::Single(script)
}
}

impl TryFrom<ScriptExtension> for Script {
type Error = ();
fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
match ext {
ScriptExtension::Single(s) => Ok(s),
_ => Err(())
}
}
}

impl Script {
#[inline]
pub(crate) fn inner_full_name(self) -> &'static str {
match self {
Script::Unknown => "Unknown",
Script::Common => "Common",
Script::Inherited => "Inherited",
""")
for script in script_list:
f.write(" Script::%s => \"%s\",\n" % (longforms[script], longforms[script]))
f.write(""" }
}

#[inline]
pub(crate) fn inner_short_name(self) -> &'static str {
match self {
Script::Unknown => "",
Script::Common => "Zyyy",
Script::Inherited => "Zinh",
""")
for script in script_list:
f.write(" Script::%s => \"%s\",\n" % (longforms[script], script))
f.write(""" }
}
}

impl ScriptExtension {
#[inline]
#[cfg(feature = "with_std")]
pub(crate) fn inner_scripts(self) -> Vec<Script> {
match self {
ScriptExtension::Single(s) => vec![s],
pub(crate) fn for_integer(value: u8) -> Self {
match value {
""")
for ext in extension_list:
scripts = ", ".join(["Script::%s" % longforms[s] for s in ext])
f.write(" %s => vec![%s],\n" % (extension_name(ext), scripts))
f.write(""" _ => unreachable!()
}
}

#[inline]
pub(crate) fn inner_contains_script(self, other: Script) -> bool {
match self {
ScriptExtension::Single(s) => s == other,
""")
for ext in extension_list:
scripts = " || ".join(["other == Script::%s" % longforms[s] for s in ext])
f.write(" %s => %s,\n" % (extension_name(ext), scripts))
f.write(""" }
}

#[inline]
pub(crate) fn inner_intersect(self, other: Self) -> Self {
match (self, other) {
(ScriptExtension::Single(Script::Unknown), _) |
(_, ScriptExtension::Single(Script::Unknown)) => ScriptExtension::Single(Script::Unknown),
(a, b) if a == b => a,
(ScriptExtension::Single(Script::Common), a) |
(ScriptExtension::Single(Script::Inherited), a) |
(a, ScriptExtension::Single(Script::Common)) |
(a, ScriptExtension::Single(Script::Inherited)) => a,
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) if o.inner_contains_script(s) => ScriptExtension::Single(s),
""")
for (e1, e2, i) in intersections:
f.write(" (%s, %s) => %s,\n" % (extension_name(e1), extension_name(e2), extension_name(i, longforms)))
f.write(""" _ => ScriptExtension::Single(Script::Unknown),
for (i, script) in enumerate(script_list):
f.write(" %s => Script::%s,\n" % (i, longforms[script]))
f.write(""" _ => unreachable!(),
}
}
}
""")


def compute_intersections_elements(extension_list):
    """
    Compute all intersections between the script extensions.
    This will add new elements to extension_list, be sure to call it first!

    extension_list is a list of script extensions, each one a sorted list of
    short script names. It is mutated in place: the hardcoded third-level
    intersection and every newly discovered multi-script intersection are
    appended to it.

    Returns a sorted list of (e1, e2, intersection) tuples, where e1 and e2
    are entries of extension_list and intersection is the sorted list of the
    scripts they share. Pairs with nothing in common are omitted, and every
    pair is recorded in both orders.

    Raises ValueError if intersecting a newly added element yields a
    multi-script set that is still missing from extension_list.
    """

    # This is the only third-level intersection
    # It's easier to hardcode things here rather than
    # do the below calculation in a loop
    extension_list.append(['Deva', 'Knda', 'Tirh'])
    intersections = []
    # Some intersections will not exist in extension_list and we'll need to add them
    new_elements = []
    sets = [(e, set(e)) for e in extension_list]
    for (e1, s1) in sets:
        for (e2, s2) in sets:
            if e1 == e2:
                continue
            common = sorted(s1.intersection(s2))
            if not common:
                continue
            # Single-script intersections need no dedicated extension value
            if len(common) > 1 and common not in extension_list and common not in new_elements:
                new_elements.append(common)
            if (e1, e2, common) not in intersections:
                intersections.append((e1, e2, common))
    extension_list.extend(new_elements)

    # We now go through the newly added second-level extension values and calculate their intersections
    # with the original set and each other
    new_sets = [(e, set(e)) for e in new_elements]
    sets = [(e, set(e)) for e in extension_list]
    for (e1, s1) in new_sets:
        for (e2, s2) in sets:
            if e1 == e2:
                continue
            common = sorted(s1.intersection(s2))
            if not common:
                continue
            if len(common) > 1 and common not in extension_list:
                # Raising a bare string is not a valid exception; use a real
                # exception type so the message actually reaches the user.
                raise ValueError("Found new third-level intersection, please hardcode it")
            # The previous routine would automatically get both versions
            # of an intersection because it would iterate each pair in both orders,
            # but here we're working on an asymmetric pair, so we insert both in order to not
            # miss anything
            if (e1, e2, common) not in intersections:
                intersections.append((e1, e2, common))
            if (e2, e1, common) not in intersections:
                intersections.append((e2, e1, common))

    intersections.sort()
    return intersections

def extension_name(ext, longforms={}):
def extension_name(ext):
"""Get the rust source for a given ScriptExtension"""
if len(ext) == 1:
return "ScriptExtension::Single(Script::%s)" % longforms[ext[0]]
else:
return "ScriptExtension::%s" % "".join(ext)
return "script_extensions::%s" % "_".join([e.upper() for e in ext])



Expand All @@ -385,8 +307,10 @@ def extension_name(ext, longforms={}):
script_list = []

for script in scripts:
script_list.append(shortforms[script])
if script not in ["Common", "Unknown", "Inherited"]:
script_list.append(shortforms[script])
script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]])
script_list.sort()
script_table.sort(key=lambda w: w[0])


Expand All @@ -404,14 +328,13 @@ def extension_name(ext, longforms={}):
extension_table.extend([(x, y, output_ext) for (x, y) in extensions[ext]])
extension_table.sort(key=lambda w: w[0])

intersections = compute_intersections_elements(extension_list)

emit_enums(rf, script_list, extension_list, longforms, intersections)
emit_enums(rf, script_list, extension_list, longforms)
emit_search(rf)

emit_table(rf, "SCRIPTS", script_table, t_type = "&'static [(char, char, Script)]",
is_pub=False , pfun=lambda x: "(%s,%s, Script::%s)" % (escape_char(x[0]), escape_char(x[1]), longforms[x[2]]))
emit_table(rf, "SCRIPT_EXTENSIONS", extension_table, t_type = "&'static [(char, char, ScriptExtension)]",
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2], longforms)))
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2])))

# emit_table(rf, "FOObar", properties)
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy