Skip to content

Commit 4a8cb11

Browse files
authored
Merge pull request #3 from unicode-rs/bits
Move to using bit sets for ScriptExtension
2 parents cbfd7bd + 91a8e06 commit 4a8cb11

File tree

5 files changed

+1974
-2466
lines changed

5 files changed

+1974
-2466
lines changed

.github/workflows/tests.yml

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,18 +4,24 @@ on: [push]
44

55
jobs:
66
build:
7-
87
runs-on: ubuntu-latest
9-
8+
strategy:
9+
matrix:
10+
rust:
11+
- beta
12+
- nightly
1013
steps:
1114
- uses: actions/checkout@v1
1215
- uses: actions-rs/toolchain@v1
1316
with:
1417
profile: minimal
15-
toolchain: beta
18+
toolchain: ${{ matrix.rust }}
1619
override: true
1720
components: rustfmt
1821
- name: Build
1922
run: cargo build --verbose
2023
- name: Run tests
2124
run: cargo test
25+
- name: Run benchmarks
26+
run: cargo bench --features bench
27+
if: startsWith(matrix.rust, 'nightly')

Cargo.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "unicode-script"
3-
version = "0.4.0"
3+
version = "0.5.0"
44
authors = ["Manish Goregaokar <manishsmail@gmail.com>"]
55
edition = "2018"
66

@@ -20,9 +20,8 @@ exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
2020
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2121

2222
[features]
23-
with_std = []
24-
default_features = ["with_std"]
2523
rustc-dep-of-std = ['std', 'core', 'compiler_builtins']
24+
bench = []
2625

2726
[dependencies]
2827
std = { version = "1.0", package = "rustc-std-workspace-std", optional = true }

scripts/unicode.py

Lines changed: 61 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@
3535
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
3636
3737
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
38+
39+
use super::ScriptExtension;
3840
'''
3941

4042
UNICODE_VERSION = (12, 0, 0)
@@ -183,182 +185,102 @@ def emit_search(f):
183185
}
184186
""")
185187

186-
def emit_enums(f, script_list, extension_list, longforms, intersections):
188+
def emit_enums(f, script_list, extension_list, longforms):
187189
"""
188190
Emit the Script and ScriptExtension enums as well as any related utility functions
189191
"""
192+
190193
f.write("""
191-
use core::convert::TryFrom;
192194
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
193195
#[non_exhaustive]
194196
#[allow(non_camel_case_types)]
195-
/// A value of the Script property
197+
#[repr(u8)]
198+
/// A value of the `Script` property
196199
pub enum Script {
197200
/// Unknown script
198-
Unknown,
201+
Unknown = 0xFF,
202+
/// Zyyy
203+
Common = 0xFE,
204+
/// Zinh,
205+
Inherited = 0xFD,
199206
""")
200-
for script in script_list:
201-
f.write(" /// %s\n %s,\n" % (script, longforms[script]))
202-
f.write("""}
203-
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
204-
#[non_exhaustive]
205-
/// A value for the Script_Extension property
206-
///
207-
/// Script_Extension is one or more Script
208-
///
209-
/// This is essentially an optimized version of Vec<Script>,
210-
/// optimized by script sets and intersections actually present in Unicode.
211-
pub enum ScriptExtension {
212-
/// A single script
213-
Single(Script),
207+
for (i, script) in enumerate(script_list):
208+
f.write(" /// %s\n %s = %s,\n" % (script, longforms[script], i))
209+
f.write("}\n")
210+
f.write("pub const NEXT_SCRIPT: u8 = %s;" % len(script_list))
211+
f.write("""
212+
213+
pub mod script_extensions {
214+
use crate::ScriptExtension;
215+
pub const COMMON: ScriptExtension = ScriptExtension::new_common();
216+
pub const INHERITED: ScriptExtension = ScriptExtension::new_inherited();
217+
pub const UNKNOWN: ScriptExtension = ScriptExtension::new_unknown();
214218
""")
219+
for (i, script) in enumerate(script_list):
220+
first = 0
221+
second = 0
222+
third = 0
223+
# need to replace L because `hex()` will spit out an L suffix for larger numbers
224+
if i < 64:
225+
first = hex(1 << i).replace("L", "")
226+
elif i < 128:
227+
second = hex(1 << (i - 64)).replace("L", "")
228+
else:
229+
third = hex(1 << (i - 128)).replace("L", "")
230+
f.write(" /// %s\n pub const %s: ScriptExtension = ScriptExtension::new(%s, %s, %s);\n" %
231+
(longforms[script], longforms[script].upper(), first, second, third))
232+
if script != longforms[script]:
233+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" %
234+
(longforms[script], script.upper(), longforms[script].upper()))
215235
for ext in extension_list:
216236
longform = ", ".join([longforms[s] for s in ext])
217-
f.write(" /// %s\n %s,\n" % (longform, "".join(ext)))
237+
name = "_".join([s.upper() for s in ext])
238+
expr = ext[0].upper()
239+
for e in ext[1:]:
240+
expr = "%s.union(%s)" % (expr, e.upper())
241+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" % (longform, name, expr))
218242
f.write("""}
219243
220-
impl From<Script> for ScriptExtension {
221-
fn from(script: Script) -> Self {
222-
ScriptExtension::Single(script)
223-
}
224-
}
225-
226-
impl TryFrom<ScriptExtension> for Script {
227-
type Error = ();
228-
fn try_from(ext: ScriptExtension) -> Result<Self, ()> {
229-
match ext {
230-
ScriptExtension::Single(s) => Ok(s),
231-
_ => Err(())
232-
}
233-
}
234-
}
235-
236244
impl Script {
245+
#[inline]
237246
pub(crate) fn inner_full_name(self) -> &'static str {
238247
match self {
239248
Script::Unknown => "Unknown",
249+
Script::Common => "Common",
250+
Script::Inherited => "Inherited",
240251
""")
241252
for script in script_list:
242253
f.write(" Script::%s => \"%s\",\n" % (longforms[script], longforms[script]))
243254
f.write(""" }
244255
}
245256
257+
#[inline]
246258
pub(crate) fn inner_short_name(self) -> &'static str {
247259
match self {
248260
Script::Unknown => "",
261+
Script::Common => "Zyyy",
262+
Script::Inherited => "Zinh",
249263
""")
250264
for script in script_list:
251265
f.write(" Script::%s => \"%s\",\n" % (longforms[script], script))
252266
f.write(""" }
253267
}
254-
}
255268
256-
impl ScriptExtension {
257269
#[inline]
258-
#[cfg(feature = "with_std")]
259-
pub(crate) fn inner_scripts(self) -> Vec<Script> {
260-
match self {
261-
ScriptExtension::Single(s) => vec![s],
270+
pub(crate) fn for_integer(value: u8) -> Self {
271+
match value {
262272
""")
263-
for ext in extension_list:
264-
scripts = ", ".join(["Script::%s" % longforms[s] for s in ext])
265-
f.write(" %s => vec![%s],\n" % (extension_name(ext), scripts))
266-
f.write(""" _ => unreachable!()
267-
}
268-
}
269-
270-
#[inline]
271-
pub(crate) fn inner_contains_script(self, other: Script) -> bool {
272-
match self {
273-
ScriptExtension::Single(s) => s == other,
274-
""")
275-
for ext in extension_list:
276-
scripts = " || ".join(["other == Script::%s" % longforms[s] for s in ext])
277-
f.write(" %s => %s,\n" % (extension_name(ext), scripts))
278-
f.write(""" }
279-
}
280-
281-
#[inline]
282-
pub(crate) fn inner_intersect(self, other: Self) -> Self {
283-
match (self, other) {
284-
(ScriptExtension::Single(Script::Unknown), _) |
285-
(_, ScriptExtension::Single(Script::Unknown)) => ScriptExtension::Single(Script::Unknown),
286-
(a, b) if a == b => a,
287-
(ScriptExtension::Single(Script::Common), a) |
288-
(ScriptExtension::Single(Script::Inherited), a) |
289-
(a, ScriptExtension::Single(Script::Common)) |
290-
(a, ScriptExtension::Single(Script::Inherited)) => a,
291-
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) if o.inner_contains_script(s) => ScriptExtension::Single(s),
292-
""")
293-
for (e1, e2, i) in intersections:
294-
f.write(" (%s, %s) => %s,\n" % (extension_name(e1), extension_name(e2), extension_name(i, longforms)))
295-
f.write(""" _ => ScriptExtension::Single(Script::Unknown),
273+
for (i, script) in enumerate(script_list):
274+
f.write(" %s => Script::%s,\n" % (i, longforms[script]))
275+
f.write(""" _ => unreachable!(),
296276
}
297277
}
298278
}
299279
""")
300280

301-
302-
def compute_intersections_elements(extension_list):
303-
"""
304-
Compute all intersections between the script extensions.
305-
This will add new elements to extension_list, be sure to call it first!
306-
"""
307-
308-
# This is the only third-level intersection
309-
# It's easier to hardcode things here rather than
310-
# do the below calculation in a loop
311-
extension_list.append(['Deva', 'Knda', 'Tirh'])
312-
intersections = []
313-
# Some intersections will not exist in extension_list and we'll need to add them
314-
new_elements = []
315-
sets = [(e, set(e)) for e in extension_list]
316-
for (e1, s1) in sets:
317-
for (e2, s2) in sets:
318-
if e1 == e2:
319-
continue
320-
intersection = s1.intersection(s2)
321-
if len(intersection) > 0:
322-
intersection = [i for i in intersection]
323-
intersection.sort()
324-
if len(intersection) > 1 and intersection not in extension_list and intersection not in new_elements:
325-
new_elements.append(intersection)
326-
if (e1, e2, intersection) not in intersections:
327-
intersections.append((e1, e2, intersection))
328-
extension_list.extend(new_elements)
329-
330-
# We now go through the newly added second-level extension values and calculate their intersections
331-
# with the original set and each other
332-
new_sets = [(e, set(e)) for e in new_elements]
333-
sets = [(e, set(e)) for e in extension_list]
334-
for (e1, s1) in new_sets:
335-
for (e2, s2) in sets:
336-
if e1 == e2:
337-
continue
338-
intersection = s1.intersection(s2)
339-
if len(intersection) > 0:
340-
intersection = [i for i in intersection]
341-
intersection.sort()
342-
if len(intersection) > 1 and intersection not in extension_list:
343-
raise "Found new third-level intersection, please hardcode it"
344-
# The previous routine would automatically get both versions
345-
# of an intersection because it would iterate each pair in both orders,
346-
# but here we're working on an asymmetric pair, so we insert both in order to not
347-
# miss anything
348-
if (e1, e2, intersection) not in intersections:
349-
intersections.append((e1, e2, intersection))
350-
if (e2, e1, intersection) not in intersections:
351-
intersections.append((e2, e1, intersection))
352-
353-
intersections.sort()
354-
return intersections
355-
356-
def extension_name(ext, longforms={}):
281+
def extension_name(ext):
357282
"""Get the rust source for a given ScriptExtension"""
358-
if len(ext) == 1:
359-
return "ScriptExtension::Single(Script::%s)" % longforms[ext[0]]
360-
else:
361-
return "ScriptExtension::%s" % "".join(ext)
283+
return "script_extensions::%s" % "_".join([e.upper() for e in ext])
362284

363285

364286

@@ -385,8 +307,10 @@ def extension_name(ext, longforms={}):
385307
script_list = []
386308

387309
for script in scripts:
388-
script_list.append(shortforms[script])
310+
if script not in ["Common", "Unknown", "Inherited"]:
311+
script_list.append(shortforms[script])
389312
script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]])
313+
script_list.sort()
390314
script_table.sort(key=lambda w: w[0])
391315

392316

@@ -404,14 +328,13 @@ def extension_name(ext, longforms={}):
404328
extension_table.extend([(x, y, output_ext) for (x, y) in extensions[ext]])
405329
extension_table.sort(key=lambda w: w[0])
406330

407-
intersections = compute_intersections_elements(extension_list)
408331

409-
emit_enums(rf, script_list, extension_list, longforms, intersections)
332+
emit_enums(rf, script_list, extension_list, longforms)
410333
emit_search(rf)
411334

412335
emit_table(rf, "SCRIPTS", script_table, t_type = "&'static [(char, char, Script)]",
413336
is_pub=False , pfun=lambda x: "(%s,%s, Script::%s)" % (escape_char(x[0]), escape_char(x[1]), longforms[x[2]]))
414337
emit_table(rf, "SCRIPT_EXTENSIONS", extension_table, t_type = "&'static [(char, char, ScriptExtension)]",
415-
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2], longforms)))
338+
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2])))
416339

417340
# emit_table(rf, "FOObar", properties)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy