Skip to content

Commit 0609dae

Browse files
committed
Move to using bit sets for ScriptExtension
1 parent: 1057462 · commit: 0609dae

File tree

4 files changed, with 1778 additions and 2429 deletions.

Cargo.toml

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,8 +20,6 @@ exclude = [ "target/*", "Cargo.lock", "scripts/tmp", "*.txt" ]
2020
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
2121

2222
[features]
23-
with_std = []
24-
default_features = ["with_std"]
2523
rustc-dep-of-std = ['std', 'core', 'compiler_builtins']
2624

2725
[dependencies]

scripts/unicode.py

Lines changed: 57 additions & 120 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,8 @@
3535
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
3636
3737
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
38+
39+
use super::ScriptExtension;
3840
'''
3941

4042
UNICODE_VERSION = (12, 0, 0)
@@ -183,44 +185,68 @@ def emit_search(f):
183185
}
184186
""")
185187

186-
def emit_enums(f, script_list, extension_list, longforms, intersections):
188+
def emit_enums(f, script_list, extension_list, longforms):
187189
"""
188190
Emit the Script and ScriptExtension enums as well as any related utility functions
189191
"""
192+
190193
f.write("""
191194
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
192195
#[non_exhaustive]
193196
#[allow(non_camel_case_types)]
197+
#[repr(u8)]
194198
/// A value of the `Script` property
195199
pub enum Script {
196200
/// Unknown script
197-
Unknown,
201+
Unknown = 0xFF,
202+
/// Zyyy
203+
Common = 0xFE,
204+
/// Zinh,
205+
Inherited = 0xFD,
198206
""")
199-
for script in script_list:
200-
f.write(" /// %s\n %s,\n" % (script, longforms[script]))
201-
f.write("""}
202-
#[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
203-
#[non_exhaustive]
204-
/// A value for the `Script_Extension` property
205-
///
206-
/// [`ScriptExtension`] is one or more [`Script`]
207-
///
208-
/// This is essentially an optimized version of `Vec<Script>`,
209-
/// optimized by script sets and intersections actually present in Unicode.
210-
pub enum ScriptExtension {
211-
/// A single script
212-
Single(Script),
207+
for (i, script) in enumerate(script_list):
208+
f.write(" /// %s\n %s = %s,\n" % (script, longforms[script], i))
209+
f.write("}\n")
210+
f.write("pub const NEXT_SCRIPT: u8 = %s;" % len(script_list))
211+
f.write("""
212+
213+
pub mod script_extensions {
214+
use crate::ScriptExtension;
215+
pub const COMMON: ScriptExtension = ScriptExtension::new_common();
216+
pub const INHERITED: ScriptExtension = ScriptExtension::new_inherited();
217+
pub const UNKNOWN: ScriptExtension = ScriptExtension::new_unknown();
213218
""")
219+
for (i, script) in enumerate(script_list):
220+
first = 0
221+
second = 0
222+
third = 0
223+
if i < 64:
224+
first = hex(1 << i).replace("L", "")
225+
elif i < 128:
226+
second = hex(1 << (i - 64)).replace("L", "")
227+
else:
228+
third = hex(1 << (i - 128)).replace("L", "")
229+
f.write(" /// %s\n pub const %s: ScriptExtension = ScriptExtension::new(%s, %s, %s);\n" %
230+
(longforms[script], longforms[script].upper(), first, second, third))
231+
if script != longforms[script]:
232+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" %
233+
(longforms[script], script.upper(), longforms[script].upper()))
214234
for ext in extension_list:
215235
longform = ", ".join([longforms[s] for s in ext])
216-
f.write(" /// %s\n %s,\n" % (longform, "".join(ext)))
236+
name = "_".join([s.upper() for s in ext])
237+
expr = ext[0].upper()
238+
for e in ext[1:]:
239+
expr = "%s.intersection(%s)" % (expr, e.upper())
240+
f.write(" /// %s\n pub const %s: ScriptExtension = %s;\n" % (longform, name, expr))
217241
f.write("""}
218242
219243
impl Script {
220244
#[inline]
221245
pub(crate) fn inner_full_name(self) -> &'static str {
222246
match self {
223247
Script::Unknown => "Unknown",
248+
Script::Common => "Common",
249+
Script::Inherited => "Inherited",
224250
""")
225251
for script in script_list:
226252
f.write(" Script::%s => \"%s\",\n" % (longforms[script], longforms[script]))
@@ -231,119 +257,29 @@ def emit_enums(f, script_list, extension_list, longforms, intersections):
231257
pub(crate) fn inner_short_name(self) -> &'static str {
232258
match self {
233259
Script::Unknown => "",
260+
Script::Common => "Zyyy",
261+
Script::Inherited => "Zinh",
234262
""")
235263
for script in script_list:
236264
f.write(" Script::%s => \"%s\",\n" % (longforms[script], script))
237265
f.write(""" }
238266
}
239-
}
240-
241-
impl ScriptExtension {
242-
#[inline]
243-
#[cfg(feature = "with_std")]
244-
pub(crate) fn inner_scripts(self) -> Vec<Script> {
245-
match self {
246-
ScriptExtension::Single(s) => vec![s],
247-
""")
248-
for ext in extension_list:
249-
scripts = ", ".join(["Script::%s" % longforms[s] for s in ext])
250-
f.write(" %s => vec![%s],\n" % (extension_name(ext), scripts))
251-
f.write(""" _ => unreachable!()
252-
}
253-
}
254-
255-
#[inline]
256-
pub(crate) fn inner_contains_script(self, other: Script) -> bool {
257-
match self {
258-
ScriptExtension::Single(s) => s == other,
259-
""")
260-
for ext in extension_list:
261-
scripts = " || ".join(["other == Script::%s" % longforms[s] for s in ext])
262-
f.write(" %s => %s,\n" % (extension_name(ext), scripts))
263-
f.write(""" }
264-
}
265267
266268
#[inline]
267-
pub(crate) fn inner_intersect(self, other: Self) -> Self {
268-
match (self, other) {
269-
(ScriptExtension::Single(Script::Unknown), _) |
270-
(_, ScriptExtension::Single(Script::Unknown)) => ScriptExtension::Single(Script::Unknown),
271-
(a, b) if a == b => a,
272-
(ScriptExtension::Single(Script::Common), a) |
273-
(ScriptExtension::Single(Script::Inherited), a) |
274-
(a, ScriptExtension::Single(Script::Common)) |
275-
(a, ScriptExtension::Single(Script::Inherited)) => a,
276-
(ScriptExtension::Single(s), o) | (o, ScriptExtension::Single(s)) if o.inner_contains_script(s) => ScriptExtension::Single(s),
269+
pub(crate) fn for_integer(value: u8) -> Self {
270+
match value {
277271
""")
278-
for (e1, e2, i) in intersections:
279-
f.write(" (%s, %s) => %s,\n" % (extension_name(e1), extension_name(e2), extension_name(i, longforms)))
280-
f.write(""" _ => ScriptExtension::Single(Script::Unknown),
272+
for (i, script) in enumerate(script_list):
273+
f.write(" %s => Script::%s,\n" % (i, longforms[script]))
274+
f.write(""" _ => unreachable!(),
281275
}
282276
}
283277
}
284278
""")
285279

286-
287-
def compute_intersections_elements(extension_list):
288-
"""
289-
Compute all intersections between the script extensions.
290-
This will add new elements to extension_list, be sure to call it first!
291-
"""
292-
293-
# This is the only third-level intersection
294-
# It's easier to hardcode things here rather than
295-
# do the below calculation in a loop
296-
extension_list.append(['Deva', 'Knda', 'Tirh'])
297-
intersections = []
298-
# Some intersections will not exist in extension_list and we'll need to add them
299-
new_elements = []
300-
sets = [(e, set(e)) for e in extension_list]
301-
for (e1, s1) in sets:
302-
for (e2, s2) in sets:
303-
if e1 == e2:
304-
continue
305-
intersection = s1.intersection(s2)
306-
if len(intersection) > 0:
307-
intersection = [i for i in intersection]
308-
intersection.sort()
309-
if len(intersection) > 1 and intersection not in extension_list and intersection not in new_elements:
310-
new_elements.append(intersection)
311-
if (e1, e2, intersection) not in intersections:
312-
intersections.append((e1, e2, intersection))
313-
extension_list.extend(new_elements)
314-
315-
# We now go through the newly added second-level extension values and calculate their intersections
316-
# with the original set and each other
317-
new_sets = [(e, set(e)) for e in new_elements]
318-
sets = [(e, set(e)) for e in extension_list]
319-
for (e1, s1) in new_sets:
320-
for (e2, s2) in sets:
321-
if e1 == e2:
322-
continue
323-
intersection = s1.intersection(s2)
324-
if len(intersection) > 0:
325-
intersection = [i for i in intersection]
326-
intersection.sort()
327-
if len(intersection) > 1 and intersection not in extension_list:
328-
raise "Found new third-level intersection, please hardcode it"
329-
# The previous routine would automatically get both versions
330-
# of an intersection because it would iterate each pair in both orders,
331-
# but here we're working on an asymmetric pair, so we insert both in order to not
332-
# miss anything
333-
if (e1, e2, intersection) not in intersections:
334-
intersections.append((e1, e2, intersection))
335-
if (e2, e1, intersection) not in intersections:
336-
intersections.append((e2, e1, intersection))
337-
338-
intersections.sort()
339-
return intersections
340-
341-
def extension_name(ext, longforms={}):
280+
def extension_name(ext):
342281
"""Get the rust source for a given ScriptExtension"""
343-
if len(ext) == 1:
344-
return "ScriptExtension::Single(Script::%s)" % longforms[ext[0]]
345-
else:
346-
return "ScriptExtension::%s" % "".join(ext)
282+
return "script_extensions::%s" % "_".join([e.upper() for e in ext])
347283

348284

349285

@@ -370,8 +306,10 @@ def extension_name(ext, longforms={}):
370306
script_list = []
371307

372308
for script in scripts:
373-
script_list.append(shortforms[script])
309+
if script not in ["Common", "Unknown", "Inherited"]:
310+
script_list.append(shortforms[script])
374311
script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]])
312+
script_list.sort()
375313
script_table.sort(key=lambda w: w[0])
376314

377315

@@ -389,14 +327,13 @@ def extension_name(ext, longforms={}):
389327
extension_table.extend([(x, y, output_ext) for (x, y) in extensions[ext]])
390328
extension_table.sort(key=lambda w: w[0])
391329

392-
intersections = compute_intersections_elements(extension_list)
393330

394-
emit_enums(rf, script_list, extension_list, longforms, intersections)
331+
emit_enums(rf, script_list, extension_list, longforms)
395332
emit_search(rf)
396333

397334
emit_table(rf, "SCRIPTS", script_table, t_type = "&'static [(char, char, Script)]",
398335
is_pub=False , pfun=lambda x: "(%s,%s, Script::%s)" % (escape_char(x[0]), escape_char(x[1]), longforms[x[2]]))
399336
emit_table(rf, "SCRIPT_EXTENSIONS", extension_table, t_type = "&'static [(char, char, ScriptExtension)]",
400-
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2], longforms)))
337+
is_pub=False , pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), extension_name(x[2])))
401338

402339
# emit_table(rf, "FOObar", properties)

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy