Skip to content

Commit 9d0c1e0

Browse files
committed
Move update script over to Unicode 11; make it handle emoji data
1 parent 666eeed commit 9d0c1e0

File tree

1 file changed

+21
-5
lines changed

1 file changed

+21
-5
lines changed

scripts/unicode.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
# these are the surrogate codepoints, which are not valid rust characters
5555
surrogate_codepoints = (0xd800, 0xdfff)
5656

57-
UNICODE_VERSION = (10, 0, 0)
57+
UNICODE_VERSION = (11, 0, 0)
5858

5959
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
6060

@@ -63,8 +63,12 @@ def is_surrogate(n):
6363

6464
def fetch(f):
6565
if not os.path.exists(os.path.basename(f)):
66-
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
67-
% (UNICODE_VERSION_NUMBER, f))
66+
if "emoji" in f:
67+
os.system("curl -O https://www.unicode.org/Public/emoji/%s.%s/%s"
68+
% (UNICODE_VERSION[0], UNICODE_VERSION[1], f))
69+
else:
70+
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
71+
% (UNICODE_VERSION_NUMBER, f))
6872

6973
if not os.path.exists(os.path.basename(f)):
7074
sys.stderr.write("cannot load %s" % f)
@@ -266,7 +270,7 @@ def emit_break_module(f, break_table, break_cats, name):
266270
pub use self::%sCat::*;
267271
268272
#[allow(non_camel_case_types)]
269-
#[derive(Clone, Copy, PartialEq, Eq)]
273+
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
270274
pub enum %sCat {
271275
""" % (name, Name, Name))
272276

@@ -340,8 +344,15 @@ def emit_break_module(f, break_table, break_cats, name):
340344
grapheme_table = []
341345
for cat in grapheme_cats:
342346
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
347+
emoji_props = load_properties("emoji-data.txt", ["Extended_Pictographic"])
348+
grapheme_table.extend([(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]])
343349
grapheme_table.sort(key=lambda w: w[0])
344-
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()), "grapheme")
350+
last = -1
351+
for chars in grapheme_table:
352+
if chars[0] <= last:
353+
raise ValueError("Grapheme tables and Extended_Pictographic values overlap; need to store these separately!")
354+
last = chars[1]
355+
emit_break_module(rf, grapheme_table, list(grapheme_cats.keys()) + ["Extended_Pictographic"], "grapheme")
345356
rf.write("\n")
346357

347358
word_cats = load_properties("auxiliary/WordBreakProperty.txt", [])
@@ -351,6 +362,11 @@ def emit_break_module(f, break_table, break_cats, name):
351362
word_table.sort(key=lambda w: w[0])
352363
emit_break_module(rf, word_table, list(word_cats.keys()), "word")
353364

365+
# There are some emoji which are also ALetter, so this needs to be stored separately
366+
# For efficiency, we could still merge the two tables and produce an ALetterEP state
367+
emoji_table = [(x, y, "Extended_Pictographic") for (x, y) in emoji_props["Extended_Pictographic"]]
368+
emit_break_module(rf, emoji_table, ["Extended_Pictographic"], "emoji")
369+
354370
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
355371
sentence_table = []
356372
for cat in sentence_cats:

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy