|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# |
| 3 | +# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT |
| 4 | +# file at the top-level directory of this distribution and at |
| 5 | +# http://rust-lang.org/COPYRIGHT. |
| 6 | +# |
| 7 | +# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 8 | +# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 9 | +# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 10 | +# option. This file may not be copied, modified, or distributed |
| 11 | +# except according to those terms. |
| 12 | + |
| 13 | +# This script uses the following Unicode UCD data: |
| 14 | +# - emoji/emoji-data.txt |
| 15 | +# |
| 16 | +# Since this should not require frequent updates, we just store this |
| 17 | +# out-of-line and check the tables.rs file into git. |
| 18 | + |
| 19 | +import fileinput, re, os, sys, operator |
| 20 | + |
# Text written first into the generated Rust file: the license notice, a
# "generated, do not edit" marker, and the lint allowances for the tables.
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''
| 35 | + |
# Unicode release this script targets, as a three-part version tuple.
UNICODE_VERSION = (15, 0, 0)

# The same release rendered as a dotted string ("15.0.0").
UNICODE_VERSION_NUMBER = ".".join(str(part) for part in UNICODE_VERSION)
| 39 | + |
# Download a UCD table file
def fetch_unidata(f):
    """Make sure the UCD data file ``f`` exists in the current directory.

    ``f`` is a path below the versioned ``ucd/`` directory on unicode.org
    (for example ``emoji/emoji-data.txt``). The local copy is looked up, and
    stored by curl, under the base name of ``f``. Nothing is downloaded when
    that file is already present.

    Exits the script with status 1 when the file is still missing after the
    download attempt.
    """
    # Imported here so the function does not rely on the file-level imports.
    import subprocess

    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        url = "https://www.unicode.org/Public/%s/ucd/%s" % (UNICODE_VERSION_NUMBER, f)
        # An argument list (no shell) keeps the URL from being interpreted by
        # a shell. --fail makes curl treat HTTP errors as failures rather than
        # writing the server's error page into the data file.
        try:
            subprocess.call(["curl", "--fail", "-O", url])
        except OSError as err:
            # curl itself could not be started; the check below reports the
            # missing file and exits.
            sys.stderr.write("cannot run curl: %s\n" % err)

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)
| 49 | + |
# Loads code point data from emoji-data.txt
# Implementation from unicode-segmentation
def load_emoji_properties(f):
    """Parse an emoji property file into per-property code point ranges.

    Ensures ``f`` is available locally via ``fetch_unidata`` and then reads
    the copy stored under its base name. Two line shapes are recognised:
    ``XXXX ; Property`` (a single code point) and ``XXXX..YYYY ; Property #``
    (an inclusive range). Every other line is skipped.

    Returns a dict mapping each property name to a list of inclusive
    ``(first, last)`` integer code point pairs, in the order they appear in
    the file.
    """
    fetch_unidata(f)
    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) *#")

    kinds = {}
    # A context-managed open() closes the file even when parsing raises and
    # does not touch the module-wide state that fileinput.input() relies on.
    with open(os.path.basename(f), encoding="utf-8") as data:
        for line in data:
            m = single_re.match(line)
            if m:
                lo = hi = m.group(1)
                kind = m.group(2)
            else:
                m = range_re.match(line)
                if not m:
                    continue
                lo, hi, kind = m.groups()
            kinds.setdefault(kind, []).append((int(lo, 16), int(hi, 16)))

    return kinds
| 82 | + |
def format_table_content(f, content, indent):
    """Write comma-separated ``content`` to ``f`` as wrapped, indented lines.

    ``content`` is split on every ``","``. Pieces are re-joined with ``", "``
    on lines that start with ``indent`` spaces; a new line is started (the
    finished one ending in ``",\\n"``) whenever the current line length plus
    the next piece would reach 98 characters. The final line is written
    without a trailing newline.
    """
    margin = " " * indent
    current = margin
    nothing_placed = True
    for piece in content.split(","):
        if len(current) + len(piece) >= 98:
            # Piece does not fit: flush the line and start the next with it.
            f.write(current + ",\n")
            current = margin + piece
            continue
        current += piece if nothing_placed else ", " + piece
        nothing_placed = False
    f.write(current)
| 97 | + |
def escape_char(c):
    r"""Render ``c`` as Rust source text.

    An integer code point becomes a ``char`` literal with a lowercase hex
    escape, e.g. ``'\u{1f600}'``. The sentinel string ``'multi'`` becomes the
    string literal ``"<multiple code points>"``.
    """
    is_multi = c == 'multi'
    return "\"<multiple code points>\"" if is_multi else "'\\u{%x}'" % c
| 102 | + |
def escape_char_list(l):
    """Render the items of ``l`` as a bracketed list of Rust literals.

    Each item is converted with ``escape_char``; items are separated by
    ``", "`` and the result is wrapped in ``[`` and ``]``.
    """
    return "[" + ", ".join(escape_char(item) for item in l) + "]"
| 114 | + |
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    """Write a Rust table named ``name`` of type ``t_type`` to ``f``.

    The declaration keyword is ``const`` (or ``let`` when ``is_const`` is
    false), prefixed with ``pub`` when ``is_pub`` is true. Every item of
    ``t_data`` is rendered with ``pfun``; the renderings are joined with
    ``","`` and laid out by ``format_table_content`` at an indent of 8.
    """
    keyword = "const" if is_const else "let"
    if is_pub:
        keyword = "pub " + keyword
    f.write(" %s %s: %s = &[\n" % (keyword, name, t_type))
    format_table_content(f, ",".join(pfun(entry) for entry in t_data), 8)
    f.write("\n ];\n\n")
| 132 | + |
def emit_emoji_module(f):
    """Write the feature-gated Rust ``emoji`` module to ``f``.

    The module consists of the ``EmojiStatus`` enum, three helper functions
    and the ``EMOJI_STATUS`` range table. The table is computed here from the
    properties that ``load_emoji_properties`` reads out of
    ``emoji/emoji-data.txt``.
    """
    # Module header, gated on the crate's "emoji" feature.
    f.write("""#[cfg(feature = \"emoji\")]
pub mod emoji {""")
    # Enum definition plus the lookup/classification helpers (static text).
    f.write("""

 #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
 #[allow(non_camel_case_types)]
 #[non_exhaustive]
 pub enum EmojiStatus {
 NonEmoji,
 NonEmojiButEmojiComponent,
 EmojiPresentation,
 EmojiModifierBase,
 EmojiPresentationAndModifierBase,
 EmojiOther,
 EmojiPresentationAndEmojiComponent,
 EmojiPresentationAndModifierAndEmojiComponent,
 EmojiOtherAndEmojiComponent,
 }
 #[inline]
 pub(crate) fn emoji_status(c: char) -> EmojiStatus {
 // FIXME: do we want to special case ASCII here?
 match c as usize {
 _ => super::util::bsearch_range_value_table(c, EMOJI_STATUS).unwrap()
 }
 }
 #[inline]
 pub(crate) fn is_emoji_status_for_emoji_char(s: EmojiStatus) -> bool {
 !matches!(s, EmojiStatus::NonEmoji | EmojiStatus::NonEmojiButEmojiComponent)
 }
 #[inline]
 pub(crate) fn is_emoji_status_for_emoji_component(s: EmojiStatus) -> bool {
 matches!(s, EmojiStatus::EmojiPresentationAndEmojiComponent |
 EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent |
 EmojiStatus::EmojiOtherAndEmojiComponent)
 }
""")

    f.write(" // Emoji status table:\n")
    emoji_status_table = load_emoji_properties("emoji/emoji-data.txt")
    # we combine things together here.

    # `Extended_Pictographic` is only for future-proof usage; we ignore it here.
    # emoji_prop_list = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component", "Extended_Pictographic"]
    emoji_prop_list = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component"]

    # need to skip surrogates because they're not representable by rust `char`s
    # (modelled as one more pseudo-property so the sweep below splits around them)
    emoji_status_table["Surrogate"] = [(0xD800, 0xDFFF)]
    emoji_prop_list.append("Surrogate")

    # Sweep state: one cursor per property into that property's range list.
    # NOTE(review): the sweep presumes each list is sorted ascending and
    # non-overlapping, as the data file appears to provide - confirm.
    emoji_prop_list_len = [len(emoji_status_table[x]) for x in emoji_prop_list]
    emoji_prop_count = len(emoji_prop_list)
    code_point_first = 0
    code_point_last = 0x10FFFF
    emoji_prop_list_pos = [0 for _ in emoji_prop_list]
    cur_group_first = code_point_first
    emoji_table = []
    # Maps the ";"-joined names of the properties active in a group (joined in
    # emoji_prop_list order) to the Rust expression stored in the table.
    def group_text(s):
        if s == "Surrogate":
            return "<Surrogate>"
        elif s == "":
            return "EmojiStatus::NonEmoji"
        elif s == "Emoji_Component":
            return "EmojiStatus::NonEmojiButEmojiComponent"
        elif s == "Emoji;Emoji_Presentation":
            return "EmojiStatus::EmojiPresentation"
        elif s == "Emoji;Emoji_Presentation;Emoji_Modifier_Base":
            return "EmojiStatus::EmojiPresentationAndModifierBase"
        elif s == "Emoji;Emoji_Modifier_Base":
            return "EmojiStatus::EmojiModifierBase"
        elif s == "Emoji":
            return "EmojiStatus::EmojiOther"
        elif s == "Emoji;Emoji_Presentation;Emoji_Component":
            return "EmojiStatus::EmojiPresentationAndEmojiComponent"
        elif s == "Emoji;Emoji_Presentation;Emoji_Modifier;Emoji_Component":
            return "EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent"
        elif s == "Emoji;Emoji_Component":
            return "EmojiStatus::EmojiOtherAndEmojiComponent"
        else:
            # Names a variant that the enum written above does not declare.
            # NOTE(review): presumably meant to make an unexpected property
            # combination visible in the generated file - confirm.
            return "EmojiStatus::NewCombination(\"" + s + "\")"
    # Walk the code point space from 0 to 0x10FFFF, cutting it into
    # consecutive groups over which the set of active properties is constant.
    while cur_group_first <= code_point_last:
        cur_group_props = []
        cur_group_last = code_point_last
        for prop_list_idx in range(emoji_prop_count):
            if emoji_prop_list_pos[prop_list_idx] >= emoji_prop_list_len[prop_list_idx]:
                # This property has no ranges left.
                continue
            elif emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][0] > cur_group_first:
                # Next range starts later: the group must end just before it.
                cur_group_last = min(cur_group_last, emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][0] - 1)
            else:
                # Range covers the group start: property is active, and the
                # group cannot extend past the end of this range.
                cur_group_props.append(emoji_prop_list[prop_list_idx])
                cur_group_last = min(cur_group_last, emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][1])
        cur_group_text = group_text(";".join(cur_group_props))
        # Groups consisting only of the surrogate pseudo-property are left out.
        if cur_group_text != "<Surrogate>":
            emoji_table.append((cur_group_first, cur_group_last, cur_group_text))
        # Advance the cursor of every active range that ends with this group.
        for prop_list_idx in range(emoji_prop_count):
            if emoji_prop_list_pos[prop_list_idx] >= emoji_prop_list_len[prop_list_idx]:
                continue
            elif emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][0] > cur_group_first:
                continue
            else:
                if cur_group_last == emoji_status_table[emoji_prop_list[prop_list_idx]][emoji_prop_list_pos[prop_list_idx]][1]:
                    emoji_prop_list_pos[prop_list_idx] += 1
        cur_group_first = cur_group_last + 1

    # Entries are (first, last, status expression); the status text is
    # written as-is, the two code points go through escape_char.
    emit_table(f, "EMOJI_STATUS", emoji_table, "&'static [(char, char, EmojiStatus)]", is_pub=False,
            pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
    # Closes `pub mod emoji {`.
    f.write("}\n\n")
| 240 | + |
def emit_util_mod(f):
    """Write the Rust ``util`` module to ``f``.

    The module is fixed text containing four binary-search helpers over
    ``'static`` slices: membership in a list of chars, value lookup keyed by
    a char, membership in a list of inclusive char ranges, and value lookup
    by inclusive char range.
    """
    f.write("""
#[allow(dead_code)]
pub mod util {
 use core::result::Result::{Ok, Err};

 #[inline]
 pub fn bsearch_table(c: char, r: &'static [char]) -> bool {
 r.binary_search(&c).is_ok()
 }

 #[inline]
 pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
 match r.binary_search_by_key(&c, |&(k, _)| k) {
 Ok(idx) => {
 let (_, v) = r[idx];
 Some(v)
 }
 Err(_) => None
 }
 }

 #[inline]
 pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
 use core::cmp::Ordering::{Equal, Less, Greater};
 r.binary_search_by(|&(lo,hi)| {
 if lo <= c && c <= hi { Equal }
 else if hi < c { Less }
 else { Greater }
 }).is_ok()
 }

 pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
 use core::cmp::Ordering::{Equal, Less, Greater};
 match r.binary_search_by(|&(lo, hi, _)| {
 if lo <= c && c <= hi { Equal }
 else if hi < c { Less }
 else { Greater }
 }) {
 Ok(idx) => {
 let (_, _, cat) = r[idx];
 Some(cat)
 }
 Err(_) => None
 }
 }

}

""")
| 291 | + |
# Script entry point: regenerate tables.rs in the current directory.
if __name__ == "__main__":
    out_path = "tables.rs"
    # Remove any previous output before writing the new file.
    if os.path.exists(out_path):
        os.remove(out_path)
    with open(out_path, "w") as out:
        # License header, generated-file marker and lint allowances.
        out.write(preamble)

        # Public constant exposing the Unicode version the tables come from.
        out.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-security is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

""" % UNICODE_VERSION)

        # Lookup helpers first, then the emoji module that uses them.
        emit_util_mod(out)
        emit_emoji_module(out)
0 commit comments