Skip to content

Commit dfb02a6

Browse files
committed
Implement emoji properties.
1 parent 1dd8fc9 commit dfb02a6

File tree

4 files changed

+870
-6
lines changed

4 files changed

+870
-6
lines changed

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,8 @@ name = "unicode-properties"
33
version = "0.1.0"
44
edition = "2021"
55

6+
[features]
7+
emoji = []
8+
default = ["emoji"]
9+
610
[dependencies]

scripts/unicode.py

Lines changed: 309 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
#!/usr/bin/env python3
2+
#
3+
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
4+
# file at the top-level directory of this distribution and at
5+
# http://rust-lang.org/COPYRIGHT.
6+
#
7+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10+
# option. This file may not be copied, modified, or distributed
11+
# except according to those terms.
12+
13+
# This script uses the following Unicode UCD data:
14+
# - emoji/emoji-data.txt
15+
#
16+
# Since this should not require frequent updates, we just store this
17+
# out-of-line and check the tables.rs file into git.
18+
19+
import fileinput, re, os, sys, operator
20+
21+
preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
22+
// file at the top-level directory of this distribution and at
23+
// http://rust-lang.org/COPYRIGHT.
24+
//
25+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
26+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
27+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
28+
// option. This file may not be copied, modified, or distributed
29+
// except according to those terms.
30+
31+
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
32+
33+
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
34+
'''
35+
36+
# Unicode release the generated tables are built from, as (major, minor, update).
UNICODE_VERSION = (15, 0, 0)

# Dotted form of the version, used as a path component of the UCD download URL.
UNICODE_VERSION_NUMBER = ".".join(str(part) for part in UNICODE_VERSION)
39+
40+
# Download a UCD table file
41+
def fetch_unidata(f):
    """Make sure the UCD data file ``f`` exists in the current directory.

    ``f`` is a path relative to the ``ucd/`` directory of the Unicode release
    named by ``UNICODE_VERSION_NUMBER`` (e.g. ``"emoji/emoji-data.txt"``).
    Only its basename is used for the local copy. If that file is already
    present nothing is downloaded; otherwise it is fetched with ``curl``.

    Exits the process with status 1 if the file is still missing afterwards.
    """
    import subprocess

    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        url = "https://www.unicode.org/Public/%s/ucd/%s" % (UNICODE_VERSION_NUMBER, f)
        try:
            # --fail makes curl treat HTTP errors (404, 500, ...) as failures
            # instead of saving the server's error page under the data file's
            # name, which would then be parsed as an empty table.
            # Passing an argument list avoids going through the shell.
            subprocess.call(["curl", "--fail", "-O", url])
        except OSError:
            # curl is not installed or not executable; fall through to the
            # common "cannot load" error below.
            pass

    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)
49+
50+
# Loads code point data from emoji-data.txt
51+
# Implementation from unicode-segmentation
52+
def load_emoji_properties(f):
    """Parse an emoji-data.txt style file into per-property code point ranges.

    The file is fetched first if it is not available locally. Returns a dict
    mapping each property name to a list of inclusive ``(first, last)`` code
    point pairs, in the order they appear in the file. Lines matching neither
    the single-code-point nor the range pattern are skipped.

    Implementation adapted from unicode-segmentation.
    """
    fetch_unidata(f)

    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) *#")

    by_property = {}
    for row in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        single = single_re.match(row)
        if single:
            # A lone code point is stored as a one-element range.
            first_hex = last_hex = single.group(1)
            prop = single.group(2).strip()
        else:
            ranged = range_re.match(row)
            if not ranged:
                continue
            first_hex = ranged.group(1)
            last_hex = ranged.group(2)
            prop = ranged.group(3).strip()

        bounds = (int(first_hex, 16), int(last_hex, 16))
        by_property.setdefault(prop, []).append(bounds)

    return by_property
82+
83+
def format_table_content(f, content, indent):
    """Write comma-separated ``content`` to ``f`` as wrapped, indented lines.

    ``content`` is split on ``","``. Pieces are packed onto a line, joined by
    ``", "``, while the line stays under 98 characters; a piece that would not
    fit ends the current line with ``",\\n"`` and starts a new one. Every line
    begins with ``indent`` spaces. No newline is written after the last line.
    """
    pad = " " * indent
    current = pad
    at_start = True
    for piece in content.split(","):
        if len(current) + len(piece) >= 98:
            # Piece does not fit: flush the line and start the next with it.
            f.write(current + ",\n")
            current = pad + piece
            continue
        current += piece if at_start else ", " + piece
        at_start = False
    f.write(current)
97+
98+
def escape_char(c):
    """Render a code point as a Rust ``char`` literal such as ``'\\u{1f600}'``.

    The sentinel string ``'multi'`` is rendered as the quoted placeholder
    ``"<multiple code points>"`` instead.
    """
    return "\"<multiple code points>\"" if c == 'multi' else "'\\u{%x}'" % c
102+
103+
def escape_char_list(l):
    """Render an iterable of code points as a bracketed list of Rust char literals.

    Each element is formatted with ``escape_char`` and the results are joined
    with ``", "`` inside ``[`` and ``]``. An empty input yields ``"[]"``.
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
114+
115+
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    """Write a Rust slice declaration named ``name`` holding ``t_data`` to ``f``.

    f        -- writable text file object
    name     -- identifier of the emitted item
    t_data   -- iterable of table entries
    t_type   -- Rust type annotation of the item
    is_pub   -- prefix the declaration with ``pub``
    pfun     -- formats one entry as Rust source; must not emit a bare ","
                other than between the fields it wants wrapped separately,
                because the joined text is re-split on "," for line wrapping
    is_const -- emit ``const`` when true, ``let`` otherwise
    """
    keyword = "const" if is_const else "let"
    if is_pub:
        keyword = "pub " + keyword
    f.write(" %s %s: %s = &[\n" % (keyword, name, t_type))
    # Entries are joined with bare commas; format_table_content re-splits on
    # them and handles spacing and wrapping.
    format_table_content(f, ",".join(pfun(entry) for entry in t_data), 8)
    f.write("\n ];\n\n")
132+
133+
def emit_emoji_module(f):
    """Write the feature-gated Rust ``emoji`` module to ``f``.

    The module contains the ``EmojiStatus`` enum, the lookup helpers, and the
    ``EMOJI_STATUS`` range table. The table is built by sweeping the code
    space 0..=0x10FFFF and cutting it into maximal runs over which the set of
    emoji properties read from emoji/emoji-data.txt does not change. The
    surrogate range is left out because Rust ``char`` cannot represent it.
    """
    f.write("""#[cfg(feature = \"emoji\")]
pub mod emoji {""")
    # NOTE(review): the generated is_emoji_status_for_emoji_component does not
    # list NonEmojiButEmojiComponent, so characters whose only property is
    # Emoji_Component are reported as non-components. Confirm that is intended.
    f.write("""

#[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
#[allow(non_camel_case_types)]
#[non_exhaustive]
pub enum EmojiStatus {
NonEmoji,
NonEmojiButEmojiComponent,
EmojiPresentation,
EmojiModifierBase,
EmojiPresentationAndModifierBase,
EmojiOther,
EmojiPresentationAndEmojiComponent,
EmojiPresentationAndModifierAndEmojiComponent,
EmojiOtherAndEmojiComponent,
}
#[inline]
pub(crate) fn emoji_status(c: char) -> EmojiStatus {
// FIXME: do we want to special case ASCII here?
match c as usize {
_ => super::util::bsearch_range_value_table(c, EMOJI_STATUS).unwrap()
}
}
#[inline]
pub(crate) fn is_emoji_status_for_emoji_char(s: EmojiStatus) -> bool {
!matches!(s, EmojiStatus::NonEmoji | EmojiStatus::NonEmojiButEmojiComponent)
}
#[inline]
pub(crate) fn is_emoji_status_for_emoji_component(s: EmojiStatus) -> bool {
matches!(s, EmojiStatus::EmojiPresentationAndEmojiComponent |
EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent |
EmojiStatus::EmojiOtherAndEmojiComponent)
}
""")

    f.write(" // Emoji status table:\n")
    ranges_by_prop = load_emoji_properties("emoji/emoji-data.txt")

    # Properties that take part in the classification, in the order used to
    # build the lookup key below. `Extended_Pictographic` exists in the data
    # file but is only there for future-proofing, so it is ignored here.
    props = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component"]

    # Surrogates are modelled as a pseudo-property so the sweep cuts them out
    # of the surrounding NonEmoji run; they're not representable by rust `char`s.
    ranges_by_prop["Surrogate"] = [(0xD800, 0xDFFF)]
    props.append("Surrogate")

    # Key: the active property names joined with ";" in `props` order.
    status_for_props = {
        "Surrogate": "<Surrogate>",
        "": "EmojiStatus::NonEmoji",
        "Emoji_Component": "EmojiStatus::NonEmojiButEmojiComponent",
        "Emoji;Emoji_Presentation": "EmojiStatus::EmojiPresentation",
        "Emoji;Emoji_Presentation;Emoji_Modifier_Base": "EmojiStatus::EmojiPresentationAndModifierBase",
        "Emoji;Emoji_Modifier_Base": "EmojiStatus::EmojiModifierBase",
        "Emoji": "EmojiStatus::EmojiOther",
        "Emoji;Emoji_Presentation;Emoji_Component": "EmojiStatus::EmojiPresentationAndEmojiComponent",
        "Emoji;Emoji_Presentation;Emoji_Modifier;Emoji_Component": "EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent",
        "Emoji;Emoji_Component": "EmojiStatus::EmojiOtherAndEmojiComponent",
    }

    range_lists = [ranges_by_prop[prop] for prop in props]
    cursors = [0] * len(props)  # index of the next unconsumed range per property
    last_code_point = 0x10FFFF
    rows = []

    run_start = 0
    while run_start <= last_code_point:
        active = []
        run_end = last_code_point
        for idx, prop in enumerate(props):
            if cursors[idx] >= len(range_lists[idx]):
                continue
            lo, hi = range_lists[idx][cursors[idx]]
            if lo > run_start:
                # Property begins later: the run must stop just before it.
                run_end = min(run_end, lo - 1)
            else:
                # Property covers run_start: the run cannot outlast its range.
                active.append(prop)
                run_end = min(run_end, hi)

        key = ";".join(active)
        # An unknown combination produces a variant that does not exist in the
        # enum written above.
        label = status_for_props.get(key, "EmojiStatus::NewCombination(\"" + key + "\")")
        if label != "<Surrogate>":
            rows.append((run_start, run_end, label))

        # Advance past every range that ends exactly where this run ends.
        for idx in range(len(props)):
            if cursors[idx] >= len(range_lists[idx]):
                continue
            lo, hi = range_lists[idx][cursors[idx]]
            if lo <= run_start and hi == run_end:
                cursors[idx] += 1

        run_start = run_end + 1

    emit_table(f, "EMOJI_STATUS", rows, "&'static [(char, char, EmojiStatus)]", is_pub=False,
            pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
    f.write("}\n\n")
240+
241+
def emit_util_mod(f):
    """Write the Rust ``util`` module (binary-search helpers) to ``f``.

    The emitted module provides four lookups over sorted static tables:
    membership in a list of chars, value lookup by exact char, membership in
    a list of inclusive char ranges, and value lookup by inclusive char range.
    """
    util_source = """
#[allow(dead_code)]
pub mod util {
use core::result::Result::{Ok, Err};

#[inline]
pub fn bsearch_table(c: char, r: &'static [char]) -> bool {
r.binary_search(&c).is_ok()
}

#[inline]
pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
match r.binary_search_by_key(&c, |&(k, _)| k) {
Ok(idx) => {
let (_, v) = r[idx];
Some(v)
}
Err(_) => None
}
}

#[inline]
pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
use core::cmp::Ordering::{Equal, Less, Greater};
r.binary_search_by(|&(lo,hi)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}).is_ok()
}

pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, cat) = r[idx];
Some(cat)
}
Err(_) => None
}
}

}

"""
    f.write(util_source)
291+
292+
# Script entry point: regenerate tables.rs in the current directory.
if __name__ == "__main__":
    r = "tables.rs"
    # Start from a clean file rather than writing through an existing one.
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # The doc comment names this crate (unicode-properties, per Cargo.toml);
        # it previously said "unicode-security", a leftover from the crate the
        # script was adapted from.
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-properties is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

""" % UNICODE_VERSION)

        # util must be emitted: the emoji module's lookup calls super::util.
        emit_util_mod(rf)
        ### emoji module
        emit_emoji_module(rf)

src/lib.rs

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,33 @@
1-
#[cfg(test)]
2-
mod tests {
3-
#[test]
4-
fn it_works() {
5-
let result = 2 + 2;
6-
assert_eq!(result, 4);
1+
#[rustfmt::skip]
2+
mod tables;
3+
4+
#[cfg(feature = "emoji")]
mod emoji {
    pub use crate::tables::emoji::EmojiStatus;
    use crate::tables::emoji::{
        emoji_status as lookup_status, is_emoji_status_for_emoji_char,
        is_emoji_status_for_emoji_component,
    };

    /// Emoji-related queries on a value.
    pub trait UnicodeEmoji: Sized {
        /// Returns the [`EmojiStatus`] classification of `self`.
        fn emoji_status(self) -> EmojiStatus;

        /// Returns `true` unless the status is `NonEmoji` or
        /// `NonEmojiButEmojiComponent`.
        fn is_emoji_char(self) -> bool {
            let status = self.emoji_status();
            is_emoji_status_for_emoji_char(status)
        }

        /// Returns `true` when the status is one of the
        /// `…AndEmojiComponent` variants.
        fn is_emoji_component(self) -> bool {
            let status = self.emoji_status();
            is_emoji_status_for_emoji_component(status)
        }
    }

    impl UnicodeEmoji for char {
        /// Looks the character up in the generated emoji status table.
        fn emoji_status(self) -> EmojiStatus {
            lookup_status(self)
        }
    }
}
26+
27+
pub use tables::UNICODE_VERSION;
28+
29+
#[cfg(feature = "emoji")]
30+
pub use emoji::UnicodeEmoji;
31+
32+
#[cfg(feature = "emoji")]
33+
pub use emoji::EmojiStatus;

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy