16
16
# Since this should not require frequent updates, we just store this
17
17
# out-of-line and check the generated tables.rs file into git.
18
18
19
- import fileinput , re , os , sys
19
+ import fileinput
20
+ import re
21
+ import os
22
+ import sys
20
23
21
24
preamble = '''// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
22
25
// file at the top-level directory of this distribution and at
36
39
37
40
# Unicode version as a (major, minor, update) tuple.
UNICODE_VERSION = (13, 0, 0)

# Dotted string form of UNICODE_VERSION, e.g. "13.0.0".
UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION
43
+
40
44
41
45
def escape_char(c):
    """Format an integer code point as a Rust character literal.

    The result has the form ``'\\u{XXXX}'`` (single quotes included), with
    the code point written in lowercase hexadecimal and no padding.

    :param c: integer code point to format.
    :return: the Rust char-literal source text as a ``str``.
    """
    # No whitespace may appear between the backslash and ``u``; otherwise
    # the emitted text is not a valid Rust unicode escape.
    return "'\\u{%x}'" % c
43
47
48
+
44
49
def fetch (f ):
45
50
if not os .path .exists (os .path .basename (f )):
46
51
if "emoji" in f :
@@ -55,6 +60,8 @@ def fetch(f):
55
60
exit (1 )
56
61
57
62
# Implementation from unicode-segmentation
63
+
64
+
58
65
def load_names (f , interestingprops ):
59
66
fetch (f )
60
67
normal_names = {}
@@ -76,17 +83,21 @@ def load_names(f, interestingprops):
76
83
normal_names [d_ch ] = d_name
77
84
return (normal_names , special_names )
78
85
86
+
79
87
# Separator placed around special symbols when tokenizing a name.
SPACE_SYMBOL = ' '
# Placeholder substituted for the character's own code point text.
CODEPOINT_SYMBOL = '@'
# Characters that are always isolated as tokens of their own.
SPECIAL_SYMBOLS = ['-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


def tokenize(str, codepoint):
    """Rewrite a character name so its tokens are separated by spaces.

    Every occurrence of ``codepoint`` in the name is first replaced by
    ``CODEPOINT_SYMBOL``. Each entry of ``SPECIAL_SYMBOLS`` is then wrapped
    in ``SPACE_SYMBOL`` on both sides, so adjacent special symbols produce
    consecutive separators.

    :param str: the name to tokenize. The parameter name shadows the
        builtin; it is kept so existing callers keep working.
    :param codepoint: the code point text to replace with the placeholder.
    :return: the rewritten name as a single string.
    """
    name_sep = str.replace(codepoint, CODEPOINT_SYMBOL)
    for symbol in SPECIAL_SYMBOLS:
        # Exactly one replace per symbol: running it twice would pad the
        # symbol with two separators on each side.
        name_sep = name_sep.replace(
            symbol, SPACE_SYMBOL + symbol + SPACE_SYMBOL)
    return name_sep
89
99
100
+
90
101
def make_wordset (names ):
91
102
word_set = {}
92
103
word_set [SPACE_SYMBOL ] = SPACE_SYMBOL
@@ -97,6 +108,7 @@ def make_wordset(names):
97
108
word_set [word ] = word
98
109
return word_set
99
110
111
+
100
112
class WordIndex :
101
113
def __init__ (self , normal_names ):
102
114
word_set = make_wordset (normal_names )
@@ -123,7 +135,7 @@ def __init__(self, normal_names):
123
135
self .word_map = word_map
124
136
self .special_map = special_map
125
137
self .special_list = special_list
126
-
138
+
127
139
def encode (self , name , codepoint ):
128
140
name_sep = tokenize (name , codepoint )
129
141
word_list = name_sep .split (SPACE_SYMBOL )
@@ -143,16 +155,18 @@ def encode(self, name, codepoint):
143
155
encoded_sequence .append (self .word_map [SPACE_SYMBOL ])
144
156
name_build += SPACE_SYMBOL
145
157
if not name [len (name_build ):].startswith (word ):
146
- raise Exception ("Divergence! [%s] - [%s] vs [%s]" % (name , name [len (name_build ):], word ))
158
+ raise Exception (
159
+ "Divergence! [%s] - [%s] vs [%s]" % (name , name [len (name_build ):], word ))
147
160
name_build += word
148
161
encoded_sequence .append (word_idx )
149
162
# encoded_sequence.append(word)
150
163
last_is_special = word_is_special
151
164
if name_build != name :
152
165
raise Exception ("Different! [%s] vs [%s]" % (name , name_build ))
153
-
166
+
154
167
return encoded_sequence
155
168
169
+
156
170
def create_intervals (list ):
157
171
list .sort ()
158
172
in_group = False
@@ -167,8 +181,10 @@ def create_intervals(list):
167
181
in_group = False
168
182
return result
169
183
184
+
170
185
def create_normal_groups (normal_names ):
171
- normal_intervals = create_intervals ([int (key , 16 ) for key in normal_names .keys ()])
186
+ normal_intervals = create_intervals (
187
+ [int (key , 16 ) for key in normal_names .keys ()])
172
188
encoded_groups = []
173
189
for first , last in normal_intervals :
174
190
group_buffer = []
@@ -184,6 +200,7 @@ def create_normal_groups(normal_names):
184
200
encoded_groups .append ((first , last , group_buffer , pos_buffer ))
185
201
return encoded_groups
186
202
203
+
187
204
def create_special_groups (special_names ):
188
205
item_idx = 0
189
206
item_count = len (special_names )
@@ -198,9 +215,11 @@ def create_special_groups(special_names):
198
215
label = m1 .group (1 )
199
216
m2 = re2 .match (special_names [item_idx + 1 ][1 ])
200
217
if not m2 or m2 .group (1 ) != label :
201
- raise Exception ("Pair mismatch! [%s] vs [%s]" % (special_names [item_idx ], special_names [item_idx + 1 ]))
202
-
203
- result .append ((int (special_names [item_idx ][0 ], 16 ), int (special_names [item_idx + 1 ][0 ], 16 ), label ))
218
+ raise Exception ("Pair mismatch! [%s] vs [%s]" % (
219
+ special_names [item_idx ], special_names [item_idx + 1 ]))
220
+
221
+ result .append ((int (special_names [item_idx ][0 ], 16 ), int (
222
+ special_names [item_idx + 1 ][0 ], 16 ), label ))
204
223
item_idx += 2
205
224
continue
206
225
m3 = re3 .match (item_text )
@@ -211,25 +230,30 @@ def create_special_groups(special_names):
211
230
while try_item_idx < item_count and special_names [try_item_idx ][1 ] == item_text :
212
231
last_repeat_item_idx = try_item_idx
213
232
try_item_idx += 1
214
- result .append ((int (special_names [item_idx ][0 ], 16 ), int (special_names [last_repeat_item_idx ][0 ], 16 ), label ))
233
+ result .append ((int (special_names [item_idx ][0 ], 16 ), int (
234
+ special_names [last_repeat_item_idx ][0 ], 16 ), label ))
215
235
item_idx = last_repeat_item_idx + 1
216
236
continue
217
-
237
+
218
238
raise Exception ("Unexpected item: %s" % item_text )
219
239
return result
220
240
241
+
221
242
def write_enumeration_char_names(rf, encoded_groups):
    """Emit the ``ENUMERATION_CHAR_NAMES`` Rust constant to ``rf``.

    Writes the constant's declaration line, then one tab-indented tuple
    line per group, then the closing ``];``.

    :param rf: output object; only its ``write`` method is used.
    :param encoded_groups: iterable of
        ``(first, last, group_buffer, pos_buffer)`` tuples. ``first`` and
        ``last`` are formatted with ``%d``; the two buffers are formatted
        with ``%s`` (presumably lists of ints, whose Python repr is also a
        valid Rust array literal -- verify against the caller).
    """
    rf.write("""
pub const ENUMERATION_CHAR_NAMES: &'static [(u32, u32, &'static [u16], &'static [u32])] = &[
""")
    for (first, last, group_buffer, pos_buffer) in encoded_groups:
        rf.write("\t(%d, %d, &%s, &%s),\n" %
                 (first, last, group_buffer, pos_buffer))
    rf.write("""];
""")
229
251
252
+
230
253
def write_special_groups (rf , special_groups ):
231
254
rf .write ("""
232
255
#[allow(non_camel_case_types)]
256
+ #[derive(Copy, Clone, PartialEq, Eq)]
233
257
pub enum SpecialGroup {
234
258
""" )
235
259
for ((_ , _ , groupname )) in special_groups :
@@ -249,8 +273,27 @@ def write_special_groups(rf, special_groups):
249
273
if (idx + 1 ) % 2 == 0 :
250
274
rf .write ('\n ' )
251
275
rf .write ("""];
276
+
277
+ pub fn find_in_special_groups(ch: u32) -> Option<SpecialGroup> {
278
+ let record_idx = SPECIAL_GROUPS
279
+ .binary_search_by(|record| {
280
+ use std::cmp::Ordering;
281
+ if record.1 < ch {
282
+ Ordering::Less
283
+ } else if record.0 > ch {
284
+ Ordering::Greater
285
+ } else {
286
+ Ordering::Equal
287
+ }
288
+ })
289
+ .ok()?;
290
+ let group = SPECIAL_GROUPS[record_idx].2;
291
+ Some(group)
292
+ }
293
+
252
294
""" )
253
295
296
+
254
297
def write_word_table (rf , word_table ):
255
298
rf .write ("""
256
299
pub const ENUMERATION_WORD_TABLE: &'static [&'static str] = &[
@@ -262,8 +305,29 @@ def write_word_table(rf, word_table):
262
305
if (idx + 1 ) % 8 == 0 :
263
306
rf .write ('\n ' )
264
307
rf .write ("""];
308
+
309
+ pub fn find_in_enumerate_names(ch: u32) -> Option<&'static [u16]> {
310
+ let record_idx = ENUMERATION_CHAR_NAMES
311
+ .binary_search_by(|record| {
312
+ use std::cmp::Ordering;
313
+ if record.1 < ch {
314
+ Ordering::Less
315
+ } else if record.0 > ch {
316
+ Ordering::Greater
317
+ } else {
318
+ Ordering::Equal
319
+ }
320
+ })
321
+ .ok()?;
322
+ let offset = (ch - ENUMERATION_CHAR_NAMES[record_idx].0) as usize;
323
+ let index_slice = ENUMERATION_CHAR_NAMES[record_idx].2;
324
+ let offset_slice = ENUMERATION_CHAR_NAMES[record_idx].3;
325
+ let range = (offset_slice[offset] as usize)..(offset_slice[offset + 1] as usize);
326
+ Some(&index_slice[range])
327
+ }
265
328
""" )
266
329
330
+
267
331
def write_special_symbols (rf , word_index ):
268
332
rf .write ("""
269
333
pub const WORD_TABLE_INDEX_SPACE: u16 = %d;
@@ -279,12 +343,12 @@ def write_special_symbols(rf, word_index):
279
343
""" )
280
344
for (first , last ) in special_intervals :
281
345
rf .write ("\t \t %d..=%d => true,\n " % (first , last ))
282
- rf .write ("""
283
- _ => false,
346
+ rf .write ("""\t \t _ => false,
284
347
}
285
348
}
286
349
""" )
287
350
351
+
288
352
if __name__ == "__main__" :
289
353
r = "tables.rs"
290
354
if os .path .exists (r ):
@@ -302,9 +366,8 @@ def write_special_symbols(rf, word_index):
302
366
word_index = WordIndex (normal_names )
303
367
normal_encoded_groups = create_normal_groups (normal_names )
304
368
special_groups = create_special_groups (special_names )
305
-
369
+
306
370
write_enumeration_char_names (rf , normal_encoded_groups )
307
371
write_special_groups (rf , special_groups )
308
372
write_word_table (rf , word_index .word_list )
309
373
write_special_symbols (rf , word_index )
310
-
0 commit comments