Skip to content

Commit 741303d

Browse files
committed
Add more comments.
1 parent 72cefff commit 741303d

File tree

1 file changed

+53
-7
lines changed

1 file changed

+53
-7
lines changed

scripts/unicode.py

Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,11 @@ def load_confusables(f):
148148
return confusables
149149

150150
def aliases():
151-
"""
152-
Fetch the shorthand aliases for each longhand Script name
153-
"""
151+
# This function is taken from the `unicode-script` crate. If significant
152+
# changes are introduced, update accordingly.
153+
154+
# Note that this file is in the UCD directly, not in the security directory.
155+
# We use the `fetch_unidata` function to download it.
154156
fetch_unidata("PropertyValueAliases.txt")
155157
longforms = {}
156158
shortforms = {}
@@ -170,6 +172,9 @@ def aliases():
170172
return (longforms, shortforms)
171173

172174
def load_scripts(f):
175+
# This function is taken from the `unicode-script` crate. If significant
176+
# changes are introduced, update accordingly.
177+
173178
(longforms, shortforms) = aliases()
174179
scripts = load_script_properties(f, [])
175180

@@ -235,31 +240,52 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
235240
return False
236241

237242
def load_rustc_mixedscript_confusables(f, identifier_allowed, scripts):
243+
# First, load all confusables data from confusables.txt
238244
confusables = load_confusables(f)
245+
246+
# The confusables.txt data is reductive, meaning that it is intended to be used in
247+
# on the fly substitutions. The code points that didn't occur in the file can be
248+
# seen as substitutes for themselves. So if confusables.txt says A -> C, B -> C,
249+
# and implicitly C -> C, it means A <-> B, A <-> C, B <-> C are confusable.
250+
251+
# Here we first make a dict that contains all As and Bs whose corresponding C is a single code point.
239252
seekup_map = {}
240253
for item in confusables:
241254
d_proto_list = item[1]
242255
d_source = item[0]
243256
assert(len(d_proto_list) > 0)
244257
if len(d_proto_list) == 1:
245258
seekup_map[escape_char(d_source)] = d_proto_list
246-
# collect prototypes
259+
260+
# Here we're dividing all confusable lhs and rhs(prototype) operands of the substitution into equivalence classes.
261+
# Principally we'll be using the rhs operands as the representative elements of their equivalence classes.
262+
# However, some rhs operands are a single code point, while others are not.
263+
# Here we collect them separately into `codepoint_map` and `multicodepoint_map`.
247264
codepoint_map = {}
248265
multicodepoint_map = {}
249266
for item in confusables:
250267
d_source = item[0]
268+
# According to the RFC, we'll skip those code points that are restricted from identifier usage.
251269
if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
252270
continue
253271
d_proto_list = item[1]
254272
if len(d_proto_list) == 1:
255273
d_proto = escape_char(d_proto_list[0])
274+
# we use the escaped representation of rhs as key to the dict when creating new equivalence class.
256275
if d_proto not in codepoint_map:
257276
codepoint_map[d_proto] = []
277+
# when we create new equivalence class, we'll check whether the representative element should be collected.
278+
# i.e. if it is not subject to substitution, and not restricted from identifier usage,
279+
# we collect it into the equivalence class.
258280
if d_proto not in seekup_map and is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
259281
codepoint_map[d_proto].append(d_proto_list[0])
282+
# we collect the original code point to be substituted into this list.
260283
codepoint_map[d_proto].append(d_source)
261284
else:
262285
d_protos = escape_char_list(d_proto_list)
286+
# difference in multi code point case: the rhs part is not directly usable, however we store it in
287+
# dict for further special examination between each lhs and this multi code point rhs.
288+
# and there's an extra level of tuple here.
263289
if d_protos not in multicodepoint_map:
264290
multicodepoint_map[d_protos] = (d_proto_list, [])
265291
multicodepoint_map[d_protos][1].append(d_source)
@@ -274,24 +300,33 @@ def confusable_entry_item(confusable, script, item_text, item):
274300
script_entry[item_text] = (item, [])
275301
return script_entry[item_text][1]
276302

277-
# between single charpoint that has single charpoint prototype
303+
# First, let's examine the case where each code point has a single code point prototype.
278304
for _, source in codepoint_map.items():
279305
source_len = len(source)
306+
# Examine each pair in the equivalence class
280307
for i in range(0, source_len - 1):
281308
for j in range(i + 1, source_len):
282309
item_i, item_j = source[i], source[j]
283310
script_i, script_j = codepoint_script(item_i, scripts), codepoint_script(item_j, scripts)
311+
# If they're in the same script, just skip this pair.
284312
if script_i == script_j:
285313
continue
314+
# If `item_i` (the first) is in a non-ignored script, and `item_j` (the second) is in a different one (maybe ignored),
315+
# this means that this usage of the `item_i` can be suspicious, when it occurs in a document that is written in `script_j`.
316+
# We'll consider it a mixed_script_confusable code point.
286317
if not is_script_ignored_in_mixedscript(script_i):
318+
# store it within the map, saving as much information as possible, for further investigation on the final results.
287319
confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append(item_j)
320+
# Do the same in reverse from `item_j` to `item_i`
288321
if not is_script_ignored_in_mixedscript(script_j):
289322
confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)
290323

291-
# between single charpoint that has multi charpoint prototype
324+
# Then let's examine the case where each code point has a multiple code point prototype.
325+
# We'll check between the code points that share the same prototype.
292326
for _, proto_lst_and_source in multicodepoint_map.items():
293327
source = proto_lst_and_source[1]
294328
source_len = len(source)
329+
# This is basically the same as the single code point case.
295330
for i in range(0, source_len - 1):
296331
for j in range(i + 1, source_len):
297332
item_i, item_j = source[i], source[j]
@@ -304,10 +339,11 @@ def confusable_entry_item(confusable, script, item_text, item):
304339
confusable_entry_item(mixedscript_confusable, script_j, escape_char(item_j), item_j).append(item_i)
305340

306341
mixedscript_confusable_unresolved = {}
307-
# single charpoint that has multi charpoint prototype and its prototype
342+
# We'll also check between each code point and its multiple code point prototype.
308343
for _, proto_lst_and_source in multicodepoint_map.items():
309344
proto_lst = proto_lst_and_source[0]
310345
proto_lst_can_be_part_of_identifier = True
346+
# If the prototype contains one or more restricted code points, then we skip it.
311347
for c in proto_lst:
312348
if not is_codepoint_identifier_allowed(c, identifier_allowed):
313349
proto_lst_can_be_part_of_identifier = False
@@ -318,15 +354,25 @@ def confusable_entry_item(confusable, script, item_text, item):
318354
source_len = len(source)
319355
for i in range(0, source_len):
320356
item_i = source[i]
357+
# So here we're just checking whether the single code point should be considered confusable.
321358
script_i = codepoint_script(item_i, scripts)
359+
# If it's in ignored script, we don't need to do anything here.
322360
if is_script_ignored_in_mixedscript(script_i):
323361
continue
362+
# Here are some rules for examining whether the single code point should be considered confusable.
363+
# The principle is that, when substitution happens, no new non-ignored scripts are introduced, and its
364+
# own script is not lost.
324365
processed, should_add = process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts)
325366
if should_add:
326367
assert(processed)
368+
# Mark the single code point as confusable.
327369
confusable_entry_item(mixedscript_confusable, script_i, escape_char(item_i), item_i).append('multi')
328370
if processed:
371+
# Finished dealing with this code point.
329372
continue
373+
# If it's not processed, we must be dealing with a newer version of the Unicode data, which introduced some significant
374+
# changes. We don't throw an exception here, instead we collect it into a table for debugging purpose, and throw
375+
# an exception after we returned and printed the table out.
330376
proto_lst_text = escape_char_list(proto_lst)
331377
if not proto_lst_text in mixedscript_confusable_unresolved:
332378
mixedscript_confusable_unresolved[proto_lst_text] = (proto_lst, [])

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy