Skip to content

Commit 31bd869

Browse files
committed
Add more comments to document the details.
1 parent a18eb9c commit 31bd869

File tree

1 file changed

+40
-1
lines changed

1 file changed

+40
-1
lines changed

scripts/unicode.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,15 @@
1212

1313
# This script uses the following Unicode security tables:
1414
# - IdentifierStatus.txt
15+
# - IdentifierType.txt
16+
# - PropertyValueAliases.txt
17+
# - confusables.txt
1518
# - ReadMe.txt
19+
# This script also uses the following Unicode UCD data:
20+
# - Scripts.txt
1621
#
1722
# Since this should not require frequent updates, we just store this
18-
# out-of-line and check the unicode.rs file into git.
23+
# out-of-line and check the tables.rs file into git.
1924

2025
import fileinput, re, os, sys, operator
2126

@@ -38,6 +43,7 @@
3843

3944
UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION
4045

46+
# Download a Unicode security table file
4147
def fetch(f):
4248
if not os.path.exists(os.path.basename(f)):
4349
os.system("curl -O http://www.unicode.org/Public/security/%s/%s"
@@ -47,6 +53,7 @@ def fetch(f):
4753
sys.stderr.write("cannot load %s\n" % f)
4854
exit(1)
4955

56+
# Download a UCD table file
5057
def fetch_unidata(f):
5158
if not os.path.exists(os.path.basename(f)):
5259
os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s"
@@ -56,6 +63,8 @@ def fetch_unidata(f):
5663
sys.stderr.write("cannot load %s" % f)
5764
exit(1)
5865

66+
# Loads code point data from IdentifierStatus.txt and
67+
# IdentifierType.txt
5968
# Implementation from unicode-segmentation
6069
def load_properties(f, interestingprops = None):
6170
fetch(f)
@@ -90,6 +99,7 @@ def load_properties(f, interestingprops = None):
9099

91100
return props
92101

102+
# Loads script data from Scripts.txt
93103
def load_script_properties(f, interestingprops):
94104
fetch_unidata(f)
95105
props = {}
@@ -125,6 +135,7 @@ def load_script_properties(f, interestingprops):
125135

126136
return props
127137

138+
# Loads confusables data from confusables.txt
128139
def load_confusables(f):
129140
fetch(f)
130141
confusables = []
@@ -147,6 +158,7 @@ def load_confusables(f):
147158

148159
return confusables
149160

161+
# Loads Unicode script name correspondence from PropertyValueAliases.txt
150162
def aliases():
151163
# This function is taken from the `unicode-script` crate. If significant
152164
# changes are introduced, update accordingly.
@@ -171,6 +183,7 @@ def aliases():
171183

172184
return (longforms, shortforms)
173185

186+
# Loads Unicode script name list and correspondence mapping
174187
def load_scripts(f):
175188
# This function is taken from the `unicode-script` crate. If significant
176189
# changes are introduced, update accordingly.
@@ -192,6 +205,16 @@ def load_scripts(f):
192205
def is_script_ignored_in_mixedscript(source):
193206
return source == 'Zinh' or source == 'Zyyy' or source == 'Zzzz'
194207

208+
# When a codepoint's prototype consists of multiple codepoints,
209+
# the situation is more complex. Here we make up a few rules
210+
# to cover all the cases in confusables.txt.
211+
# The principle is that, when replacing the original codepoint with its prototype,
212+
# a "non-ignored script" neither appears nor disappears.
213+
#
214+
# We make up several rules to cover the cases that occur within confusables.txt.
215+
# Return True, True when we want to consider it confusable,
216+
# return True, False when we want to consider it non-confusable,
217+
# and return False, _ when new not-yet-processed cases are added in future Unicode versions.
195218
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
196219
script_lst = script_list(proto_lst, scripts)
197220
script_lst.sort()
@@ -239,6 +262,21 @@ def is_codepoint_identifier_allowed(c, identifier_allowed):
239262
return True
240263
return False
241264

265+
# This function loads and generates a table of all the confusable characters.
266+
# It returns a pair consisting of a `mixedscript_confusable` table and a
267+
# `mixedscript_confusable_unresolved` table.
268+
# The `mixedscript_confusable` table is a dict whose keys are Unicode script names, and each
269+
# entry's value is an inner dict. The inner dict's keys are confusable code points
270+
# converted to strings with the `escape_char` function, and its values are pairs.
271+
# pair[0] keeps a copy of the confusable code point itself, but as an integer.
272+
# pair[1] keeps a list of all the code points that are mixed script confusable with it,
273+
# which is only used for debugging purposes.
274+
# Note that the string 'multi' will occur in the list when pair[0] is considered
275+
# confusable with its multiple code point prototype.
276+
# Usually the `mixedscript_confusable_unresolved` table is empty, but it's possible
277+
# that a future Unicode version update may cause that table to become nonempty, in which
278+
# case more rules need to be added to the `process_mixedscript_single_to_multi` function
279+
# above to cover those new cases.
242280
def load_potential_mixedscript_confusables(f, identifier_allowed, scripts):
243281
# First, load all confusables data from confusables.txt
244282
confusables = load_confusables(f)
@@ -375,6 +413,7 @@ def codepoint_script(c, scripts):
375413
return script
376414
raise Exception("Not in scripts: " + escape_char(c))
377415

416+
# Emit some useful information for debugging when further updates happen.
378417
def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts):
379418
f.write("/* " + text + "\n")
380419
for script, lst in mixedscript_confusable.items():

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy