Commit 9a206d0

Improve script generating unaccent rules

Script now uses the standard Unicode transliterator Latin-ASCII.

Author: Leonard Benedetti
1 parent 3aff33a, commit 9a206d0

2 files changed: +762, -56 lines changed
contrib/unaccent/generate_unaccent_rules.py

Lines changed: 107 additions & 37 deletions
@@ -1,20 +1,33 @@
-#!/usr/bin/python
+#!/usr/bin/python2
+# -*- coding: utf-8 -*-
 #
 # This script builds unaccent.rules on standard output when given the
-# contents of UnicodeData.txt[1] on standard input. Optionally includes
-# ligature expansion, if --expand-ligatures is given on the command line.
+# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as
+# arguments. Optionally includes ligature expansion and Unicode CLDR
+# Latin-ASCII transliterator, enabled by default, this can be disabled
+# with "--no-ligatures-expansion" command line option.
 #
 # The approach is to use the Unicode decomposition data to identify
 # precomposed codepoints that are equivalent to a ligature of several
 # letters, or a base letter with any number of diacritical marks.
-# There is also a small set of special cases for codepoints that we
-# traditionally support even though Unicode doesn't consider them to
-# be ligatures or letters with marks.
 #
-# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
+# This approach handles most letters with diacritical marks and some
+# ligatures. However, several characters (notably a majority of
+# ligatures) don't have decomposition. To handle all these cases, one can
+# use a standard Unicode transliterator available in Common Locale Data
+# Repository (CLDR): Latin-ASCII. This transliterator associates Unicode
+# characters to ASCII-range equivalent. Unless "--no-ligatures-expansion"
+# option is enabled, the XML file of this transliterator [2] -- given as a
+# command line argument -- will be parsed and used.
+#
+# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
+# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
+
 
 import re
+import argparse
 import sys
+import xml.etree.ElementTree as ET
 
 def print_record(codepoint, letter):
     print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
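
The rewritten header above relies on two inputs: UnicodeData.txt for decomposition data and Latin-ASCII.xml for the CLDR transliterator. As a minimal sketch of the decomposition part (not part of the commit; the sample record is abbreviated from the UnicodeData.txt format), this is roughly how one semicolon-separated record is read:

    # Minimal sketch (not from the commit): field 5 of a UnicodeData.txt record
    # holds the decomposition mapping the script uses to find the plain base letter.
    sample = "00C1;LATIN CAPITAL LETTER A WITH ACUTE;Lu;0;L;0041 0301;;;;N;;;;00E1;"
    fields = sample.split(";")

    codepoint_id = int(fields[0], 16)   # 0x00C1, the precomposed letter
    general_category = fields[2]        # "Lu": an uppercase letter
    combining_ids = [int(s, 16) for s in fields[5].split(" ") if s != ""]
    # combining_ids == [0x0041, 0x0301]: base letter "A" plus a combining acute
    # accent, so the emitted rule maps U+00C1 to the plain letter "A".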
@@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):
     assert(is_ligature(codepoint, table))
     return [get_plain_letter(table[id], table) for id in codepoint.combining_ids]
 
-def main(expand_ligatures):
+def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
+    """Parse the XML file and return a set of tuples (src, trg), where "src"
+    is the original character and "trg" the substitute."""
+    charactersSet = set()
+
+    # RegEx to parse rules
+    rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
+
+    # construct tree from XML
+    transliterationTree = ET.parse(latinAsciiFilePath)
+    transliterationTreeRoot = transliterationTree.getroot()
+
+    for rule in transliterationTreeRoot.findall("./transforms/transform/tRule"):
+        matches = rulePattern.search(rule.text)
+
+        # The regular expression capture four groups corresponding
+        # to the characters.
+        #
+        # Group 1: plain "src" char. Empty if group 2 is not.
+        # Group 2: unicode-espaced "src" char (e.g. "\u0110"). Empty if group 1 is not.
+        #
+        # Group 3: plain "trg" char. Empty if group 4 is not.
+        # Group 4: plain "trg" char between quotes. Empty if group 3 is not.
+        if matches is not None:
+            src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
+            trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
+
+            # "'" and """ are escaped
+            trg = trg.replace("\\'", "'").replace('\\"', '"')
+
+            # the parser of unaccent only accepts non-whitespace characters
+            # for "src" and "trg" (see unaccent.c)
+            if not src.isspace() and not trg.isspace():
+                charactersSet.add((ord(src), trg))
+
+    return charactersSet
+
+def special_cases():
+    """Returns the special cases which are not handled by other methods"""
+    charactersSet = set()
+
+    # Cyrillic
+    charactersSet.add((0x0401, u"\u0415")) # CYRILLIC CAPITAL LETTER IO
+    charactersSet.add((0x0451, u"\u0435")) # CYRILLIC SMALL LETTER IO
+
+    # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
+    charactersSet.add((0x2103, u"\xb0C")) # DEGREE CELSIUS
+    charactersSet.add((0x2109, u"\xb0F")) # DEGREE FAHRENHEIT
+    charactersSet.add((0x2117, "(P)")) # SOUND RECORDING COPYRIGHT
+
+    return charactersSet
+
+def main(args):
     # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
     decomposition_type_pattern = re.compile(" *<[^>]*> *")
 
     table = {}
     all = []
 
+    # unordered set for ensure uniqueness
+    charactersSet = set()
+
+    # read file UnicodeData.txt
+    unicodeDataFile = open(args.unicodeDataFilePath, 'r')
+
     # read everything we need into memory
-    for line in sys.stdin.readlines():
+    for line in unicodeDataFile:
         fields = line.split(";")
         if len(fields) > 5:
             # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
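
To make the rule parsing above concrete, here is a small Python 2 sketch (not part of the commit) that applies the same rulePattern to a few illustrative rule strings of the "src → trg ;" shape the transliterator uses; the sample rules are invented for the example rather than quoted from Latin-ASCII.xml:

    # Python 2 sketch (not from the commit): how rulePattern splits a rule into
    # its "src" and "trg" parts, mirroring the code in the function above.
    import re

    rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')

    samples = [u"\u00c6 \u2192 AE ;",    # plain src, plain trg
               u"\\u0110 \u2192 D ;",    # unicode-escaped src, plain trg
               u"\u00ab \u2192 '<<' ;"]  # plain src, quoted trg

    for sample in samples:
        matches = rulePattern.search(sample)
        src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
        trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
        print (src + u"\t" + trg).encode("UTF-8")   # same tab-separated shape as print_record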
@@ -89,35 +160,34 @@ def main(expand_ligatures):
         if codepoint.general_category.startswith('L') and \
            len(codepoint.combining_ids) > 1:
             if is_letter_with_marks(codepoint, table):
-                print_record(codepoint.id,
-                             chr(get_plain_letter(codepoint, table).id))
-            elif expand_ligatures and is_ligature(codepoint, table):
-                print_record(codepoint.id,
+                charactersSet.add((codepoint.id,
+                                   chr(get_plain_letter(codepoint, table).id)))
+            elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
+                charactersSet.add((codepoint.id,
                              "".join(unichr(combining_codepoint.id)
                                      for combining_codepoint \
-                                     in get_plain_letters(codepoint, table)))
-
-    # some special cases
-    print_record(0x00d8, "O") # LATIN CAPITAL LETTER O WITH STROKE
-    print_record(0x00f8, "o") # LATIN SMALL LETTER O WITH STROKE
-    print_record(0x0110, "D") # LATIN CAPITAL LETTER D WITH STROKE
-    print_record(0x0111, "d") # LATIN SMALL LETTER D WITH STROKE
-    print_record(0x0131, "i") # LATIN SMALL LETTER DOTLESS I
-    print_record(0x0126, "H") # LATIN CAPITAL LETTER H WITH STROKE
-    print_record(0x0127, "h") # LATIN SMALL LETTER H WITH STROKE
-    print_record(0x0141, "L") # LATIN CAPITAL LETTER L WITH STROKE
-    print_record(0x0142, "l") # LATIN SMALL LETTER L WITH STROKE
-    print_record(0x0149, "'n") # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
-    print_record(0x0166, "T") # LATIN CAPITAL LETTER T WITH STROKE
-    print_record(0x0167, "t") # LATIN SMALL LETTER t WITH STROKE
-    print_record(0x0401, u"\u0415") # CYRILLIC CAPITAL LETTER IO
-    print_record(0x0451, u"\u0435") # CYRILLIC SMALL LETTER IO
-    if expand_ligatures:
-        print_record(0x00c6, "AE") # LATIN CAPITAL LETTER AE
-        print_record(0x00df, "ss") # LATIN SMALL LETTER SHARP S
-        print_record(0x00e6, "ae") # LATIN SMALL LETTER AE
-        print_record(0x0152, "OE") # LATIN CAPITAL LIGATURE OE
-        print_record(0x0153, "oe") # LATIN SMALL LIGATURE OE
+                                     in get_plain_letters(codepoint, table))))
+
+    # add CLDR Latin-ASCII characters
+    if not args.noLigaturesExpansion:
+        charactersSet |= parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
+        charactersSet |= special_cases()
+
+    # sort for more convenient display
+    charactersList = sorted(charactersSet, key=lambda characterPair: characterPair[0])
+
+    for characterPair in charactersList:
+        print_record(characterPair[0], characterPair[1])
 
 if __name__ == "__main__":
-    main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")
+    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
+    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
+    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
+    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
+    args = parser.parse_args()
+
+    if args.noLigaturesExpansion is False and args.latinAsciiFilePath is None:
+        sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.')
+        sys.exit(1)
+
+    main(args)
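
For reference, the new command-line interface defined above can be invoked roughly like this (illustrative file names, not taken from the commit; the data files are local copies of the resources referenced in [1] and [2]):

    python2 generate_unaccent_rules.py --unicode-data-file UnicodeData.txt --latin-ascii-file Latin-ASCII.xml > unaccent.rules
    python2 generate_unaccent_rules.py --unicode-data-file UnicodeData.txt --no-ligatures-expansion > unaccent.rules

Per the argument definitions, --unicode-data-file is always required; --latin-ascii-file is required only when --no-ligatures-expansion is absent, otherwise the script writes the error message above to stderr and exits with status 1.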
