Skip to content

Commit 3d59da9

Browse files
committed
unaccent: Make generate_unaccent_rules.py Python 3 compatible
Python 2 is still supported.

Author: Hugh Ranalli <hugh@whtc.ca>
Discussion: https://www.postgresql.org/message-id/CAAhbUMNyZ+PhNr_mQ=G161K0-hvbq13Tz2is9M3WK+yX9cQOCw@mail.gmail.com
1 parent d33faa2 commit 3d59da9

File tree

1 file changed

+24
-6
lines changed

1 file changed

+24
-6
lines changed

contrib/unaccent/generate_unaccent_rules.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#!/usr/bin/python2
1+
#!/usr/bin/python
22
# -*- coding: utf-8 -*-
33
#
44
# This script builds unaccent.rules on standard output when given the
@@ -23,6 +23,24 @@
2323
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
2424
# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
2525

26+
# BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
27+
# The approach is to be Python3 compatible with Python2 "backports".
28+
from __future__ import print_function
29+
from __future__ import unicode_literals
30+
import codecs
31+
import sys
32+
33+
if sys.version_info[0] <= 2:
34+
# Encode stdout as UTF-8, so we can just print to it
35+
sys.stdout = codecs.getwriter('utf8')(sys.stdout)
36+
37+
# Map Python 2's chr to unichr
38+
chr = unichr
39+
40+
# Python 2 and 3 compatible bytes call
41+
def bytes(source, encoding='ascii', errors='strict'):
42+
return source.encode(encoding=encoding, errors=errors)
43+
# END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
2644

2745
import re
2846
import argparse
@@ -39,7 +57,7 @@
3957
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
4058

4159
def print_record(codepoint, letter):
42-
print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
60+
print (chr(codepoint) + "\t" + letter)
4361

4462
class Codepoint:
4563
def __init__(self, id, general_category, combining_ids):
@@ -116,7 +134,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
116134
charactersSet = set()
117135

118136
# RegEx to parse rules
119-
rulePattern = re.compile(ur'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
137+
rulePattern = re.compile(r'^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;')
120138

121139
# construct tree from XML
122140
transliterationTree = ET.parse(latinAsciiFilePath)
@@ -134,7 +152,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
134152
# Group 3: plain "trg" char. Empty if group 4 is not.
135153
# Group 4: plain "trg" char between quotes. Empty if group 3 is not.
136154
if matches is not None:
137-
src = matches.group(1) if matches.group(1) is not None else matches.group(2).decode('unicode-escape')
155+
src = matches.group(1) if matches.group(1) is not None else bytes(matches.group(2), 'UTF-8').decode('unicode-escape')
138156
trg = matches.group(3) if matches.group(3) is not None else matches.group(4)
139157

140158
# "'" and """ are escaped
@@ -195,10 +213,10 @@ def main(args):
195213
len(codepoint.combining_ids) > 1:
196214
if is_letter_with_marks(codepoint, table):
197215
charactersSet.add((codepoint.id,
198-
unichr(get_plain_letter(codepoint, table).id)))
216+
chr(get_plain_letter(codepoint, table).id)))
199217
elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
200218
charactersSet.add((codepoint.id,
201-
"".join(unichr(combining_codepoint.id)
219+
"".join(chr(combining_codepoint.id)
202220
for combining_codepoint \
203221
in get_plain_letters(codepoint, table))))
204222

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy