Skip to content

Commit 456e371

Browse files
committed
Add combining characters to unaccent.rules.
Strip certain classes of combining characters, so that accents encoded this way are removed. Author: Hugh Ranalli Discussion: https://postgr.es/m/15548-cef1b3f8de190d4f%40postgresql.org
1 parent 80579f9 commit 456e371

File tree

4 files changed

+157
-1
lines changed

4 files changed

+157
-1
lines changed

contrib/unaccent/expected/unaccent.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,12 @@ SELECT unaccent('˃˖˗˜');
3131
>+-~
3232
(1 row)
3333

34+
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
35+
unaccent
36+
----------
37+
A
38+
(1 row)
39+
3440
SELECT unaccent('unaccent', 'foobar');
3541
unaccent
3642
----------
@@ -55,6 +61,12 @@ SELECT unaccent('unaccent', '˃˖˗˜');
5561
>+-~
5662
(1 row)
5763

64+
SELECT unaccent('unaccent', 'À');
65+
unaccent
66+
----------
67+
A
68+
(1 row)
69+
5870
SELECT ts_lexize('unaccent', 'foobar');
5971
ts_lexize
6072
-----------
@@ -79,3 +91,9 @@ SELECT ts_lexize('unaccent', '˃˖˗˜');
7991
{>+-~}
8092
(1 row)
8193

94+
SELECT ts_lexize('unaccent', 'À');
95+
ts_lexize
96+
-----------
97+
{A}
98+
(1 row)
99+

contrib/unaccent/generate_unaccent_rules.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,42 @@ def bytes(source, encoding='ascii', errors='strict'):
6161
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
6262
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
6363

64+
# Combining marks follow a "base" character, and result in a composite
65+
# character. Example: "U&'A\0300'" produces "À". There are three types of
66+
# combining marks: enclosing (Me), non-spacing combining (Mn), spacing
67+
# combining (Mc). We identify the ranges of marks we feel safe removing.
68+
# References:
69+
# https://en.wikipedia.org/wiki/Combining_character
70+
# https://www.unicode.org/charts/PDF/U0300.pdf
71+
# https://www.unicode.org/charts/PDF/U20D0.pdf
72+
COMBINING_MARK_RANGES = ((0x0300, 0x0362), # Mn: Accents, IPA
73+
(0x20dd, 0x20E0), # Me: Symbols
74+
(0x20e2, 0x20e4),) # Me: Screen, keycap, triangle
75+
6476
def print_record(codepoint, letter):
65-
print (chr(codepoint) + "\t" + letter)
77+
if letter:
78+
output = chr(codepoint) + "\t" + letter
79+
else:
80+
output = chr(codepoint)
81+
82+
print(output)
6683

6784
class Codepoint:
6885
def __init__(self, id, general_category, combining_ids):
6986
self.id = id
7087
self.general_category = general_category
7188
self.combining_ids = combining_ids
7289

90+
def is_mark_to_remove(codepoint):
91+
"""Return true if this is a combining mark to remove."""
92+
if not is_mark(codepoint):
93+
return False
94+
95+
for begin, end in COMBINING_MARK_RANGES:
96+
if codepoint.id >= begin and codepoint.id <= end:
97+
return True
98+
return False
99+
73100
def is_plain_letter(codepoint):
74101
"""Return true if codepoint represents a "plain letter"."""
75102
for begin, end in PLAIN_LETTER_RANGES:
@@ -234,6 +261,8 @@ def main(args):
234261
"".join(chr(combining_codepoint.id)
235262
for combining_codepoint \
236263
in get_plain_letters(codepoint, table))))
264+
elif is_mark_to_remove(codepoint):
265+
charactersSet.add((codepoint.id, None))
237266

238267
# add CLDR Latin-ASCII characters
239268
if not args.noLigaturesExpansion:

contrib/unaccent/sql/unaccent.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,16 @@ SELECT unaccent('foobar');
99
SELECT unaccent('ёлка');
1010
SELECT unaccent('ЁЖИК');
1111
SELECT unaccent('˃˖˗˜');
12+
SELECT unaccent('À'); -- Remove combining diacritical 0x0300
1213

1314
SELECT unaccent('unaccent', 'foobar');
1415
SELECT unaccent('unaccent', 'ёлка');
1516
SELECT unaccent('unaccent', 'ЁЖИК');
1617
SELECT unaccent('unaccent', '˃˖˗˜');
18+
SELECT unaccent('unaccent', 'À');
1719

1820
SELECT ts_lexize('unaccent', 'foobar');
1921
SELECT ts_lexize('unaccent', 'ёлка');
2022
SELECT ts_lexize('unaccent', 'ЁЖИК');
2123
SELECT ts_lexize('unaccent', '˃˖˗˜');
24+
SELECT ts_lexize('unaccent', 'À');

contrib/unaccent/unaccent.rules

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,105 @@
414414
˖ +
415415
˗ -
416416
˜ ~
417+
̀
418+
́
419+
̂
420+
̃
421+
̄
422+
̅
423+
̆
424+
̇
425+
̈
426+
̉
427+
̊
428+
̋
429+
̌
430+
̍
431+
̎
432+
̏
433+
̐
434+
̑
435+
̒
436+
̓
437+
̔
438+
̕
439+
̖
440+
̗
441+
̘
442+
̙
443+
̚
444+
̛
445+
̜
446+
̝
447+
̞
448+
̟
449+
̠
450+
̡
451+
̢
452+
̣
453+
̤
454+
̥
455+
̦
456+
̧
457+
̨
458+
̩
459+
̪
460+
̫
461+
̬
462+
̭
463+
̮
464+
̯
465+
̰
466+
̱
467+
̲
468+
̳
469+
̴
470+
̵
471+
̶
472+
̷
473+
̸
474+
̹
475+
̺
476+
̻
477+
̼
478+
̽
479+
̾
480+
̿
481+
̀
482+
́
483+
͂
484+
̓
485+
̈́
486+
ͅ
487+
͆
488+
͇
489+
͈
490+
͉
491+
͊
492+
͋
493+
͌
494+
͍
495+
͎
496+
͏
497+
͐
498+
͑
499+
͒
500+
͓
501+
͔
502+
͕
503+
͖
504+
͗
505+
͘
506+
͙
507+
͚
508+
͛
509+
͜
510+
͝
511+
͞
512+
͟
513+
͠
514+
͡
515+
͢
417516
Ά Α
418517
Έ Ε
419518
Ή Η
@@ -982,6 +1081,13 @@
9821081
₧ Pts
9831082
₹ Rs
9841083
₺ TL
1084+
1085+
1086+
1087+
1088+
1089+
1090+
9851091
℀ a/c
9861092
℁ a/s
9871093
ℂ C

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy