Skip to content

Commit ec0a69e

Browse files
committed
Extend the default rules file for contrib/unaccent with Vietnamese letters.
Improve generate_unaccent_rules.py to handle composed characters whose base is another composed character rather than a plain letter. The net effect of this is to add a bunch of multi-accented Vietnamese characters to unaccent.rules.

Original complaint from Kha Nguyen, diagnosis of the script's shortcoming by Thomas Munro.

Dang Minh Huong and Michael Paquier

Discussion: https://postgr.es/m/CALo3sF6EC8cy1F2JUz=GRf5h4LMUJTaG3qpdoiLrNbWEXL-tRg@mail.gmail.com
1 parent 2b74303 commit ec0a69e

File tree

2 files changed

+145
-8
lines changed

2 files changed

+145
-8
lines changed

contrib/unaccent/generate_unaccent_rules.py

Lines changed: 31 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -48,24 +48,47 @@ def is_mark(codepoint):
4848
return codepoint.general_category in ("Mn", "Me", "Mc")
4949

5050
def is_letter_with_marks(codepoint, table):
51-
"""Returns true for plain letters combined with one or more marks."""
51+
"""Returns true for letters combined with one or more marks."""
5252
# See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
53-
return len(codepoint.combining_ids) > 1 and \
54-
is_plain_letter(table[codepoint.combining_ids[0]]) and \
55-
all(is_mark(table[i]) for i in codepoint.combining_ids[1:])
53+
54+
# Letter may have no combining characters, in which case it has
55+
# no marks.
56+
if len(codepoint.combining_ids) == 1:
57+
return False
58+
59+
# A letter without diacritical marks has none of them.
60+
if any(is_mark(table[i]) for i in codepoint.combining_ids[1:]) is False:
61+
return False
62+
63+
# Check if the base letter of this letter has marks.
64+
codepoint_base = codepoint.combining_ids[0]
65+
if (is_plain_letter(table[codepoint_base]) is False and \
66+
is_letter_with_marks(table[codepoint_base], table) is False):
67+
return False
68+
69+
return True
5670

5771
def is_letter(codepoint, table):
5872
"""Return true for letter with or without diacritical marks."""
5973
return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
6074

6175
def get_plain_letter(codepoint, table):
62-
"""Return the base codepoint without marks."""
76+
"""Return the base codepoint without marks. If this codepoint has more
77+
than one combining character, do a recursive lookup on the table to
78+
find out its plain base letter."""
6379
if is_letter_with_marks(codepoint, table):
64-
return table[codepoint.combining_ids[0]]
80+
if len(table[codepoint.combining_ids[0]].combining_ids) > 1:
81+
return get_plain_letter(table[codepoint.combining_ids[0]], table)
82+
elif is_plain_letter(table[codepoint.combining_ids[0]]):
83+
return table[codepoint.combining_ids[0]]
84+
85+
# Should not come here
86+
assert(False)
6587
elif is_plain_letter(codepoint):
6688
return codepoint
67-
else:
68-
raise "mu"
89+
90+
# Should not come here
91+
assert(False)
6992

7093
def is_ligature(codepoint, table):
7194
"""Return true for letters combined with letters."""

contrib/unaccent/unaccent.rules

Lines changed: 114 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -254,6 +254,18 @@
254254
ǒ o
255255
Ǔ U
256256
ǔ u
257+
Ǖ U
258+
ǖ u
259+
Ǘ U
260+
ǘ u
261+
Ǚ U
262+
ǚ u
263+
Ǜ U
264+
ǜ u
265+
Ǟ A
266+
ǟ a
267+
Ǡ A
268+
ǡ a
257269
Ǥ G
258270
ǥ g
259271
Ǧ G
@@ -262,6 +274,8 @@
262274
ǩ k
263275
Ǫ O
264276
ǫ o
277+
Ǭ O
278+
ǭ o
265279
ǰ j
266280
DZ DZ
267281
Dz Dz
@@ -270,6 +284,8 @@
270284
ǵ g
271285
Ǹ N
272286
ǹ n
287+
Ǻ A
288+
ǻ a
273289
Ȁ A
274290
ȁ a
275291
Ȃ A
@@ -307,8 +323,14 @@
307323
ȧ a
308324
Ȩ E
309325
ȩ e
326+
Ȫ O
327+
ȫ o
328+
Ȭ O
329+
ȭ o
310330
Ȯ O
311331
ȯ o
332+
Ȱ O
333+
ȱ o
312334
Ȳ Y
313335
ȳ y
314336
ȴ l
@@ -441,6 +463,8 @@
441463
ḅ b
442464
Ḇ B
443465
ḇ b
466+
Ḉ C
467+
ḉ c
444468
Ḋ D
445469
ḋ d
446470
Ḍ D
@@ -451,10 +475,16 @@
451475
ḑ d
452476
Ḓ D
453477
ḓ d
478+
Ḕ E
479+
ḕ e
480+
Ḗ E
481+
ḗ e
454482
Ḙ E
455483
ḙ e
456484
Ḛ E
457485
ḛ e
486+
Ḝ E
487+
ḝ e
458488
Ḟ F
459489
ḟ f
460490
Ḡ G
@@ -471,6 +501,8 @@
471501
ḫ h
472502
Ḭ I
473503
ḭ i
504+
Ḯ I
505+
ḯ i
474506
Ḱ K
475507
ḱ k
476508
Ḳ K
@@ -479,6 +511,8 @@
479511
ḵ k
480512
Ḷ L
481513
ḷ l
514+
Ḹ L
515+
ḹ l
482516
Ḻ L
483517
ḻ l
484518
Ḽ L
@@ -497,6 +531,14 @@
497531
ṉ n
498532
Ṋ N
499533
ṋ n
534+
Ṍ O
535+
ṍ o
536+
Ṏ O
537+
ṏ o
538+
Ṑ O
539+
ṑ o
540+
Ṓ O
541+
ṓ o
500542
Ṕ P
501543
ṕ p
502544
Ṗ P
@@ -505,12 +547,20 @@
505547
ṙ r
506548
Ṛ R
507549
ṛ r
550+
Ṝ R
551+
ṝ r
508552
Ṟ R
509553
ṟ r
510554
Ṡ S
511555
ṡ s
512556
Ṣ S
513557
ṣ s
558+
Ṥ S
559+
ṥ s
560+
Ṧ S
561+
ṧ s
562+
Ṩ S
563+
ṩ s
514564
Ṫ T
515565
ṫ t
516566
Ṭ T
@@ -525,6 +575,10 @@
525575
ṵ u
526576
Ṷ U
527577
ṷ u
578+
Ṹ U
579+
ṹ u
580+
Ṻ U
581+
ṻ u
528582
Ṽ V
529583
ṽ v
530584
Ṿ V
@@ -563,12 +617,42 @@
563617
ạ a
564618
Ả A
565619
ả a
620+
Ấ A
621+
ấ a
622+
Ầ A
623+
ầ a
624+
Ẩ A
625+
ẩ a
626+
Ẫ A
627+
ẫ a
628+
Ậ A
629+
ậ a
630+
Ắ A
631+
ắ a
632+
Ằ A
633+
ằ a
634+
Ẳ A
635+
ẳ a
636+
Ẵ A
637+
ẵ a
638+
Ặ A
639+
ặ a
566640
Ẹ E
567641
ẹ e
568642
Ẻ E
569643
ẻ e
570644
Ẽ E
571645
ẽ e
646+
Ế E
647+
ế e
648+
Ề E
649+
ề e
650+
Ể E
651+
ể e
652+
Ễ E
653+
ễ e
654+
Ệ E
655+
ệ e
572656
Ỉ I
573657
ỉ i
574658
Ị I
@@ -577,10 +661,40 @@
577661
ọ o
578662
Ỏ O
579663
ỏ o
664+
Ố O
665+
ố o
666+
Ồ O
667+
ồ o
668+
Ổ O
669+
ổ o
670+
Ỗ O
671+
ỗ o
672+
Ộ O
673+
ộ o
674+
Ớ O
675+
ớ o
676+
Ờ O
677+
ờ o
678+
Ở O
679+
ở o
680+
Ỡ O
681+
ỡ o
682+
Ợ O
683+
ợ o
580684
Ụ U
581685
ụ u
582686
Ủ U
583687
ủ u
688+
Ứ U
689+
ứ u
690+
Ừ U
691+
ừ u
692+
Ử U
693+
ử u
694+
Ữ U
695+
ữ u
696+
Ự U
697+
ự u
584698
Ỳ Y
585699
ỳ y
586700
Ỵ Y

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy