Skip to content

Commit 5e8d670

Browse files
committed
Add Greek characters to unaccent.rules.
Author: Tasos Maschalidis
Reviewed-by: Michael Paquier, Tom Lane
Discussion: https://postgr.es/m/153495048900.1368.11566580687623014380%40wrigleys.postgresql.org
Discussion: https://postgr.es/m/VI1PR01MB38537EBD529FE5EE3FE9A5FEB5370%40VI1PR01MB3853.eurprd01.prod.exchangelabs.com
1 parent ec74369 commit 5e8d670

File tree

2 files changed

+236
-4
lines changed

2 files changed

+236
-4
lines changed

contrib/unaccent/generate_unaccent_rules.py

Lines changed: 15 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,15 @@
2929
import sys
3030
import xml.etree.ElementTree as ET
3131

32+
# The ranges of Unicode characters that we consider to be "plain letters".
33+
# For now we are being conservative by including only Latin and Greek. This
34+
# could be extended in future based on feedback from people with relevant
35+
# language knowledge.
36+
PLAIN_LETTER_RANGES = ((ord('a'), ord('z')), # Latin lower case
37+
(ord('A'), ord('Z')), # Latin upper case
38+
(0x03b1, 0x03c9), # GREEK SMALL LETTER ALPHA, GREEK SMALL LETTER OMEGA
39+
(0x0391, 0x03a9)) # GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
40+
3241
def print_record(codepoint, letter):
3342
print (unichr(codepoint) + "\t" + letter).encode("UTF-8")
3443

@@ -39,9 +48,11 @@ def __init__(self, id, general_category, combining_ids):
3948
self.combining_ids = combining_ids
4049

4150
def is_plain_letter(codepoint):
42-
"""Return true if codepoint represents a plain ASCII letter."""
43-
return (codepoint.id >= ord('a') and codepoint.id <= ord('z')) or \
44-
(codepoint.id >= ord('A') and codepoint.id <= ord('Z'))
51+
"""Return true if codepoint represents a "plain letter"."""
52+
for begin, end in PLAIN_LETTER_RANGES:
53+
if codepoint.id >= begin and codepoint.id <= end:
54+
return True
55+
return False
4556

4657
def is_mark(codepoint):
4758
"""Returns true for diacritical marks (combining codepoints)."""
@@ -184,7 +195,7 @@ def main(args):
184195
len(codepoint.combining_ids) > 1:
185196
if is_letter_with_marks(codepoint, table):
186197
charactersSet.add((codepoint.id,
187-
chr(get_plain_letter(codepoint, table).id)))
198+
unichr(get_plain_letter(codepoint, table).id)))
188199
elif args.noLigaturesExpansion is False and is_ligature(codepoint, table):
189200
charactersSet.add((codepoint.id,
190201
"".join(unichr(combining_codepoint.id)

contrib/unaccent/unaccent.rules

Lines changed: 221 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -399,6 +399,26 @@
399399
ʦ ts
400400
ʪ ls
401401
ʫ lz
402+
Ά Α
403+
Έ Ε
404+
Ή Η
405+
Ί Ι
406+
Ό Ο
407+
Ύ Υ
408+
Ώ Ω
409+
ΐ ι
410+
Ϊ Ι
411+
Ϋ Υ
412+
ά α
413+
έ ε
414+
ή η
415+
ί ι
416+
ΰ υ
417+
ϊ ι
418+
ϋ υ
419+
ό ο
420+
ύ υ
421+
ώ ω
402422
Ё Е
403423
ё е
404424
ᴀ A
@@ -709,6 +729,207 @@
709729
ỽ v
710730
Ỿ Y
711731
ỿ y
732+
ἀ α
733+
ἁ α
734+
ἂ α
735+
ἃ α
736+
ἄ α
737+
ἅ α
738+
ἆ α
739+
ἇ α
740+
Ἀ Α
741+
Ἁ Α
742+
Ἂ Α
743+
Ἃ Α
744+
Ἄ Α
745+
Ἅ Α
746+
Ἆ Α
747+
Ἇ Α
748+
ἐ ε
749+
ἑ ε
750+
ἒ ε
751+
ἓ ε
752+
ἔ ε
753+
ἕ ε
754+
Ἐ Ε
755+
Ἑ Ε
756+
Ἒ Ε
757+
Ἓ Ε
758+
Ἔ Ε
759+
Ἕ Ε
760+
ἠ η
761+
ἡ η
762+
ἢ η
763+
ἣ η
764+
ἤ η
765+
ἥ η
766+
ἦ η
767+
ἧ η
768+
Ἠ Η
769+
Ἡ Η
770+
Ἢ Η
771+
Ἣ Η
772+
Ἤ Η
773+
Ἥ Η
774+
Ἦ Η
775+
Ἧ Η
776+
ἰ ι
777+
ἱ ι
778+
ἲ ι
779+
ἳ ι
780+
ἴ ι
781+
ἵ ι
782+
ἶ ι
783+
ἷ ι
784+
Ἰ Ι
785+
Ἱ Ι
786+
Ἲ Ι
787+
Ἳ Ι
788+
Ἴ Ι
789+
Ἵ Ι
790+
Ἶ Ι
791+
Ἷ Ι
792+
ὀ ο
793+
ὁ ο
794+
ὂ ο
795+
ὃ ο
796+
ὄ ο
797+
ὅ ο
798+
Ὀ Ο
799+
Ὁ Ο
800+
Ὂ Ο
801+
Ὃ Ο
802+
Ὄ Ο
803+
Ὅ Ο
804+
ὐ υ
805+
ὑ υ
806+
ὒ υ
807+
ὓ υ
808+
ὔ υ
809+
ὕ υ
810+
ὖ υ
811+
ὗ υ
812+
Ὑ Υ
813+
Ὓ Υ
814+
Ὕ Υ
815+
Ὗ Υ
816+
ὠ ω
817+
ὡ ω
818+
ὢ ω
819+
ὣ ω
820+
ὤ ω
821+
ὥ ω
822+
ὦ ω
823+
ὧ ω
824+
Ὠ Ω
825+
Ὡ Ω
826+
Ὢ Ω
827+
Ὣ Ω
828+
Ὤ Ω
829+
Ὥ Ω
830+
Ὦ Ω
831+
Ὧ Ω
832+
ὰ α
833+
ὲ ε
834+
ὴ η
835+
ὶ ι
836+
ὸ ο
837+
ὺ υ
838+
ὼ ω
839+
ᾀ α
840+
ᾁ α
841+
ᾂ α
842+
ᾃ α
843+
ᾄ α
844+
ᾅ α
845+
ᾆ α
846+
ᾇ α
847+
ᾈ Α
848+
ᾉ Α
849+
ᾊ Α
850+
ᾋ Α
851+
ᾌ Α
852+
ᾍ Α
853+
ᾎ Α
854+
ᾏ Α
855+
ᾐ η
856+
ᾑ η
857+
ᾒ η
858+
ᾓ η
859+
ᾔ η
860+
ᾕ η
861+
ᾖ η
862+
ᾗ η
863+
ᾘ Η
864+
ᾙ Η
865+
ᾚ Η
866+
ᾛ Η
867+
ᾜ Η
868+
ᾝ Η
869+
ᾞ Η
870+
ᾟ Η
871+
ᾠ ω
872+
ᾡ ω
873+
ᾢ ω
874+
ᾣ ω
875+
ᾤ ω
876+
ᾥ ω
877+
ᾦ ω
878+
ᾧ ω
879+
ᾨ Ω
880+
ᾩ Ω
881+
ᾪ Ω
882+
ᾫ Ω
883+
ᾬ Ω
884+
ᾭ Ω
885+
ᾮ Ω
886+
ᾯ Ω
887+
ᾰ α
888+
ᾱ α
889+
ᾲ α
890+
ᾳ α
891+
ᾴ α
892+
ᾶ α
893+
ᾷ α
894+
Ᾰ Α
895+
Ᾱ Α
896+
Ὰ Α
897+
ᾼ Α
898+
ῂ η
899+
ῃ η
900+
ῄ η
901+
ῆ η
902+
ῇ η
903+
Ὲ Ε
904+
Ὴ Η
905+
ῌ Η
906+
ῐ ι
907+
ῑ ι
908+
ῒ ι
909+
ῖ ι
910+
ῗ ι
911+
Ῐ Ι
912+
Ῑ Ι
913+
Ὶ Ι
914+
ῠ υ
915+
ῡ υ
916+
ῢ υ
917+
ῤ ρ
918+
ῥ ρ
919+
ῦ υ
920+
ῧ υ
921+
Ῠ Υ
922+
Ῡ Υ
923+
Ὺ Υ
924+
Ῥ Ρ
925+
ῲ ω
926+
ῳ ω
927+
ῴ ω
928+
ῶ ω
929+
ῷ ω
930+
Ὸ Ο
931+
Ὼ Ω
932+
ῼ Ω
712933
‐ -
713934
‑ -
714935
‒ -

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy