diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 7b2baef..01eb38f 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ The :py:mod:`nameparser.config` module manages the configuration of the -nameparser. +nameparser. A module-level instance of :py:class:`~nameparser.config.Constants` is created and used by default for all HumanName instances. You can adjust the entire module's @@ -25,7 +25,7 @@ >>> hn.parse_full_name() # need to run this again after config changes **Potential Gotcha**: If you do not pass ``None`` as the second argument, -``hn.C`` will be a reference to the module config, possibly yielding +``hn.C`` will be a reference to the module config, possibly yielding unexpected results. See `Customizing the Parser `_. """ from __future__ import unicode_literals @@ -57,7 +57,7 @@ class SetManager(Set): Only special functionality beyond that provided by set() is to normalize constants for comparison (lower case, no periods) - when they are add()ed and remove()d and allow passing multiple + when they are add()ed and remove()d and allow passing multiple string arguments to the :py:func:`add()` and :py:func:`remove()` methods. ''' @@ -125,7 +125,7 @@ def remove(self, *strings): class TupleManager(dict): ''' - A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants + A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants more friendly. ''' @@ -148,23 +148,23 @@ class Constants(object): """ An instance of this class hold all of the configuration constants for the parser. - :param set prefixes: + :param set prefixes: :py:attr:`prefixes` wrapped with :py:class:`SetManager`. - :param set titles: + :param set titles: :py:attr:`titles` wrapped with :py:class:`SetManager`. - :param set first_name_titles: + :param set first_name_titles: :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. 
- :param set suffix_acronyms: + :param set suffix_acronyms: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_not_acronyms: + :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set conjunctions: + :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. :type capitalization_exceptions: tuple or dict - :param capitalization_exceptions: + :param capitalization_exceptions: :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`. :type regexes: tuple or dict - :param regexes: + :param regexes: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ @@ -187,9 +187,9 @@ class Constants(object): empty_attribute_default = '' """ Default return value for empty attributes. - + .. doctest:: - + >>> from nameparser.config import CONSTANTS >>> CONSTANTS.empty_attribute_default = None >>> name = HumanName("John Doe") @@ -197,7 +197,7 @@ class Constants(object): None >>>name.first 'John' - + """ capitalize_name = False @@ -231,6 +231,11 @@ class Constants(object): """ + try_russian_name_specifics = False + """ + If set, the parser will attempt to parse names in the Russian order (Last First Middle) + """ + def __init__(self, prefixes=PREFIXES, suffix_acronyms=SUFFIX_ACRONYMS, @@ -239,7 +244,8 @@ def __init__(self, first_name_titles=FIRST_NAME_TITLES, conjunctions=CONJUNCTIONS, capitalization_exceptions=CAPITALIZATION_EXCEPTIONS, - regexes=REGEXES + regexes=REGEXES, + try_russian_name_specifics=False, ): self.prefixes = SetManager(prefixes) self.suffix_acronyms = SetManager(suffix_acronyms) @@ -249,6 +255,7 @@ def __init__(self, self.conjunctions = SetManager(conjunctions) self.capitalization_exceptions = TupleManager(capitalization_exceptions) self.regexes = TupleManager(regexes) + self.try_russian_name_specifics = try_russian_name_specifics self._pst = None @property diff --git 
a/nameparser/config/regexes.py b/nameparser/config/regexes.py index bd4b320..4ba7155 100644 --- a/nameparser/config/regexes.py +++ b/nameparser/config/regexes.py @@ -8,14 +8,14 @@ re_emoji = re.compile('[' '\U0001F300-\U0001F64F' '\U0001F680-\U0001F6FF' - '\u2600-\u26FF\u2700-\u27BF]+', + '\u2600-\u26FF\u2700-\u27BF]+', re.UNICODE) except re.error: # Narrow UCS-2 build re_emoji = re.compile('(' '\ud83c[\udf00-\udfff]|' '\ud83d[\udc00-\ude4f\ude80-\udeff]|' - '[\u2600-\u26FF\u2700-\u27BF])+', + '[\u2600-\u26FF\u2700-\u27BF])+', re.UNICODE) REGEXES = set([ @@ -31,6 +31,12 @@ ("period_not_at_end",re.compile(r'.*\..+$', re.I | re.U)), ("emoji",re_emoji), ("phd", re.compile(r'\s(ph\.?\s+d\.?)', re.I | re.U)), + ("russian_last_name_endings", re.compile(r'^.+(ov|ova|ev|eva|yov|yova|in|yn|ina|sky|skaya|ich|ych|uk|yuk|yk|ko|ak|ukh|ykh|ikh|chuk|yy|yi|oy|oi|iy|ii)$', re.I | re.U)), + ("russian_last_name_endings_cyrillic", re.compile(r'^.+(ов|ова|ев|ева|ёв|ёва|ин|ын|ина|ский|ская|цкая|цкий|ич|ыч|ук|юк|ык|ко|ак|ух|ых|их|чук|ый|ой|ий)$', re.I | re.U)), + ("russian_patronymic_endings", re.compile(r'^(.+(ovich|ovna|evich|evna|ichna))|(ilyich|kuzmich|lukich|fomich|fokich)$', re.I | re.U)), + ("russian_patronymic_endings_cyrillic", re.compile(r'^(.+(ович|овна|евич|евна|ична))|(ильич|кузьмич|лукич|фомич|фокич)$', re.I | re.U)), + ("turkic_patronymic_suffixes", re.compile(r'^(oglu|ogly|qizi|kizi|kyzy|gyzy|uly|uulu)$', re.I | re.U)), + ("turkic_patronymic_suffixes_cyrillic", re.compile(r'^(оглу|оглы|кызы|гызы|улы|уулу)$', re.I | re.U)), ]) """ All regular expressions used by the parser are precompiled and stored in the config. diff --git a/nameparser/parser.py b/nameparser/parser.py index a5eb352..ab50e61 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -512,6 +512,8 @@ def post_process(self): and :py:func:`handle_capitalization`. 
def _fully_matches(pattern, piece):
    """
    Return ``True`` only when ``pattern`` matches the whole of ``piece``.

    The patronymic patterns in the config are written as ``^(A)|(B)$``, so
    ``^`` belongs to the first alternative only and ``$`` to the second only.
    A plain ``pattern.match()`` therefore accepts any string that merely
    *contains* a patronymic ending after its first character. Requiring the
    match to end at the end of the string restores the intended meaning.

    Empty or ``None`` pieces never match. Attributes may be ``None`` when
    ``empty_attribute_default`` is configured that way, and passing ``None``
    to ``pattern.match()`` would raise ``TypeError``.

    :param pattern: a compiled regular expression.
    :param piece: the string to test; may be empty or ``None``.
    :rtype: bool
    """
    if not piece:
        return False
    found = pattern.match(piece)
    return found is not None and found.end() == len(piece)


def is_turkic_patronymic(self, piece):
    """
    Is ``piece`` a patronymic suffix of Turkic origin (e.g. "Ogly", "Gyzy"),
    in Latin or Cyrillic script?

    :param piece: a single name piece; may be empty or ``None``.
    :rtype: bool
    """
    regexes = self.C.regexes
    return (
        _fully_matches(regexes.turkic_patronymic_suffixes, piece)
        or _fully_matches(regexes.turkic_patronymic_suffixes_cyrillic, piece)
    )


def handle_russian_name_specifics(self):
    """
    Re-order a name that was written in the Russian order, "Last First
    Middle" (without a comma), into first / middle / last.

    The name is taken to be in Last-First-Middle order when any of these
    holds:

    * the parsed first name looks like a Russian last name. This misfires on
      names such as Martin, Franklin or Benjamin, which is why the whole
      step is opt-in;
    * the parsed last name looks like a Russian patronymic while the parsed
      middle name does not. In "Roman Alexeevich Abramovich" both do, so
      that name is left in First Middle Last order. This misfires when there
      is no patronymic and the last name is foreign;
    * the parsed last name is a Turkic patronymic suffix (e.g. "Ogly").

    Works on ``first_list`` / ``middle_list`` / ``last_list`` in place and
    returns nothing.
    """
    last_is_turkic = self.is_turkic_patronymic(self.last)
    is_name_order_lfm = (
        self.is_russian_last_name(self.first)
        or (
            self.is_russian_patronymic(self.last)
            and not self.is_russian_patronymic(self.middle)
        )
        or last_is_turkic
    )

    # A Russian middle name is a one-word patronymic, so more than one middle
    # piece means part of a multi-word last name (or of a two-piece Turkic
    # patronymic such as "Said Ogly") ended up in the wrong list.
    if len(self.middle_list) > 1:
        if is_name_order_lfm and last_is_turkic:
            # "Ahmedov Oktay Said Ogly": "Said" belongs with "Ogly".
            self.last_list = self.middle_list[1:] + self.last_list
            self.middle_list = [self.middle_list[0]]
        elif is_name_order_lfm:
            # The last name came first and spilled into the middle pieces:
            # give all but the final middle piece back to it. Both lists are
            # rotated into their proper attributes below.
            self.first_list += self.middle_list[:-1]
            self.middle_list = [self.middle_list[-1]]
        elif not self.is_turkic_patronymic(self.middle_list[-1]):
            # The last name came last: everything after the first middle
            # piece is really part of it. A trailing Turkic suffix means the
            # middle pieces already form the patronymic, so leave them alone.
            self.last_list = self.middle_list[1:] + self.last_list
            self.middle_list = [self.middle_list[0]]

    if is_name_order_lfm:
        if self.middle:
            # Parsed as (last, first, middle): rotate into place.
            self.first, self.middle, self.last = (
                self.middle, self.last, self.first)
        else:
            # Only two parts, "Last First": swap them.
            self.first, self.last = self.last, self.first


def is_russian_last_name(self, piece):
    """
    Does ``piece`` end in a typical Slavic surname suffix (Latin or
    Cyrillic)?

    A few given names that end the same way (Lev, Eva, Nina, ...) are
    excluded explicitly.

    :param piece: a single name piece; may be empty or ``None``.
    :rtype: bool
    """
    if not piece:
        return False
    # Given names that would otherwise match the surname endings.
    if piece.lower() in ('lev', 'eva', 'yacov', 'yakov', 'veniamin',
                         'lyubov', 'lubov', 'nina',
                         'лев', 'ева', 'яков', 'вениамин',
                         'нина'):
        return False
    regexes = self.C.regexes
    return (
        _fully_matches(regexes.russian_last_name_endings, piece)
        or _fully_matches(regexes.russian_last_name_endings_cyrillic, piece)
    )


def is_russian_patronymic(self, piece):
    """
    Does ``piece`` look like a Russian patronymic (e.g. "Sergeevich",
    "Ivanovna", "Ilyich"), in Latin or Cyrillic script?

    :param piece: a single name piece; may be empty or ``None``.
    :rtype: bool
    """
    regexes = self.C.regexes
    return (
        _fully_matches(regexes.russian_patronymic_endings, piece)
        or _fully_matches(regexes.russian_patronymic_endings_cyrillic, piece)
    )
constants=self.C) + self.m(hn.first, "Alexey", hn) + self.m(hn.middle, "Richardovich", hn) + self.m(hn.last, "Olurombi Akinwale", hn) + + def test_russian_name_specific_order_with_african_origin(self): + hn = HumanName("Olurombi Akinwale Alexey Richardovich", constants=self.C) + self.m(hn.first, "Alexey", hn) + self.m(hn.middle, "Richardovich", hn) + self.m(hn.last, "Olurombi Akinwale", hn) + + def test_last_name_like_russian_patronymic(self): + hn = HumanName("Sergey Vitalyevich Petsevich", constants=self.C) + self.m(hn.first, "Sergey", hn) + self.m(hn.middle, "Vitalyevich", hn) + self.m(hn.last, "Petsevich", hn) + + def test_last_name_like_russian_patronymic_specific_order(self): + hn = HumanName("Petsevich Sergey Vitalyevich", constants=self.C) + self.m(hn.first, "Sergey", hn) + self.m(hn.middle, "Vitalyevich", hn) + self.m(hn.last, "Petsevich", hn) + + def test_turkic_patronymic(self): + hn = HumanName("Leyla Said Gyzy Ahmedova", constants=self.C) + self.m(hn.first, "Leyla", hn) + self.m(hn.middle, "Said Gyzy", hn) + self.m(hn.last, "Ahmedova", hn) + + def test_turkic_patronymic_specific_order(self): + hn = HumanName("Ahmedova Leyla Said Gyzy", constants=self.C) + self.m(hn.first, "Leyla", hn) + self.m(hn.middle, "Said Gyzy", hn) + self.m(hn.last, "Ahmedova", hn) + + # these surnames end with -y (-ый/-ий in Russian) which I would rather not add to the Russian last names endings list + # as the resulting regex would be too broad + # However, if the first name is followed by patronymic, it will be caught and parsed properly + # If it is transliterated as -yi/-yy/-iy/-ii instead of -y, it will also be recognized properly + # It's a shame the usual transliteration of -ый/-ий to English is -y (e.g. Sikorsky) + # I guess it follows the rules for similar last names in Polish language. + # Most popular endings for -y: -ский/-цкий (-sky/-tsky) are already covered, but corner cases like this one remain. 
+ @unittest.expectedFailure + def test_tricky_case1(self): + hn = HumanName("Mogilny Alexander", constants=self.C) # famous hockey player + self.m(hn.first, "Alexander", hn) + self.m(hn.last, "Mogilny", hn) + + def test_tricky_case2(self): + hn = HumanName("Mogilny Alexander Gennadyevich", constants=self.C) # famous hockey player + self.m(hn.first, "Alexander", hn) + self.m(hn.middle, "Gennadyevich", hn) + self.m(hn.last, "Mogilny", hn) + + def test_tricky_case3(self): + hn = HumanName("Mogilnyy Alexander", constants=self.C) # famous hockey player + self.m(hn.first, "Alexander", hn) + self.m(hn.last, "Mogilnyy", hn) + + TEST_NAMES = ( "John Doe", "John Doe, Jr.", pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy