diff --git a/nameparser/config/__init__.py b/nameparser/config/__init__.py index 7b2baef..ae4cbe8 100644 --- a/nameparser/config/__init__.py +++ b/nameparser/config/__init__.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- """ The :py:mod:`nameparser.config` module manages the configuration of the -nameparser. +nameparser. A module-level instance of :py:class:`~nameparser.config.Constants` is created and used by default for all HumanName instances. You can adjust the entire module's @@ -25,11 +25,12 @@ >>> hn.parse_full_name() # need to run this again after config changes **Potential Gotcha**: If you do not pass ``None`` as the second argument, -``hn.C`` will be a reference to the module config, possibly yielding +``hn.C`` will be a reference to the module config, possibly yielding unexpected results. See `Customizing the Parser `_. """ from __future__ import unicode_literals import sys + try: # Python 3.3+ from collections.abc import Set @@ -46,6 +47,7 @@ from nameparser.config.titles import TITLES from nameparser.config.titles import FIRST_NAME_TITLES from nameparser.config.regexes import REGEXES +from nameparser.config.affixes import AFFIXES DEFAULT_ENCODING = 'UTF-8' @@ -57,7 +59,7 @@ class SetManager(Set): Only special functionality beyond that provided by set() is to normalize constants for comparison (lower case, no periods) - when they are add()ed and remove()d and allow passing multiple + when they are add()ed and remove()d and allow passing multiple string arguments to the :py:func:`add()` and :py:func:`remove()` methods. ''' @@ -125,7 +127,7 @@ def remove(self, *strings): class TupleManager(dict): ''' - A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants + A dictionary with dot.notation access. Subclass of ``dict``. Makes the tuple constants more friendly. ''' @@ -148,23 +150,25 @@ class Constants(object): """ An instance of this class hold all of the configuration constants for the parser. 
- :param set prefixes: + :param set prefixes: + :py:attr:`prefixes` wrapped with :py:class:`SetManager`. + :param set family_affixes: :py:attr:`~affixes.AFFIXES` wrapped with :py:class:`SetManager`. - :param set titles: + :param set titles: :py:attr:`titles` wrapped with :py:class:`SetManager`. - :param set first_name_titles: + :param set first_name_titles: :py:attr:`~titles.FIRST_NAME_TITLES` wrapped with :py:class:`SetManager`. - :param set suffix_acronyms: + :param set suffix_acronyms: :py:attr:`~suffixes.SUFFIX_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set suffix_not_acronyms: + :param set suffix_not_acronyms: :py:attr:`~suffixes.SUFFIX_NOT_ACRONYMS` wrapped with :py:class:`SetManager`. - :param set conjunctions: + :param set conjunctions: :py:attr:`conjunctions` wrapped with :py:class:`SetManager`. :type capitalization_exceptions: tuple or dict - :param capitalization_exceptions: + :param capitalization_exceptions: :py:attr:`~capitalization.CAPITALIZATION_EXCEPTIONS` wrapped with :py:class:`TupleManager`. :type regexes: tuple or dict - :param regexes: + :param regexes: :py:attr:`regexes` wrapped with :py:class:`TupleManager`. """ @@ -187,9 +191,9 @@ class Constants(object): empty_attribute_default = '' """ Default return value for empty attributes. - + .. 
doctest:: - + >>> from nameparser.config import CONSTANTS >>> CONSTANTS.empty_attribute_default = None >>> name = HumanName("John Doe") @@ -197,7 +201,7 @@ class Constants(object): None >>>name.first 'John' - + """ capitalize_name = False @@ -233,6 +237,7 @@ class Constants(object): def __init__(self, prefixes=PREFIXES, + family_affixes=AFFIXES, suffix_acronyms=SUFFIX_ACRONYMS, suffix_not_acronyms=SUFFIX_NOT_ACRONYMS, titles=TITLES, @@ -242,6 +247,7 @@ def __init__(self, regexes=REGEXES ): self.prefixes = SetManager(prefixes) + self.family_affixes = SetManager(family_affixes) self.suffix_acronyms = SetManager(suffix_acronyms) self.suffix_not_acronyms = SetManager(suffix_not_acronyms) self.titles = SetManager(titles) diff --git a/nameparser/config/affixes.py b/nameparser/config/affixes.py new file mode 100644 index 0000000..fead9e3 --- /dev/null +++ b/nameparser/config/affixes.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +# https://en.wikipedia.org/wiki/List_of_family_name_affixes + +AFFIXES = set([ + 'a', + 'ab', + 'af', + 'av', + 'ap', + 'abu', + 'ait', + 'aït', + 'alam', + 'at', + 'ath', + 'aust', + 'austre', + 'bar', + 'bat', + 'bath', + 'ben', + 'bin', + 'ibn', + 'bert', + 'bet', + 'bint', + 'da', + 'das', + 'de', + 'degli', + 'del', + 'dele', + 'della', + 'den', + 'der', + 'di', + 'dos', + 'du', + 'e', + 'el', + 'fetch', + 'vetch', + 'fitz', + 'i', + 'kil', + 'gil', + 'la', + 'le', + 'lille', + 'lu', + 'm\'', + 'mc', + 'mac', + 'mck', + 'mhic', + 'mic', + 'mala', + 'mellom', + 'myljom', + 'na', + 'ned', + 'nedre', + 'neder', + 'nic', + 'ni', + 'ní', + 'nin', + 'nord', + 'norr', + 'nord', + 'nordre', + 'ny', + 'o', + 'ua', + 'ua', + 'ui', + 'uí', + 'opp', + 'upp', + 'ofver', + 'ost', + 'oster', + 'over', + 'ovste', + 'ovre', + 'oz', + 'pour', + 'putra', + 'putera', + 'putri', + 'putera', + 'setia', + 'setya', + 'stor', + 'soder', + 'sor', + 'sonder', + 'syd', + 'sondre', + 'syndre', + 'sore', + 'ter', + '\'t', + 
'tre', + 'van', + 'het', + 'de', + 'vast', + 'väst', + 'vaster', + 'väster', + 'verch', + 'erch', + 'vest', + 'vestre', + 'vesle', + 'vetle', + 'von', + 'war', + 'zu', +]) diff --git a/nameparser/parser.py b/nameparser/parser.py index 5e3f32f..ffde81e 100644 --- a/nameparser/parser.py +++ b/nameparser/parser.py @@ -47,6 +47,8 @@ class HumanName(object): * :py:attr:`suffix` * :py:attr:`nickname` * :py:attr:`surnames` + * :py:attr:`family` + * :py:attr:`family_prefix` :param str full_name: The name string to be parsed. :param constants constants: @@ -300,6 +302,16 @@ def last(self): """ return " ".join(self.last_list) or self.C.empty_attribute_default + @property + def family(self): + """ + The person's family name. + """ + s = "" + for affix, family in self.family_list: + s += " ".join([*affix, *family]) or self.C.empty_attribute_default + return s + @property def suffix(self): """ @@ -399,6 +411,19 @@ def is_prefix(self, piece): else: return lc(piece) in self.C.prefixes + def is_family_affix(self, piece): + """ + Lowercase and no periods version of piece is in the + :py:data:`~nameparser.config.family_affixes.AFFIXES` set. + """ + if isinstance(piece, list): + for item in piece: + if self.is_family_affix(item): + return True + else: + return lc(piece) in self.C.family_affixes + + def is_roman_numeral(self, value): """ Matches the ``roman_numeral`` regular expression in @@ -513,9 +538,9 @@ def parse_nicknames(self): Loops through 3 :py:data:`~nameparser.config.regexes.REGEXES`; `quoted_word`, `double_quotes` and `parenthesis`. 
""" - + empty_re = re.compile("") - + re_quoted_word = self.C.regexes.quoted_word or empty_re re_double_quotes = self.C.regexes.double_quotes or empty_re re_parenthesis = self.C.regexes.parenthesis or empty_re @@ -563,6 +588,7 @@ def parse_full_name(self): self.first_list = [] self.middle_list = [] self.last_list = [] + self.family_list = [] self.suffix_list = [] self.nickname_list = [] self.unparsable = True @@ -699,6 +725,19 @@ def parse_full_name(self): except IndexError: pass + for last in self.last_list: + if " " in last: + affix = [] + family = [] + for part in last.split(" "): + if self.is_family_affix(part): + affix.append(part) + else: + family.append(part) + self.family_list.append([affix, family]) + else: + self.family_list.append([[], [last]]) + if len(self) < 0: log.info("Unparsable: \"%s\" ", self.original) else: @@ -968,6 +1007,7 @@ def capitalize(self, force=None): self.first_list = self.cap_piece(self.first, 'first').split(' ') self.middle_list = self.cap_piece(self.middle, 'middle').split(' ') self.last_list = self.cap_piece(self.last, 'last').split(' ') + # self.family_list = self.cap_piece(self.family, 'family').split(' ') self.suffix_list = self.cap_piece(self.suffix, 'suffix').split(', ') def handle_capitalization(self): diff --git a/tests.py b/tests.py index 91917a4..b639f7e 100644 --- a/tests.py +++ b/tests.py @@ -187,6 +187,20 @@ def test_prefix_names(self): self.m(hn.first, "vai", hn) self.m(hn.last, "la", hn) + def test_family_name_and_prefix(self): + hn = HumanName("Vincent van Gogh") + self.m(hn.family, "van Gogh", hn) + self.assertEqual(hn.family_list, [ + [["van"], ["Gogh"]] + ]) + + def test_family_name_and_double_prefix(self): + hn = HumanName("Vincent van der Gogh") + self.m(hn.family, "van der Gogh", hn) + self.assertEqual(hn.family_list, [ + [["van", "der"], ["Gogh"]], + ]) + def test_blank_name(self): hn = HumanName() self.m(hn.first, "", hn) pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy