# -*- coding: utf-8 -*-
"""Tools for parsing people's names from strings into various name components."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import re
import string
from ..text import QUOTES
from ..text.latex import latex_to_unicode
# Matches an ORCID iD: four hyphen-separated groups of four characters. The final character is a check
# character that may be a digit or an upper-case 'X', so it is matched separately from the plain digits.
ORCID_RE = re.compile(r'^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$')

# Titles that may precede a name. Compared against the lower-cased token with full stops removed.
TITLES = {
    'ms', 'miss', 'mrs', 'mr', 'master', 'dr', 'doctor', 'prof', 'professor', 'sir', 'dame', 'madam', 'madame',
    'mademoiselle', 'monsieur', 'lord', 'lady', 'rev', 'reverend', 'fr', 'father', 'brother', 'sister', 'pastor',
    'cardinal', 'abbot', 'abbess', 'friar', 'mother', 'bishop', 'archbishop', 'priest', 'priestess', 'pope', 'vicar',
    'chaplain', 'saint', 'deacon', 'archdeacon', 'rabbi', 'ayatollah', 'imam', 'pres', 'president', 'gov', 'governor',
    'rep', 'representative', 'sen', 'senator', 'minister', 'chancellor', 'cllr', 'councillor', 'secretary', 'speaker',
    'alderman', 'delegate', 'mayor', 'ambassador', 'prefect', 'premier', 'envoy', 'provost', 'coach', 'principal',
    'king', 'queen', 'prince', 'princess', 'royal', 'majesty', 'highness', 'rt', 'duke', 'duchess', 'archduke',
    'archduchess', 'marquis', 'marquess', 'marchioness', 'earl', 'count', 'countess', 'viscount', 'viscountess',
    'baron', 'baroness', 'sheikh', 'emperor', 'empress', 'tsar', 'tsarina', 'uncle', 'auntie', 'aunt', 'atty',
    'attorney', 'advocate', 'judge', 'solicitor', 'barrister', 'comptroller', 'sheriff', 'registrar', 'treasurer',
    'associate', 'assistant', 'honorable', 'honourable', 'deputy', 'vice', 'executive', 'his', 'her', 'private',
    'corporal', 'sargent', 'seargent', 'officer', 'major', 'captain', 'commander', 'lieutenant', 'colonel', 'general',
    'chief', 'admiral', 'pilot', 'resident', 'surgeon', 'nurse', 'col', 'capt', 'cpt', 'maj', 'cpl', 'ltc', 'sgt',
    'pfc', 'sfc', 'mg', 'bg', 'ssgt', 'ltcol', 'majgen', 'gen', 'ltgen', 'sgtmaj', 'bgen', 'lcpl', '2ndlt', '1stlt',
    'briggen', '1stsgt', 'pvt', '2lt', '1lt', 'ens', 'lt', 'adm', 'vadm', 'cpo', 'mcpo', 'mcpoc', 'scpo', 'radm(lh)',
    'radm(uh)', 'ltg',
}

# Particles that introduce a last name (e.g. "von", "van der"). Compared against the lower-cased token with full
# stops removed.
PREFIXES = {
    'abu', 'bon', 'bin', 'da', 'dal', 'de', 'del', 'der', 'di', 'dí', 'ibn', 'la', 'le', 'san', 'st', 'ste',
    'van', 'vel', 'von',
}

# Suffixes that may follow a name: generational markers, honours, degrees and post-nominal letters. Compared
# case-sensitively against the token with full stops removed.
SUFFIXES = {
    'Esq', 'Esquire', 'Bt', 'Btss', 'Jr', 'Sr', '2', 'I', 'II', 'III', 'IV', 'V', 'CLU', 'ChFC', 'CFP', 'MP', 'MSP',
    'MEP', 'AM', 'MLA', 'QC', 'KC', 'PC', 'SCJ', 'MHA', 'MNA', 'MPP', 'VC', 'GC', 'KBE', 'CBE', 'MBE', 'DBE', 'GBE',
    'OBE', 'MD', 'PhD', 'DBEnv', 'DConstMgt', 'DREst', 'EdD', 'DPhil', 'DLitt', 'DSocSci', 'EngD', 'DD', 'LLD', 'DProf',
    'BA', 'BSc', 'LLB', 'BEng', 'MBChB', 'MA', 'MSc', 'MSci', 'MPhil', 'MArch', 'MMORSE', 'MMath', 'MMathStat',
    'MPharm', 'MSt', 'MRes', 'MEng', 'MChem', 'MSocSc', 'MMus', 'LLM', 'BCL', 'MPhys', 'MComp', 'MAcc', 'MFin', 'MBA',
    'MPA', 'MEd', 'MEnt', 'MCGI', 'MGeol', 'MLitt', 'MEarthSc', 'MClinRes', 'MJur', 'FdA', 'FdSc', 'FdEng', 'PgD',
    'PgDip', 'PgC', 'PgCert', 'DipHE', 'OND', 'CertHE', 'RA', 'FRCP', 'FRSC', 'FRSA', 'FRCS', 'FMedSci', 'AMSB',
    'MSB', 'FSB', 'FBA', 'FBCS', 'FCPS', 'FGS', 'FREng', 'FRS', 'FRAeS', 'FRAI', 'FRAS', 'MRCP', 'MRCS', 'MRCA', 'FRCA',
    'MRCGP', 'FRCGP', 'MRSC', 'MRPharmS', 'FRPharmS', 'FZS', 'FRES', 'CBiol', 'CChem', 'CEng', 'CMath', 'CPhys', 'CSci',
}

# All-lower-case spellings of the suffixes, so that e.g. "jr" and "phd" are also accepted.
SUFFIXES_LOWER = {suf.lower() for suf in SUFFIXES}

# Tokens that are never treated as suffixes even though they match one once the full stop is removed.
NOT_SUFFIX = {'I.', 'V.'}
# Make attributes instead of dict style.
# Parse from string as a class method.
# updatable attributes that can be set via constructor or modified at any time.
# to_dict, to_json method?
class PersonName(dict):
    """Class for parsing a person's name into its constituent parts.

    Parses a name string into title, firstname, middlename, nickname, prefix, lastname, suffix.

    Example usage::

        p = PersonName('von Beethoven, Ludwig')

    PersonName acts like a dict::

        print(p)
        print(p['firstname'])
        print(json.dumps(p))

    Name components can also be accessed as attributes (a component that was not found is ``None``)::

        print(p.lastname)

    Instances can be reused by setting the ``fullname`` property::

        p.fullname = 'Henry Ford Jr. III'
        print(p)

    Two PersonName objects are equal if every name component matches exactly. For fuzzy matching, use the `could_be`
    method. This returns True for names that are not explicitly inconsistent.

    This class was written with the intention of parsing BibTeX author names, so name components enclosed within curly
    brackets will not be split.

    """

    # Useful info at http://nwalsh.com/tex/texhelp/bibtx-23.html

    # Issues:
    # - Prefix 'ben' is recognised as middlename. Could distinguish 'ben' and 'Ben'?
    # - Multiple word first names like "Emma May" or "Billy Joe" aren't supported

    # Name components, in the order in which they are joined to build the normalized full name.
    _COMPONENTS = ('title', 'firstname', 'middlename', 'nickname', 'prefix', 'lastname', 'suffix')

    def __init__(self, fullname=None, from_bibtex=False):
        """Initialize with a name string.

        :param str fullname: The person's name. ``None`` or an empty string gives an empty PersonName.
        :param bool from_bibtex: (Optional) Whether the fullname parameter is in BibTeX format. Default False.
        """
        super(PersonName, self).__init__()
        self._from_bibtex = from_bibtex
        # Assigning to the property triggers the parse.
        self.fullname = fullname

    def __repr__(self):
        """Return a constructor-style representation built from the normalized full name."""
        return '%s(%r)' % (self.__class__.__name__, self.fullname)

    def __str__(self):
        """Return the plain dict representation of the parsed components."""
        return dict.__repr__(self)

    def could_be(self, other):
        """Return True if the other PersonName is not explicitly inconsistent.

        Components are compared case-insensitively with punctuation removed, and only when both names have them.
        For firstname, middlename and lastname, a value made up solely of initials is considered consistent with a
        value whose words start with those initials (e.g. "J" and "John").

        :param PersonName other: The name to compare with.
        :returns: False if ``other`` is not a PersonName or any shared component conflicts, otherwise True.
        :rtype: bool
        """
        # TODO: Some suffix and title differences should be allowed
        if not isinstance(other, PersonName):
            # NotImplemented is reserved for binary special methods and must not be used as a truth value.
            return False
        if self == other:
            return True
        puncmap = dict((ord(char), None) for char in string.punctuation)
        for attr in self._COMPONENTS:
            if attr not in self or attr not in other:
                continue
            s = self[attr].lower().translate(puncmap)
            o = other[attr].lower().translate(puncmap)
            if s == o:
                continue
            if attr in ('firstname', 'middlename', 'lastname'):
                if self._is_initials_of(s, o) or self._is_initials_of(o, s):
                    continue
            return False
        return True

    @staticmethod
    def _is_initials_of(initials, name):
        """Return True if ``initials`` is only single characters that match the first letter of each word of ``name``."""
        parts = initials.split()
        return bool(parts) and all(len(part) == 1 for part in parts) and [word[0] for word in name.split()] == parts

    @property
    def fullname(self):
        """The normalized full name built from the parsed components, or an empty string if nothing was parsed."""
        return self.get('fullname', '')

    @fullname.setter
    def fullname(self, fullname):
        # Drop the components of any previously parsed name so that the instance can be reused.
        self.clear()
        self._parse(fullname)

    def __getattr__(self, name):
        """Expose name components as attributes; a component that is not set yields None.

        :raises AttributeError: If ``name`` is not one of the name components.
        """
        if name in self._COMPONENTS:
            return self.get(name)
        raise AttributeError('%r object has no attribute %r' % (self.__class__.__name__, name))

    def _is_title(self, t):
        """Return true if t is a title."""
        return t.lower().replace('.', '') in TITLES

    def _is_prefix(self, t):
        """Return true if t is a prefix."""
        return t.lower().replace('.', '') in PREFIXES

    def _is_suffix(self, t):
        """Return true if t is a suffix.

        Full stops are ignored, and the token must match either the listed spelling or its all-lower-case form.
        Tokens in NOT_SUFFIX are always rejected.
        """
        if t in NOT_SUFFIX:
            return False
        bare = t.replace('.', '')
        return bare in SUFFIXES or bare in SUFFIXES_LOWER

    def _tokenize(self, comps):
        """Split each component on whitespace, unless inside curly brackets.

        Spaces and commas are stripped from the ends of every token and empty tokens are dropped.
        """
        ps = []
        for comp in comps:
            # Only split on whitespace that is followed by no brace before the next '{' or the end of the string,
            # i.e. whitespace that is not inside a {...} group.
            ps.extend([c.strip(' ,') for c in re.split(r'\s+(?=[^{}]*(?:\{|$))', comp)])
        return [p for p in ps if p]

    def _clean(self, t, capitalize=None):
        """Convert to normalized unicode and strip trailing full stops.

        LaTeX conversion is only applied when the instance was created with ``from_bibtex=True``. A trailing full
        stop is only removed from words containing exactly one full stop, so "J." becomes "J" but "R.R." is kept.
        """
        if self._from_bibtex:
            t = latex_to_unicode(t, capitalize=capitalize)
        t = ' '.join([el.rstrip('.') if el.count('.') == 1 else el for el in t.split()])
        return t

    def _strip(self, tokens, criteria, prop, rev=False):
        """Strip off contiguous tokens from the start or end of the list that meet the criteria.

        The stripped tokens are joined, cleaned and stored under ``prop``. At least one token is always left.

        :param list tokens: Tokens to strip from. Modified in place.
        :param criteria: Callable that takes a token and returns whether it should be stripped.
        :param str prop: Key under which to store the stripped tokens.
        :param bool rev: (Optional) Strip from the end of the list instead of the start. Default False.
        :returns: The same ``tokens`` list with the matching tokens removed.
        """
        num = len(tokens)
        # Count the matches on a copy first: removing items from the front of a list while iterating over it
        # shifts the remaining items and causes every other token to be skipped.
        candidates = tokens[::-1] if rev else list(tokens)
        count = 0
        for i, token in enumerate(candidates):
            # num > i + 1 ensures that the final remaining token is never stripped.
            if not (criteria(token) and num > i + 1):
                break
            count += 1
        if count:
            if rev:
                res = tokens[num - count:]
                del tokens[num - count:]
            else:
                res = tokens[:count]
                del tokens[:count]
            self[prop] = self._clean(' '.join(res))
        return tokens

    def _parse(self, fullname):
        """Perform the parsing.

        Handles both "Last, First" (optionally "Last, Suffix, First") and "First Last" orderings, stores each
        component that is found under its own key, and finally stores the normalized name under ``fullname``.
        Nothing is stored if ``fullname`` is None or contains no name.
        """
        if not fullname:
            return
        # Collapse runs of whitespace and drop leading/trailing commas.
        n = ' '.join(fullname.split()).strip(',')
        if not n:
            return
        comps = [p.strip() for p in n.split(',')]
        # A comma followed by something other than suffixes means the last name comes first.
        if len(comps) > 1 and not all(self._is_suffix(comp) for comp in comps[1:]):
            # Take the last name component plus any suffix components that directly follow it. This terminates
            # before comps is exhausted because at least one later component is not a suffix.
            vlj = [comps.pop(0)]
            while self._is_suffix(comps[0]):
                vlj.append(comps.pop(0))
            ltokens = self._tokenize(vlj)
            ltokens = self._strip(ltokens, self._is_prefix, 'prefix')
            ltokens = self._strip(ltokens, self._is_suffix, 'suffix', True)
            self['lastname'] = self._clean(' '.join(ltokens), capitalize='name')
        tokens = self._tokenize(comps)
        tokens = self._strip(tokens, self._is_title, 'title')
        if 'lastname' not in self:
            tokens = self._strip(tokens, self._is_suffix, 'suffix', True)
        # Scan backwards for a run of prefix tokens and record their indices (highest index first).
        voni = []
        end = len(tokens) - 1
        if 'prefix' not in self:
            for i, token in enumerate(reversed(tokens)):
                if self._is_prefix(token):
                    if (i == 0 and end > 0) or ('lastname' not in self and i != end):
                        voni.append(end - i)
                elif (i == 0 and 'lastname' in self) or voni:
                    break
        if voni:
            if 'lastname' not in self:
                self['lastname'] = self._clean(' '.join(tokens[voni[0]+1:]), capitalize='name')
            self['prefix'] = self._clean(' '.join(tokens[voni[-1]:voni[0]+1]))
            tokens = tokens[:voni[-1]]
        elif 'lastname' not in self and tokens:
            # The tokens check guards against input that is only commas and spaces.
            self['lastname'] = self._clean(tokens.pop(), capitalize='name')
        if tokens:
            self['firstname'] = self._clean(tokens.pop(0), capitalize='name')
        if tokens:
            # A nickname runs from a token that starts with a quote to the next token that ends with one.
            nick_span = None
            for i, token in enumerate(tokens):
                if token[0] in QUOTES:
                    for j, token2 in enumerate(tokens[i:]):
                        if token2[-1] in QUOTES:
                            nick_span = (i, i + j + 1)
                            break
            if nick_span:
                start, stop = nick_span
                self['nickname'] = self._clean(' '.join(tokens[start:stop]).strip(''.join(QUOTES)),
                                               capitalize='name')
                del tokens[start:stop]
        if tokens:
            # Whatever is left over is the middle name.
            self['middlename'] = self._clean(' '.join(tokens), capitalize='name')
        namelist = []
        for attr in self._COMPONENTS:
            if attr in self:
                namelist.append('"%s"' % self[attr] if attr == 'nickname' else self[attr])
        self['fullname'] = ' '.join(namelist)