Source code for chemdataextractor.biblio.person

# -*- coding: utf-8 -*-
"""
Tools for parsing people's names from strings into various name components.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import re
import string

from ..text import QUOTES
from ..text.latex import latex_to_unicode


# Matches a bare ORCID iD: four hyphen-separated groups of four characters. The first fifteen characters are digits;
# the final character is a checksum that may be a digit or an uppercase 'X' (representing the value 10).
ORCID_RE = re.compile(r'^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$')


# Honorifics and ranks recognised as titles. Entries are lowercase with full stops removed, which is the form that
# tokens are normalised to before lookup. Common misspellings (e.g. 'sargent') are included deliberately.
TITLES = {
    'ms', 'miss', 'mrs', 'mr', 'master', 'dr', 'doctor', 'prof', 'professor', 'sir', 'dame', 'madam', 'madame',
    'mademoiselle', 'monsieur', 'lord', 'lady', 'rev', 'reverend', 'fr', 'father', 'brother', 'sister', 'pastor',
    'cardinal', 'abbot', 'abbess', 'friar', 'mother', 'bishop', 'archbishop', 'priest', 'priestess', 'pope', 'vicar',
    'chaplain', 'saint', 'deacon', 'archdeacon', 'rabbi', 'ayatollah', 'imam', 'pres', 'president', 'gov', 'governor',
    'rep', 'representative', 'sen', 'senator', 'minister', 'chancellor', 'cllr', 'councillor', 'secretary', 'speaker',
    'alderman', 'delegate', 'mayor', 'ambassador', 'prefect', 'premier', 'envoy', 'provost', 'coach', 'principal',
    'king', 'queen', 'prince', 'princess', 'royal', 'majesty', 'highness', 'rt', 'duke', 'duchess', 'archduke',
    'archduchess', 'marquis', 'marquess', 'marchioness', 'earl', 'count', 'countess', 'viscount', 'viscountess',
    'baron', 'baroness', 'sheikh', 'emperor', 'empress', 'tsar', 'tsarina', 'uncle', 'auntie', 'aunt', 'atty',
    'attorney', 'advocate', 'judge', 'solicitor', 'barrister', 'comptroller', 'sheriff', 'registrar', 'treasurer',
    'associate', 'assistant', 'honorable', 'honourable', 'deputy', 'vice', 'executive', 'his', 'her', 'private',
    'corporal', 'sergeant', 'sargent', 'seargent', 'officer', 'major', 'captain', 'commander', 'lieutenant',
    'colonel', 'general', 'chief', 'admiral', 'pilot', 'resident', 'surgeon', 'nurse', 'col', 'capt', 'cpt', 'maj',
    'cpl', 'ltc', 'sgt', 'pfc', 'sfc', 'mg', 'bg', 'ssgt', 'ltcol', 'majgen', 'gen', 'ltgen', 'sgtmaj', 'bgen',
    'lcpl', '2ndlt', '1stlt', 'briggen', '1stsgt', 'pvt', '2lt', '1lt', 'ens', 'lt', 'adm', 'vadm', 'cpo', 'mcpo',
    'mcpoc', 'scpo', 'radm(lh)', 'radm(uh)', 'ltg'
}

# Lowercase particles that introduce a last name (e.g. 'von' in 'von Beethoven').
PREFIXES = {
    'abu', 'bon', 'bin', 'da', 'dal', 'de', 'del', 'der', 'di', 'dí', 'ibn', 'la', 'le', 'san', 'st', 'ste',
    'van', 'vel', 'von'
}

# Generational markers, honours, degrees and post-nominal letters, in their conventional capitalisation.
SUFFIXES = {
    'Esq', 'Esquire', 'Bt', 'Btss', 'Jr', 'Sr', '2', 'I', 'II', 'III', 'IV', 'V', 'CLU', 'ChFC', 'CFP', 'MP', 'MSP',
    'MEP', 'AM', 'MLA', 'QC', 'KC', 'PC', 'SCJ', 'MHA', 'MNA', 'MPP', 'VC', 'GC', 'KBE', 'CBE', 'MBE', 'DBE', 'GBE',
    'OBE', 'MD', 'PhD', 'DBEnv', 'DConstMgt', 'DREst', 'EdD', 'DPhil', 'DLitt', 'DSocSci', 'EngD', 'DD', 'LLD', 'DProf',
    'BA', 'BSc', 'LLB', 'BEng', 'MBChB', 'MA', 'MSc', 'MSci', 'MPhil', 'MArch', 'MMORSE', 'MMath', 'MMathStat',
    'MPharm', 'MSt', 'MRes', 'MEng', 'MChem', 'MSocSc', 'MMus', 'LLM', 'BCL', 'MPhys', 'MComp', 'MAcc', 'MFin', 'MBA',
    'MPA', 'MEd', 'MEnt', 'MCGI', 'MGeol', 'MLitt', 'MEarthSc', 'MClinRes', 'MJur', 'FdA', 'FdSc', 'FdEng', 'PgD',
    'PgDip', 'PgC', 'PgCert', 'DipHE', 'OND', 'CertHE', 'RA', 'FRCP', 'FRSC', 'FRSA', 'FRCS', 'FMedSci', 'AMSB',
    'MSB', 'FSB', 'FBA', 'FBCS', 'FCPS', 'FGS', 'FREng', 'FRS', 'FRAeS', 'FRAI', 'FRAS', 'MRCP', 'MRCS', 'MRCA', 'FRCA',
    'MRCGP', 'FRCGP', 'MRSC', 'MRPharmS', 'FRPharmS', 'FZS', 'FRES', 'CBiol', 'CChem', 'CEng', 'CMath', 'CPhys', 'CSci'
}

# All-lowercase variants of SUFFIXES, so that e.g. 'phd' is accepted as well as 'PhD'.
SUFFIXES_LOWER = {suf.lower() for suf in SUFFIXES}

# Tokens that would otherwise match a suffix ('I', 'V') but, written with a full stop, are treated as initials.
NOT_SUFFIX = {'I.', 'V.'}


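# TODO: Possible future improvements to PersonName:
# - Store name components as real attributes instead of dict-style items.
# - Provide parsing from a string as a class method.
# - Make attributes updatable: settable via the constructor or modified at any time.
# - Add to_dict / to_json methods?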


class PersonName(dict):
    """Class for parsing a person's name into its constituent parts.

    Parses a name string into title, firstname, middlename, nickname, prefix, lastname, suffix.

    Example usage::

        p = PersonName('von Beethoven, Ludwig')

    PersonName acts like a dict::

        print(p)
        print(p['firstname'])
        print(json.dumps(p))

    Name components can also be accessed as attributes::

        print(p.lastname)

    Instances can be reused by setting the fullname property::

        p.fullname = 'Henry Ford Jr. III'
        print(p)

    Two PersonName objects are equal if every name component matches exactly. For fuzzy matching, use the `could_be`
    method. This returns True for names that are not explicitly inconsistent.

    This class was written with the intention of parsing BibTeX author names, so name components enclosed within curly
    brackets will not be split.

    """

    # Useful info at  http://nwalsh.com/tex/texhelp/bibtx-23.html

    # Issues:
    # - Prefix 'ben' is recognised as middlename. Could distinguish 'ben' and 'Ben'?
    # - Multiple word first names like "Emma May" or "Billy Joe" aren't supported

    # Name components, in the order in which they are joined to build the normalised 'fullname'.
    _COMPONENTS = ('title', 'firstname', 'middlename', 'nickname', 'prefix', 'lastname', 'suffix')

    # Translation table that deletes ASCII punctuation; built once rather than on every comparison.
    _PUNCTUATION_MAP = dict((ord(char), None) for char in string.punctuation)

    def __init__(self, fullname=None, from_bibtex=False):
        """Initialize with a name string.

        :param str fullname: The person's name. If None (the default) or blank, no components are set.
        :param bool from_bibtex: (Optional) Whether the fullname parameter is in BibTeX format. Default False.
        """
        super(PersonName, self).__init__()
        self._from_bibtex = from_bibtex
        self.fullname = fullname

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, self.fullname)

    def __str__(self):
        return dict.__repr__(self)

    @staticmethod
    def _initials_match(short, full):
        """Return True if every word of ``short`` is one character and they equal the initials of ``full``."""
        parts = short.split()
        return {len(comp) for comp in parts} == {1} and [el[0] for el in full.split()] == parts

    def could_be(self, other):
        """Return True if the other PersonName is not explicitly inconsistent.

        Components present in only one of the two names are ignored. Components present in both are compared
        case-insensitively with ASCII punctuation removed. For firstname, middlename and lastname, a value made
        up solely of initials is also consistent with a value whose words start with those initials.

        :param other: The object to compare against.
        :returns: True if the names could refer to the same person; False otherwise, including when ``other``
            is not of exactly the same type as this object.
        :rtype: bool
        """
        # TODO: Some suffix and title differences should be allowed
        if type(other) is not type(self):
            # NotImplemented is only meaningful for binary-operator methods; returned from an ordinary method it
            # is truthy, which would make a type mismatch look like a positive match.
            return False
        if self == other:
            return True
        for attr in self._COMPONENTS:
            if attr not in self or attr not in other:
                continue
            s = self[attr].lower().translate(self._PUNCTUATION_MAP)
            o = other[attr].lower().translate(self._PUNCTUATION_MAP)
            if s == o:
                continue
            if attr in {'firstname', 'middlename', 'lastname'}:
                if self._initials_match(s, o) or self._initials_match(o, s):
                    continue
            return False
        return True

    @property
    def fullname(self):
        """The normalised full name built by the last parse, or an empty string if nothing was parsed."""
        return self.get('fullname', '')

    @fullname.setter
    def fullname(self, fullname):
        # Discard every component from any previous parse before parsing the new value.
        self.clear()
        self._parse(fullname)

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails: expose name components as attributes,
        # returning None for a component that was not found in the parsed name.
        if name in self._COMPONENTS:
            return self.get(name)
        raise AttributeError(name)

    def _is_title(self, t):
        """Return true if t is a title."""
        return t.lower().replace('.', '') in TITLES

    def _is_prefix(self, t):
        """Return true if t is a prefix."""
        return t.lower().replace('.', '') in PREFIXES

    def _is_suffix(self, t):
        """Return true if t is a suffix."""
        if t in NOT_SUFFIX:
            return False
        bare = t.replace('.', '')
        return bare in SUFFIXES or bare in SUFFIXES_LOWER

    def _tokenize(self, comps):
        """Split each string in comps on whitespace, except whitespace inside curly brackets.

        Spaces and commas are stripped from both ends of every token and empty tokens are dropped.
        """
        ps = []
        for comp in comps:
            # Only split on whitespace that is followed by a run of non-bracket characters ending in an opening
            # bracket or the end of the string, i.e. whitespace that is not inside a {...} group.
            ps.extend([c.strip(' ,') for c in re.split(r'\s+(?=[^{}]*(?:\{|$))', comp)])
        return [p for p in ps if p]

    def _clean(self, t, capitalize=None):
        """Convert to normalized unicode and strip trailing full stops.

        LaTeX conversion is only applied when the instance was created with ``from_bibtex=True``. A trailing full
        stop is only removed from words that contain exactly one full stop, so 'J.' becomes 'J' but 'J.R.' is
        left unchanged.
        """
        if self._from_bibtex:
            t = latex_to_unicode(t, capitalize=capitalize)
        return ' '.join([el.rstrip('.') if el.count('.') == 1 else el for el in t.split()])

    def _strip(self, tokens, criteria, prop, rev=False):
        """Strip off contiguous tokens from the start or end of the list that meet the criteria.

        The stripped tokens are joined with spaces, cleaned and stored under ``self[prop]``. At least one token is
        always left in a non-empty list.

        :param list tokens: The tokens to strip from. Modified in place.
        :param criteria: Callable taking a token and returning whether it should be stripped.
        :param str prop: The name component to store the stripped tokens under.
        :param bool rev: (Optional) Strip from the end of the list instead of the start. Default False.
        :returns: The same ``tokens`` list, with the matching tokens removed.
        """
        num = len(tokens)
        count = 0
        # Count the matching run first and remove it afterwards. Removing tokens from the front of the list
        # while iterating forwards over it makes the iterator skip elements.
        for i, token in enumerate(reversed(tokens) if rev else tokens):
            if criteria(token) and num > i + 1:
                count += 1
            else:
                break
        if count:
            if rev:
                res = tokens[num - count:]
                del tokens[num - count:]
            else:
                res = tokens[:count]
                del tokens[:count]
            self[prop] = self._clean(' '.join(res))
        return tokens

    def _parse(self, fullname):
        """Perform the parsing.

        Populates the name components found in ``fullname`` and then stores the normalised name under
        'fullname'. Nothing is stored if ``fullname`` is None or contains only whitespace and commas.
        """
        if fullname is None:
            return
        n = ' '.join(fullname.split()).strip(',')
        # Input such as ', ,' leaves only whitespace here, which holds no name at all.
        if not n.strip():
            return
        comps = [p.strip() for p in n.split(',')]
        if len(comps) > 1 and not all([self._is_suffix(comp) for comp in comps[1:]]):
            # "Last, First" style: the last name is everything before the first comma, plus any comma-separated
            # suffixes that directly follow it. The enclosing condition guarantees that a non-suffix component
            # remains, so comps[0] always exists when it is inspected.
            vlj = []
            while True:
                vlj.append(comps.pop(0))
                if not self._is_suffix(comps[0]):
                    break
            ltokens = self._tokenize(vlj)
            ltokens = self._strip(ltokens, self._is_prefix, 'prefix')
            ltokens = self._strip(ltokens, self._is_suffix, 'suffix', True)
            self['lastname'] = self._clean(' '.join(ltokens), capitalize='name')
        tokens = self._tokenize(comps)
        tokens = self._strip(tokens, self._is_title, 'title')
        if 'lastname' not in self:
            tokens = self._strip(tokens, self._is_suffix, 'suffix', True)
        # Scan backwards from the end for prefix tokens, recording their indices (highest index first).
        voni = []
        end = len(tokens) - 1
        if 'prefix' not in self:
            for i, token in enumerate(reversed(tokens)):
                if self._is_prefix(token):
                    if (i == 0 and end > 0) or ('lastname' not in self and not i == end):
                        voni.append(end - i)
                else:
                    if (i == 0 and 'lastname' in self) or voni:
                        break
        if voni:
            if 'lastname' not in self:
                self['lastname'] = self._clean(' '.join(tokens[voni[0]+1:]), capitalize='name')
            self['prefix'] = self._clean(' '.join(tokens[voni[-1]:voni[0]+1]))
            tokens = tokens[:voni[-1]]
        elif 'lastname' not in self and tokens:
            self['lastname'] = self._clean(tokens.pop(), capitalize='name')
        if tokens:
            self['firstname'] = self._clean(tokens.pop(0), capitalize='name')
        if tokens:
            # A nickname runs from a token that starts with a quote character to the next token that ends with
            # one. If several such spans exist, the last one found is used.
            nick_span = None
            for i, token in enumerate(tokens):
                if token[0] in QUOTES:
                    for j, token2 in enumerate(tokens[i:]):
                        if token2[-1] in QUOTES:
                            nick_span = (i, i + j)
                            break
            if nick_span is not None:
                first, last = nick_span
                nickname = ' '.join(tokens[first:last+1]).strip(''.join(QUOTES))
                self['nickname'] = self._clean(nickname, capitalize='name')
                del tokens[first:last+1]
        if tokens:
            self['middlename'] = self._clean(' '.join(tokens), capitalize='name')
        namelist = []
        for attr in self._COMPONENTS:
            if attr in self:
                namelist.append('"%s"' % self[attr] if attr == 'nickname' else self[attr])
        self['fullname'] = ' '.join(namelist)