Source code for chemdataextractor.text.normalize

# -*- coding: utf-8 -*-
"""
Tools for normalizing text.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod
import re
import unicodedata

import six

from . import CONTROLS, HYPHENS, QUOTES, DOUBLE_QUOTES, ACCENTS, SINGLE_QUOTES, APOSTROPHES, SLASHES, TILDES, MINUSES
from .processors import BaseProcessor


class BaseNormalizer(six.with_metaclass(ABCMeta, BaseProcessor)):
    """Abstract base class shared by every normalizer.

    A concrete subclass has to provide its own ``normalize()`` method. Instances are callable: calling one is
    the same as calling its ``normalize()`` method.
    """

    @abstractmethod
    def normalize(self, text):
        """Return a normalized version of ``text``.

        The base implementation hands the text back unchanged.

        :param string text: The text to normalize.
        :returns: Normalized text.
        :rtype: string
        """
        return text

    def __call__(self, text):
        """Allow the instance to be used like a function by delegating to ``normalize()``."""
        normalized = self.normalize(text)
        return normalized


class Normalizer(BaseNormalizer):
    """General-purpose normalizer for English text.

    Handles unicode normal form, control characters and unusual whitespace, and optionally hyphens, quotes,
    ellipses, slashes and tildes.

    Unicode normalization uses the NFKC form unless told otherwise: a compatibility decomposition (which unifies
    equivalent characters) followed by a canonical composition. The available normal forms are described at
    https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
    """

    def __init__(self, form='NFKC', strip=True, collapse=True, hyphens=False, quotes=False, ellipsis=False,
                 slashes=False, tildes=False):
        """Choose which normalization steps ``normalize()`` performs.

        :param string form: Unicode normal form to apply, or None to skip unicode normalization.
        :param bool strip: Remove whitespace from both ends of the text.
        :param bool collapse: Reduce every run of whitespace (including tabs and newlines) to one space.
        :param bool hyphens: Turn all hyphens, minuses and dashes into the ASCII hyphen-minus and drop soft hyphens.
        :param bool quotes: Turn all apostrophes, quotes and primes into the ASCII quote characters.
        :param bool ellipsis: Turn ellipses into three full stops.
        :param bool slashes: Turn slash characters into the ASCII slash.
        :param bool tildes: Turn tilde characters into the ASCII tilde.
        """
        self.form, self.strip, self.collapse = form, strip, collapse
        self.hyphens, self.quotes, self.ellipsis = hyphens, quotes, ellipsis
        self.slashes, self.tildes = slashes, tildes

    def normalize(self, text):
        """Apply the configured normalization steps to a string.

        :param text: The string to normalize.
        :returns: The normalized string.
        """
        # Unicode normal form first (NFKC unless configured otherwise)
        if self.form is not None:
            text = unicodedata.normalize(self.form, text)

        # Control characters are simply dropped
        for control_char in CONTROLS:
            text = text.replace(control_char, '')

        # Whitespace that unicodedata leaves alone: odd spacing characters become a space...
        for odd_space in ('\u000b', '\u000c', '\u0085'):
            text = text.replace(odd_space, ' ')
        # ...and every kind of line break becomes '\n' ('\r\n' must be handled before the lone '\r')
        for line_break in ('\u2028', '\u2029', '\r\n', '\r'):
            text = text.replace(line_break, '\n')

        if self.hyphens:
            # TODO: Better normalization of em/en dashes to '--' if surrounded by spaces or start/end?
            for dash in HYPHENS | MINUSES:
                text = text.replace(dash, '-')
            # Soft hyphens are removed outright rather than converted
            text = text.replace('\u00ad', '')

        if self.quotes:
            for mark in DOUBLE_QUOTES:
                text = text.replace(mark, '"')  # \u0022
            for mark in (SINGLE_QUOTES | APOSTROPHES | ACCENTS):
                text = text.replace(mark, "'")  # \u0027
            # Primes expand to the matching number of ASCII apostrophes
            prime_table = (
                ('′', "'"),     # \u2032 prime
                ('‵', "'"),     # \u2035 reversed prime
                ('″', "''"),    # \u2033 double prime
                ('‶', "''"),    # \u2036 reversed double prime
                ('‴', "'''"),   # \u2034 triple prime
                ('‷', "'''"),   # \u2037 reversed triple prime
                ('⁗', "''''"),  # \u2057 quadruple prime
            )
            for prime, apostrophes in prime_table:
                text = text.replace(prime, apostrophes)

        if self.ellipsis:
            text = text.replace('…', '...')  # \u2026
            text = text.replace(' . . . ', ' ... ')

        if self.slashes:
            for slash_char in SLASHES:
                text = text.replace(slash_char, '/')

        if self.tildes:
            for tilde_char in TILDES:
                text = text.replace(tilde_char, '~')

        if self.strip:
            text = text.strip()

        if self.collapse:
            # Splitting on whitespace and re-joining leaves single spaces between words
            words = text.split()
            text = ' '.join(words)
        return text

#: Default normalizer: canonicalizes unicode and tidies whitespace, leaving hyphens, quotes and ellipses alone.
normalize = Normalizer(
    strip=True,
    collapse=True,
    hyphens=False,
    quotes=False,
    ellipsis=False,
)
#: More aggressive normalizer that also standardizes hyphens, quotes, ellipses and tildes.
strict_normalize = Normalizer(
    strip=True,
    collapse=True,
    hyphens=True,
    quotes=True,
    ellipsis=True,
    tildes=True,
)


class ExcessNormalizer(Normalizer):
    """Excessive string normalization.

    This is useful when doing fuzzy string comparisons. A common use case is to run this before calculating the
    Levenshtein distance between two strings, so that only "important" differences are counted.
    """

    def __init__(self, form='NFKC', strip=True, collapse=True, hyphens=True, quotes=True, ellipsis=True, tildes=True):
        """Configure the underlying :class:`Normalizer`, with the punctuation options enabled by default.

        :param string form: Normal form for unicode normalization.
        :param bool strip: Whether to strip whitespace from start and end.
        :param bool collapse: Whether to collapse all whitespace (tabs, newlines) down to single spaces.
        :param bool hyphens: Whether to normalize all hyphens, minuses and dashes to the ASCII hyphen-minus character.
        :param bool quotes: Whether to normalize all apostrophes, quotes and primes to the ASCII quote character.
        :param bool ellipsis: Whether to normalize ellipses to three full stops.
        :param bool tildes: Whether to normalize tilde characters to the ASCII tilde character.
        """
        super(ExcessNormalizer, self).__init__(form, strip=strip, collapse=collapse, hyphens=hyphens, quotes=quotes,
                                               ellipsis=ellipsis, tildes=tildes)

    def normalize(self, text):
        """Lowercase the text, run the standard normalization, then strip whitespace and unify quotes and brackets.

        :param text: The string to normalize.
        :returns: The normalized string, containing no whitespace.
        """
        # Lowercase and normalize unicode
        text = super(ExcessNormalizer, self).normalize(text.lower())
        # Remove all whitespace
        text = ''.join(text.split())
        # Convert all apostrophes, quotes, accents, primes to single ascii apostrophe
        for quote in QUOTES:
            text = text.replace(quote, "'")
        # Convert all brackets (including HTML-escaped angle brackets) to regular parentheses.
        # Opening forms map to '(' and closing forms map to ')', so bracket direction is retained.
        for opening in ('<', '[', '{', '&lt;'):
            text = text.replace(opening, '(')
        for closing in ('>', ']', '}', '&gt;'):
            text = text.replace(closing, ')')
        return text


#: Ready-made :class:`ExcessNormalizer` instance with every option switched on.
excess_normalize = ExcessNormalizer(
    strip=True,
    collapse=True,
    hyphens=True,
    quotes=True,
    ellipsis=True,
    tildes=True,
)


class ChemNormalizer(Normalizer):
    """Normalizer that also unifies chemical spelling."""

    #: Case-insensitive spelling variants paired with the spelling they are rewritten to.
    _SPELLINGS = (
        (re.compile(r'sulph', re.I), 'sulf'),
        (re.compile(r'aluminum', re.I), 'aluminium'),
        (re.compile(r'cesium', re.I), 'caesium'),
    )

    def __init__(self, form='NFKC', strip=True, collapse=True, hyphens=True, quotes=True, ellipsis=True, tildes=True,
                 chem_spell=True):
        """Configure the underlying :class:`Normalizer` and the chemical spelling step.

        :param string form: Normal form for unicode normalization.
        :param bool strip: Whether to strip whitespace from start and end.
        :param bool collapse: Whether to collapse all whitespace (tabs, newlines) down to single spaces.
        :param bool hyphens: Whether to normalize all hyphens, minuses and dashes to the ASCII hyphen-minus character.
        :param bool quotes: Whether to normalize all apostrophes, quotes and primes to the ASCII quote character.
        :param bool ellipsis: Whether to normalize ellipses to three full stops.
        :param bool tildes: Whether to normalize tilde characters to the ASCII tilde character.
        :param bool chem_spell: Whether to unify spelling variants (sulph/sulf, aluminum/aluminium, cesium/caesium).
        """
        super(ChemNormalizer, self).__init__(form, strip=strip, collapse=collapse, hyphens=hyphens, quotes=quotes,
                                             ellipsis=ellipsis, tildes=tildes)
        self.chem_spell = chem_spell

    @staticmethod
    def _preserve_case(replacement):
        """Build a ``re.sub`` callback that writes ``replacement`` using the casing of the matched text.

        An all-uppercase match yields an uppercase replacement, a match starting with a capital yields a
        capitalized replacement, and anything else yields the replacement as given.
        """
        def substitute(match):
            found = match.group(0)
            if found.isupper():
                return replacement.upper()
            if found[0].isupper():
                return replacement[0].upper() + replacement[1:]
            return replacement
        return substitute

    def normalize(self, text):
        """Normalize unicode, hyphens, whitespace, and some chemistry terms and formatting.

        :param text: The string to normalize.
        :returns: The normalized string.
        """
        text = super(ChemNormalizer, self).normalize(text)
        # Normalize element spelling
        if self.chem_spell:
            # Keep the casing of the matched word, so that e.g. "Sulphur" and "Sulfur" normalize to the same
            # string instead of the rewritten variant being forced to lowercase.
            for pattern, spelling in self._SPELLINGS:
                text = pattern.sub(self._preserve_case(spelling), text)
        return text


#: Ready-made :class:`ChemNormalizer` instance with every option, including chemical spelling, switched on.
chem_normalize = ChemNormalizer(
    strip=True,
    collapse=True,
    hyphens=True,
    quotes=True,
    ellipsis=True,
    tildes=True,
    chem_spell=True,
)