# -*- coding: utf-8 -*-
"""
Text processors.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod
import logging
import re
import six
from . import EMAIL_RE, APOSTROPHES
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)
class BaseProcessor(six.with_metaclass(ABCMeta)):
    """Abstract processor class from which all processors inherit.

    Subclasses must implement a ``__call__()`` method.
    """

    @abstractmethod
    def __call__(self, text):
        """Process the text.

        The base implementation returns the input unchanged.

        :param string text: The input text.
        :returns: The processed text or None.
        :rtype: string or None
        """
        return text
class Chain(object):
    """Apply a series of processors in turn. Stops if a processor returns None."""

    def __init__(self, *callables):
        """
        :param callables: Processors to apply, in the order given. Each one is
            called with the result of the previous one.
        """
        self.callables = callables

    def __call__(self, value):
        """Pass ``value`` through every processor in order.

        :param value: The initial value.
        :returns: The result of the last processor that ran, or None as soon
            as the value becomes None (remaining processors are skipped).
        """
        for func in self.callables:
            # A None value means "discarded": do not feed it to later processors.
            if value is None:
                break
            value = func(value)
        return value
class Discard(object):
    """Return None if value matches a string."""

    def __init__(self, *match):
        """
        :param match: The values that should be discarded.
        """
        self.match = match

    def __call__(self, value):
        """Return None if ``value`` equals one of the discard values, else ``value``.

        :param value: The value to check.
        :returns: None when ``value`` is one of the ``match`` entries, otherwise
            ``value`` unchanged.
        """
        if value in self.match:
            return None
        return value
class LAdd(object):
    """Add a substring to the start of a value."""

    def __init__(self, substring):
        """
        :param substring: The text to prepend.
        """
        self.substring = substring

    def __call__(self, value):
        """Return ``value`` with the substring prepended.

        Both parts are rendered with ``%s`` formatting, so the result is
        always a string.

        :param value: The value to prefix.
        :returns: The substring followed by ``value``.
        :rtype: string
        """
        return '%s%s' % (self.substring, value)
class RAdd(object):
    """Add a substring to the end of a value."""

    def __init__(self, substring):
        """
        :param substring: The text to append.
        """
        self.substring = substring

    def __call__(self, value):
        """Return ``value`` with the substring appended.

        Both parts are rendered with ``%s`` formatting, so the result is
        always a string.

        :param value: The value to suffix.
        :returns: ``value`` followed by the substring.
        :rtype: string
        """
        return '%s%s' % (value, self.substring)
class LStrip(object):
    """Remove a substring from the start of a value."""

    def __init__(self, *substrings):
        """
        :param substrings: Candidate prefixes, tried in the order given.
        """
        self.substrings = substrings

    def __call__(self, value):
        """Remove the first matching prefix from ``value``.

        Only one prefix is removed, and only once.

        :param string value: The value to strip.
        :returns: ``value`` without the first candidate prefix it starts with,
            or ``value`` unchanged if none match.
        :rtype: string
        """
        for substring in self.substrings:
            # Every string starts with '', so an empty candidate would end the
            # search without removing anything; skip it and try the next one.
            if substring and value.startswith(substring):
                return value[len(substring):]
        return value
class RStrip(object):
    """Remove a substring from the end of a value."""

    def __init__(self, *substrings):
        """
        :param substrings: Candidate suffixes, tried in the order given.
        """
        self.substrings = substrings

    def __call__(self, value):
        """Remove the first matching suffix from ``value``.

        Only one suffix is removed, and only once.

        :param string value: The value to strip.
        :returns: ``value`` without the first candidate suffix it ends with,
            or ``value`` unchanged if none match.
        :rtype: string
        """
        for substring in self.substrings:
            # An empty candidate must be skipped: every string ends with '',
            # and value[:-0] is value[:0], which would return '' and wipe the
            # whole value instead of leaving it untouched.
            if substring and value.endswith(substring):
                return value[:-len(substring)]
        return value
def floats(s):
    """Convert string to float. Handles more string formats than the standard python conversion.

    The plain ``float()`` conversion is tried first. If it raises ValueError,
    the string is cleaned up (bracketed and ``±`` uncertainties, surrounding
    punctuation and currency symbols, thousands separators, whitespace and
    ``×10^n`` style exponents) and converted again.

    :param s: The value to convert.
    :returns: The parsed number.
    :rtype: float
    :raises ValueError: If the value still cannot be parsed after clean-up.
    """
    try:
        return float(s)
    except ValueError:
        # Fall through to the tolerant parser below. Doing the clean-up outside
        # the handler keeps a final failure from being reported as an error
        # raised while handling another error.
        pass
    s = re.sub(r'(\d)\s*\(\d+(\.\d+)?\)', r'\1', s)  # Remove bracketed numbers from end
    s = re.sub(r'(\d)\s*±\s*\d+(\.\d+)?', r'\1', s)  # Remove uncertainties from end
    s = s.rstrip('\'"+-=<>/,.:;!?)]}…∼~≈×*_≥≤')  # Remove trailing punctuation
    # Note: '-' is deliberately absent here so a leading minus sign survives.
    s = s.lstrip('\'"+=<>/([{∼~≈×*_≥≤£$€#§')  # Remove leading punctuation
    s = s.replace(',', '')  # Remove commas
    s = ''.join(s.split())  # Strip whitespace
    s = re.sub(r'(\d)\s*[×x]\s*10\^?(-?\d)', r'\1e\2', s)  # Convert scientific notation
    return float(s)
def strip_querystring(url):
    """Remove the querystring from the end of a URL.

    Anything after the path (path parameters, query and fragment) is dropped.

    :param string url: The URL to clean. It may be absolute, protocol-relative
        (``//host/path``) or a bare path.
    :returns: The URL up to and including its path.
    :rtype: string
    """
    # Function-scope stdlib import with a Python 2 fallback; this is the same
    # function that ``six.moves.urllib.parse.urlparse`` resolves to.
    try:
        from urllib.parse import urlparse
    except ImportError:  # Python 2
        from urlparse import urlparse

    p = urlparse(url)
    if p.scheme:
        prefix = p.scheme + "://" + p.netloc
    elif p.netloc:
        # Protocol-relative URL: keep the leading '//' but invent no scheme.
        prefix = "//" + p.netloc
    else:
        # Bare path: adding '://' here would produce an invalid URL.
        prefix = ""
    return prefix + p.path
class Substitutor(object):
    """Perform a list of substitutions defined by regex on text.

    Useful to clean up text where placeholders are used in place of actual unicode characters.
    """

    def __init__(self, substitutions):
        """
        :param substitutions: List of (regex, string) tuples that define the substitution.
            A regex given as a string is compiled with the ``re.I`` and ``re.U``
            flags; any other object is stored as given and must provide a
            ``sub(replacement, text)`` method.
        """
        self.substitutions = []
        for pattern, replacement in substitutions:
            if isinstance(pattern, six.string_types):
                # Compile once here so every call reuses the pattern object.
                pattern = re.compile(pattern, re.I | re.U)
            self.substitutions.append((pattern, replacement))

    def __call__(self, t):
        """Run substitutions on given text and return it.

        Substitutions are applied in the order they were supplied, each one
        operating on the output of the previous one.

        :param string t: The text to run substitutions on.
        :returns: The text after all substitutions.
        :rtype: string
        """
        for pattern, replacement in self.substitutions:
            t = pattern.sub(replacement, t)
        return t
def unapostrophe(text):
    """Strip apostrophe and 's' from the end of a string.

    A single trailing apostrophe character (any member of ``APOSTROPHES``),
    optionally followed by ``s``, is removed.

    :param string text: The input text.
    :returns: The text without a trailing apostrophe or apostrophe-s.
    :rtype: string
    """
    # Escape every member before building the character class, so that a regex
    # metacharacter in APOSTROPHES (']', '^', '-', '\') cannot corrupt it.
    apostrophes = ''.join(re.escape(char) for char in APOSTROPHES)
    return re.sub(r'[%s]s?$' % apostrophes, '', text)