# Source code for chemdataextractor.scrape.clean

# -*- coding: utf-8 -*-
"""
Tools for cleaning up XML/HTML by removing tags entirely or replacing with their contents.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import copy
import logging
import re
from lxml.etree import fromstring, tostring
from lxml.html import fromstring as html_fromstring
import six

from . import BLOCK_ELEMENTS


log = logging.getLogger(__name__)


class Cleaner(object):
    """Clean HTML or XML by removing tags completely or replacing with their contents.

    A Cleaner instance provides a ``clean_markup`` method::

        cleaner = Cleaner()
        htmlstring = '<html><body><script>alert("test")</script><p>Some text</p></body></html>'
        print(cleaner.clean_markup(htmlstring))

    A Cleaner instance is also a callable that can be applied to lxml document trees::

        tree = lxml.etree.fromstring(htmlstring)
        cleaner(tree)
        print(lxml.etree.tostring(tree))

    Elements that are matched by ``kill_xpath`` are removed entirely, along with their contents. By default,
    ``kill_xpath`` matches all script and style tags, comments and processing instructions, as well as any element
    whose ``style`` attribute is exactly ``display:none;``.

    Elements that are matched by ``strip_xpath`` are replaced with their contents. By default, no elements are stripped.
    A common use-case is to set ``strip_xpath`` to ``.//*``, which specifies that all elements should be stripped.

    Elements that are matched by ``allow_xpath`` are excepted from stripping, even if they are also matched by
    ``strip_xpath``. This is useful when setting ``strip_xpath`` to strip all tags, allowing a few exceptions to be
    specified by ``allow_xpath``.

    ``process_xpaths`` maps XPath expressions to callables. Each matched element (other than the root and those
    matched by ``allow_xpath``) is passed to its callable; the element is replaced by the returned element, or removed
    if the callable returns ``None``.
    """

    kill_xpath = './/script | .//style | .//comment() | .//processing-instruction() | .//*[@style="display:none;"]'
    strip_xpath = None
    allow_xpath = None
    fix_whitespace = True
    # A dictionary of xpath string: func(el) -> el or None, used to rewrite or drop matched elements.
    # This class-level default is shared by all instances; it is only read here, never mutated.
    process_xpaths = {}

    namespaces = {
        're': 'http://exslt.org/regular-expressions',
        'set': 'http://exslt.org/sets',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'prism': 'http://prismstandard.org/namespaces/basic/2.0/',
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns',
    }

    def __init__(self, **kwargs):
        """Behaviour can be customized by overriding attributes in a subclass or setting them in the constructor.

        :param string kill_xpath: XPath expression for tags to remove along with their contents.
        :param string strip_xpath: XPath expression for tags to replace with their contents.
        :param string allow_xpath: XPath expression for tags to except from strip_xpath.
        :param bool fix_whitespace: Collapse whitespace runs and ensure newlines around block elements.
        :param dict process_xpaths: Mapping of XPath expression to ``func(el)`` returning a replacement element or None.
        :param dict namespaces: Namespace prefixes to register for the XPaths.
        :raises TypeError: If a keyword does not correspond to an existing attribute.
        """
        # TODO: This is weird. Why don't we change to proper individual keyword arguments with class attribs as default
        for name, value in kwargs.items():
            if not hasattr(self, name):
                raise TypeError('Unknown parameter: %s=%r' % (name, value))
            setattr(self, name, value)

    def __call__(self, doc):
        """Clean the document in place.

        :param doc: An lxml element, or an object with a ``getroot`` method (e.g. an element tree).
        """
        if hasattr(doc, 'getroot'):
            doc = doc.getroot()

        if self.fix_whitespace:
            self._pad_block_elements(doc)

        if self.kill_xpath:
            self._kill(doc)

        # Collect the allowed elements after killing. A set gives constant-time membership tests below;
        # holding the element proxies in the set keeps their identity stable while we work.
        if self.allow_xpath:
            to_keep = set(doc.xpath(self.allow_xpath, namespaces=self.namespaces))
        else:
            to_keep = set()

        if self.strip_xpath:
            self._strip(doc, to_keep)

        self._process(doc, to_keep)

        if self.fix_whitespace:
            self._collapse_whitespace(doc)

    @staticmethod
    def _append_before(parent, previous, text):
        """Append text to the previous sibling's tail, or to the parent's text if there is no previous sibling."""
        if previous is None:
            parent.text = (parent.text or '') + text
        else:
            previous.tail = (previous.tail or '') + text

    def _pad_block_elements(self, doc):
        """Ensure there is a newline immediately before and after every block element below ``doc``."""
        for el in doc.iterdescendants():
            if el.tag in BLOCK_ELEMENTS:
                el.tail = (el.tail or '') + '\n'
                self._append_before(el.getparent(), el.getprevious(), '\n')

    def _kill(self, doc):
        """Remove elements matching ``kill_xpath`` with their contents, keeping their tail text in the tree."""
        for el in doc.xpath(self.kill_xpath, namespaces=self.namespaces):
            parent = el.getparent()
            # We can't kill the root element!
            if parent is None:
                continue
            # The tail belongs to the surrounding content, so move it before removing the element
            if el.tail:
                self._append_before(parent, el.getprevious(), el.tail)
            parent.remove(el)

    def _strip(self, doc, to_keep):
        """Replace elements matching ``strip_xpath`` (and not in ``to_keep``) with their contents."""
        for el in doc.xpath(self.strip_xpath, namespaces=self.namespaces):
            # Skip if allowed by allow_xpath
            if el in to_keep:
                continue
            parent = el.getparent()
            # We can't strip the root element!
            if parent is None:
                continue
            previous = el.getprevious()
            # Append the text to previous tail (or parent text if no previous).
            # Comments and processing instructions have a non-string tag; their text is not content.
            if el.text and isinstance(el.tag, six.string_types):
                self._append_before(parent, previous, el.text)
            # Append the tail to last child tail, or previous tail, or parent text
            if el.tail:
                if len(el):
                    last = el[-1]
                    last.tail = (last.tail or '') + el.tail
                else:
                    self._append_before(parent, previous, el.tail)
            # Splice the children into the parent at the position of the stripped element
            index = parent.index(el)
            parent[index:index + 1] = el[:]

    def _process(self, doc, to_keep):
        """Apply each ``process_xpaths`` callable to its matched elements, replacing or removing them."""
        for xpath, func in self.process_xpaths.items():
            for el in doc.xpath(xpath, namespaces=self.namespaces):
                parent = el.getparent()
                if parent is None or el in to_keep:
                    continue
                # Call func exactly once per element and reuse the result
                new_element = func(el)
                if new_element is None:
                    parent.remove(el)
                else:
                    parent.replace(el, new_element)

    @staticmethod
    def _collapse_whitespace(doc):
        """Collapse whitespace down to a single space or a single newline in all text and tails."""

        def collapse(value):
            # Any whitespace run containing a newline becomes one newline; other space/tab runs become one space
            value = re.sub(r'\s*\n\s*', '\n', value)
            return re.sub(r'[ \t]+', ' ', value)

        for el in doc.iter():
            if el.text is not None:
                el.text = collapse(el.text)
            if el.tail is not None:
                el.tail = collapse(el.tail)

    @staticmethod
    def _serialize(doc, result_type):
        """Return ``doc`` as bytes or text if ``result_type`` is a string type, otherwise return ``doc`` itself."""
        if issubclass(result_type, six.binary_type):
            return tostring(doc, encoding='utf-8')
        elif issubclass(result_type, six.text_type):
            return tostring(doc, encoding='unicode')
        else:
            return doc

    def clean_html(self, html):
        """Apply ``Cleaner`` to HTML string or document and return a cleaned string or document.

        :param html: HTML as a text string, a byte string, or an lxml document (which is copied, not modified).
        :returns: The cleaned result, of the same kind as the input (UTF-8 encoded for byte strings).
        """
        result_type = type(html)
        # Test for text and bytes explicitly: six.string_types excludes bytes on Python 3
        if isinstance(html, (six.binary_type, six.text_type)):
            doc = html_fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return self._serialize(doc, result_type)

    def clean_markup(self, markup, parser=None):
        """Apply ``Cleaner`` to markup string or document and return a cleaned string or document.

        :param markup: Markup as a text string, a byte string, or an lxml document (which is copied, not modified).
        :param parser: Optional parser passed through to ``lxml.etree.fromstring`` when parsing a string.
        :returns: The cleaned result, of the same kind as the input (UTF-8 encoded for byte strings).
        """
        result_type = type(markup)
        # Test for text and bytes explicitly: six.string_types excludes bytes on Python 3
        if isinstance(markup, (six.binary_type, six.text_type)):
            doc = fromstring(markup, parser=parser)
        else:
            doc = copy.deepcopy(markup)
        self(doc)
        return self._serialize(doc, result_type)


#: A default Cleaner instance, which kills comments, processing instructions, script tags, style tags and elements
#: whose ``style`` attribute is exactly ``display:none;``.
clean = Cleaner()

#: Convenience function for applying ``clean`` to a markup string or document (bound ``clean.clean_markup``).
clean_markup = clean.clean_markup

#: Convenience function for applying ``clean`` to an HTML string or document (bound ``clean.clean_html``).
clean_html = clean.clean_html

#: A Cleaner instance that is configured to strip all tags, replacing them with their text contents.
#: The default ``kill_xpath`` still applies, so killed elements are removed along with their contents.
strip = Cleaner(strip_xpath='.//*')

#: Convenience function for applying ``strip`` to a markup string or document (bound ``strip.clean_markup``).
strip_markup = strip.clean_markup

#: Convenience function for applying ``strip`` to an HTML string or document (bound ``strip.clean_html``).
strip_html = strip.clean_html