Source code for chemdataextractor.reader.rsc

# -*- coding: utf-8 -*-
"""
Readers for documents from the RSC.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging

from ..doc.text import Footnote, Caption
from ..scrape.pub.rsc import replace_rsc_img_chars
from ..scrape.clean import clean, Cleaner
from .markup import HtmlReader
from ..doc.table import Table
from lxml import etree


log = logging.getLogger(__name__)

# XML stripper that removes the tags around numbers in chemical formulas
strip_rsc_html = Cleaner(strip_xpath='.//b', kill_xpath=".//p[contains(@class, 'header_text')] | //*[@id='art-admin']")


[docs]def rsc_html_whitespace(document):
    """ Remove whitespace in xml.text or xml.tails for all elements, if it is only whitespace """
    # selects all tags and checks if the text or tail are spaces
    for el in document.xpath('//*'):
        if el.tag == 'b':
            continue
        if str(el.text).isspace():
            el.text = ''
        if str(el.tail).isspace():
            el.tail = ''
        if el.text:
            el.text = el.text.replace('\n', ' ')
    return document


[docs]def join_rsc_table_captions(document):
    """Add wrapper tag around Tables and their respective captions

    Arguments:
        document {[type]} -- [description]
    """
    for el in document.xpath('//div[@class="table_caption"]'):
        next_el = el.getnext()
        if next_el.tag == 'div' and next_el.attrib['class'] == 'rtable__wrapper':
            caption_el = el
            table_el = next_el
            table_el.insert(0, caption_el)
    return document


[docs]class RscHtmlReader(HtmlReader):
    """Reader for HTML documents from the RSC."""

    cleaners = [clean, rsc_html_whitespace, replace_rsc_img_chars, join_rsc_table_captions, strip_rsc_html]

    root_css = 'html'
    title_css = 'h1, .title_heading'
    heading_css = 'h2, h3, h4, h5, h6, .a_heading, .b_heading, .c_heading, .c_heading_indent, .d_heading, .d_heading_indent'
    citation_css = 'span[id^="cit"]'
    table_css = 'div[class^="rtable__wrapper"]'
    table_caption_css = '.table_caption'
    table_head_row_css = 'table thead tr'
    table_body_row_css = 'table tbody tr'
    table_footnote_css = 'table tfoot tr th .sup_inf'
    reference_css = 'small sup a, a[href^="#cit"], a[href^="#fn"], a[href^="#tab"]'
    figure_css = '.image_table'
    figure_caption_css = '.graphic_title'
    figure_label_css = 'td.image_title b'
    figure_download_link_css = 'img::attr(src)'
    ignore_css = '.table_caption + table, .left_head, sup span.sup_ref, small sup a, a[href^="#fn"], .PMedLink'

    def _parse_table_footnotes(self, fns, refs, specials):
        """Override to account for awkward RSC table footnotes."""
        footnotes = []
        for fn in fns:
            footnote = self._parse_text(fn, refs=refs, specials=specials, element_cls=Footnote)[0]
            footnote += Footnote('', id=fn.getprevious().get('id'))
            footnotes.append(footnote)
        return footnotes

[docs]    def detect(self, fstring, fname=None):
        """"""
        if fname and not (fname.endswith('.html') or fname.endswith('.htm')):
            return False
        if b'meta name="citation_doi" content="10.1039' in fstring or b'meta content="Royal Society of Chemistry"' in fstring:
            return True
        return False