Source code for chemdataextractor.reader.springer

# -*- coding: utf-8 -*-
"""
Readers for documents from Springer.

.. codeauthor:: Callum Court

"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from lxml import etree
import six
from lxml.html import HTMLParser
from ..text import get_encoding
from .markup import HtmlReader, XmlReader
from ..scrape.clean import clean, Cleaner, strip_html
from ..scrape.pub.springer import tidy_springer_references

# Cleaner instance used in the ``cleaners`` list of both Springer readers in this module.
# It is built with whitespace fixing enabled and an XPath that selects <sub>, <em> and
# <strong> elements for the Cleaner's "strip" step.
# NOTE(review): presumably "strip" drops the tag but keeps its text -- confirm against Cleaner.
clean_springer_html = Cleaner(fix_whitespace=True, strip_xpath='.//sub | .//em | .//strong')


class SpringerMaterialsHtmlReader(HtmlReader):
    """Reader for HTML documents from SpringerMaterials."""

    # Cleaning callables handed to the base reader.
    # NOTE(review): presumably applied in list order -- confirm in HtmlReader.
    cleaners = [clean, clean_springer_html]

    # CSS selectors consumed by the base reader to locate each document part.
    root_css = 'html'
    citation_css = 'span[class="CitationRef"]'
    title_css = 'title'
    heading_css = 'h2, h3, h4, h5, h6, .title1, span.title2, span.title3'
    table_css = 'div[class="Table"]'
    table_caption_css = 'div[class="Table"] p'
    # NOTE(review): SpringerHtmlReader in this module uses 'thead tr' / 'tbody tr' for the
    # row selectors; confirm that selecting the bare <thead>/<tbody> is intended here.
    table_head_row_css = 'thead'
    table_body_row_css = 'tbody'
    table_cell_css = 'th, td'
    ignore_css = 'sub, sup, em[class^="EmphasisTypeItalic "], li[class="article-metrics__item"], div[class="CitationContent"]'

    def detect(self, fstring, fname=None):
        """Return True if the input looks like a Springer HTML page.

        :param fstring: Raw document content as a byte string.
        :param fname: Optional filename. When given, it must end in ``.html`` or ``.htm``.
        :returns: True if ``fstring`` contains one of the known Springer marker snippets,
            otherwise False.
        """
        if fname and not fname.endswith(('.html', '.htm')):
            return False
        # Either the springernature.com footer link or the SpringerLink <meta> tag is enough.
        return (
            b'<a class="footer-copyright_link" href="http://www.springernature.com"' in fstring
            or b'<meta content="SpringerLink"' in fstring
        )

    def _make_tree(self, fstring):
        """Parse ``fstring`` with lxml's HTML parser and return the root element.

        The parser encoding comes from ``get_encoding``, called with ``'utf-8'`` as its guess.
        """
        encoding = get_encoding(fstring, guesses='utf-8', is_html=True)
        return etree.fromstring(fstring, parser=HTMLParser(encoding=encoding))

def springer_html_whitespace(document):
    """Blank out ``text`` and ``tail`` on every element where they are only whitespace.

    :param document: Parsed tree exposing ``xpath()`` (e.g. an lxml element).
    :returns: The same ``document`` object, modified in place.
    """
    # '//*' selects every element in the tree.
    for el in document.xpath('//*'):
        # Test the value itself rather than str(value): str() turns None into the
        # text 'None', and on Python 2 it raises UnicodeEncodeError for non-ASCII text.
        if el.text is not None and el.text.isspace():
            el.text = ''
        if el.tail is not None and el.tail.isspace():
            el.tail = ''
    return document

def fix_springer_table_whitespace(document):
    """Strip leading and trailing whitespace from the text of ``<p>`` elements inside tables.

    Only each paragraph's own ``text`` is stripped; tails and the text of nested child
    elements are left untouched. Empty or missing text is skipped.

    :param document: Parsed tree exposing ``xpath()`` (e.g. an lxml element).
    :returns: The same ``document`` object, modified in place.
    """
    # The original expression was the union './/table//p | .//table//p', i.e. the same
    # node-set twice; a single './/table//p' selects exactly the same elements.
    for el in document.xpath('.//table//p'):
        if el.text:
            # Strip the text directly: wrapping it in str() breaks on non-ASCII text
            # under Python 2.
            el.text = el.text.strip()
    return document

class SpringerHtmlReader(HtmlReader):
    """Reader for HTML documents from Springer."""

    # Cleaning callables handed to the base reader.
    # NOTE(review): presumably applied in list order -- confirm in HtmlReader.
    cleaners = [clean, springer_html_whitespace, clean_springer_html, strip_html, tidy_springer_references, fix_springer_table_whitespace]

    # CSS selectors consumed by the base reader to locate each document part.
    root_css = 'html'
    title_css = 'h1[class^="ArticleTitle"]'
    heading_css = 'h2, h3, h4'
    table_css = 'div[class="Table"]'
    table_caption_css = 'div[class^="Caption"] p'
    table_head_row_css = 'thead tr'
    table_body_row_css = 'tbody tr'
    table_cell_css = 'td, th'
    figure_css = 'figure'
    figure_caption_css = 'figcaption'
    figure_label_css = 'figcaption span[class^="CaptionNumber"]'
    # citation_css = 'ce|bib-reference'
    # Page chrome (navigation, banners, footer, sidebars, author lists, bibliography) and
    # equation markup that should be ignored. The literal uses backslash continuations, so
    # the indentation of the continuation lines is part of the string; keep it as written.
    ignore_css = 'a[class="skip-to__link pseudo-focus"], div[class="nojs-banner u-interface"], a[class="skip-to__link skip-to__link--contents pseudo-focus"],\
                  p[class="leaderboard__label"], div[class="u-screenreader-only"], label[for="search-springerlink"], span[class="search-button__title"],\
                  span[class="u-overflow-ellipsis"], span[class="u-overflow-ellipsis"], a[class="c-button c-button--blue c-button__icon-right gtm-pdf-link"],\
                  div[class="leaderboard u-hide"], title, li[class="article-metrics__item"], aside[class="section section--collapsible"], a[class="gtm-cite-link"],\
                  span[class="u-screenreader-only"], div[class="authors__list"], a[class="gtm-tab-authorsandaffiliations"], ol[class="BibliographyWrapper"],\
                  h2[id="copyrightInformation"], div[class="content authors-affiliations u-interface"], p[class="footer__copyright"], p[class="footer__user-access-info"],\
                  span[class="u-screenreader-only"], a[href="/contactus"], a[class="gtm-footer-accessibility"], ul[class="footer__nav"], div[class="footer__aside-wrapper"],\
                  aside[class="main-sidebar-right u-interface"], a[class="c-button share-this gtm-shareby-sharelink-link test-shareby-sharelink-link"],\
                  a[class="gtm-export-citation"], ul[class="citations__content"], h3[data-role="button-dropdown__title"],\
                  div[class="section section--collapsible uptodate-recommendations gtm-recommendations"], span[class="InlineEquation"], div[class="EquationContent"],\
                  div[class="EquationNumber"], footer'

    def detect(self, fstring, fname=None):
        """Return True if the input looks like a Springer article HTML page.

        :param fstring: Raw document content as a byte string.
        :param fname: Optional filename. When given, it must end in ``.html`` or ``.htm``.
        :returns: True if ``fstring`` contains the "Springer US" citation_publisher
            ``<meta>`` tag or the SpringerLink ``<meta>`` tag, otherwise False.
        """
        if fname and not fname.endswith(('.html', '.htm')):
            return False
        # Detection only reports a result; it no longer prints to stdout on a match.
        return (
            b'<meta content="Springer US" name="citation_publisher"' in fstring
            or b'<meta content="SpringerLink"' in fstring
        )

    def _make_tree(self, fstring):
        """Parse ``fstring`` with lxml's HTML parser and return the root element.

        The parser encoding comes from ``get_encoding``, called with ``'utf-8'`` as its guess.
        """
        encoding = get_encoding(fstring, guesses='utf-8', is_html=True)
        return etree.fromstring(fstring, parser=HTMLParser(encoding=encoding))