Source code for chemdataextractor.reader.acs

# -*- coding: utf-8 -*-
"""
Readers for documents from the ACS.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from .markup import HtmlReader
from ..scrape.clean import clean, Cleaner


#: Extra cleaner for ACS HTML, configured with a ``kill_xpath`` that matches
#: ``ul`` elements of class "anchors" and ``div`` elements of class
#: "citationLinks".  TODO: Move to ignore_css?
clean_acs_html = Cleaner(
    kill_xpath='.//ul[@class="anchors"] | .//div[@class="citationLinks"]',
)


class AcsHtmlReader(HtmlReader):
    """Reader for HTML documents from the ACS.

    Specialises :class:`HtmlReader` by declaring the cleaners and CSS
    selectors below, and by implementing :meth:`detect` to recognise ACS
    article pages.
    """

    # Cleaners for this reader: the generic ``clean`` followed by the
    # ACS-specific ``clean_acs_html``.
    cleaners = [clean, clean_acs_html]

    # CSS selectors for the parts of an ACS article page; presumably consumed
    # by the parsing logic in HtmlReader (defined elsewhere -- confirm there).
    root_css = '#articleMain, article'
    title_css = 'h1.articleTitle'
    heading_css = 'h2, h3, h4, h5, h6, .title1, span.title2, span.title3'
    table_css = '.NLM_table-wrap'
    table_caption_css = '.NLM_caption'
    table_footnote_css = '.footnote'
    figure_css = '.figure'
    figure_caption_css = '.caption'
    citation_css = '.reference'
    ignore_css = 'a[href="JavaScript:void(0);"], a.ref sup'

    def detect(self, fstring, fname=None):
        """Return whether the input looks like an ACS HTML document.

        :param fstring: Raw document content. It is searched for a bytes
            pattern, so it is expected to be a bytes object.
        :param fname: Optional filename. When given and non-empty, it must end
            with ``.html`` or ``.htm`` (case-sensitive) for detection to
            proceed.
        :returns: ``True`` if the filename check passes and the content
            contains a ``dc.Identifier`` DOI meta tag whose value starts with
            ``10.1021/``; ``False`` otherwise.
        :rtype: bool
        """
        # A filename with any other extension rules the document out
        # immediately; a missing/empty filename skips this check.
        if fname and not fname.endswith(('.html', '.htm')):
            return False
        # The DOI prefix 10.1021/ is what marks the page as ACS here
        # (presumably the publisher's registered DOI prefix).
        return b'<meta name="dc.Identifier" scheme="doi" content="10.1021/' in fstring