# Source code for chemdataextractor.reader.acs

# -*- coding: utf-8 -*-
"""
Readers for documents from the ACS.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from .markup import HtmlReader
from ..scrape.clean import clean, Cleaner


#: Extra ``Cleaner`` applied to ACS HTML in addition to the shared ``clean``.
#: Its ``kill_xpath`` targets ``ul`` elements whose class is exactly "anchors"
#: and ``div`` elements whose class is exactly "citationLinks".
#: TODO: Move to ignore_css?
clean_acs_html = Cleaner(
    kill_xpath=(
        './/ul[@class="anchors"]'
        ' | '
        './/div[@class="citationLinks"]'
    ),
)


class AcsHtmlReader(HtmlReader):
    """Reader for HTML documents from the ACS.

    The class only supplies configuration (cleaners and CSS selectors) and a
    ``detect`` check; how these attributes are consumed is defined by
    ``HtmlReader``, which is not shown in this module.
    """

    # The shared ``clean`` cleaner followed by the ACS-specific one.
    cleaners = [clean, clean_acs_html]

    # Selectors for the main document structure.
    root_css = '#articleMain, article'
    title_css = 'h1.articleTitle'
    heading_css = 'h2, h3, h4, h5, h6, .title1, span.title2, span.title3'

    # Selectors for tables and their captions/footnotes.
    table_css = '.NLM_table-wrap'
    table_caption_css = '.NLM_caption'
    table_footnote_css = '.footnote'

    # Selectors for figures and their captions.
    figure_css = '.figure'
    figure_caption_css = '.caption'

    # Selector for bibliography entries.
    citation_css = '.reference'

    # Selector for elements to ignore: JavaScript-void links and superscripts
    # inside ``a.ref`` links.
    ignore_css = 'a[href="JavaScript:void(0);"], a.ref sup'

    def detect(self, fstring, fname=None):
        """Return whether the input looks like an ACS HTML document.

        :param fstring: File contents to inspect. It is searched for a byte
            string marker, so it is presumably ``bytes`` -- verify against
            callers.
        :param fname: Optional filename. When given and non-empty, it must end
            in ``.html`` or ``.htm`` for detection to succeed.
        :returns: ``True`` if the filename check passes and the contents
            include a ``dc.Identifier`` DOI meta tag whose DOI starts with
            ``10.1021/``; ``False`` otherwise.
        :rtype: bool
        """
        # Reject early on a non-HTML extension; a missing filename is not
        # grounds for rejection.
        if fname and not fname.endswith(('.html', '.htm')):
            return False
        # The DOI prefix in the meta tag is what this reader treats as the
        # marker of an ACS document.
        return b'<meta name="dc.Identifier" scheme="doi" content="10.1021/' in fstring