# Source code for chemdataextractor.reader.cssp

# -*- coding: utf-8 -*-
"""
Readers for ChemSpider SyntheticPages.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging

from ..doc.text import Footnote
from ..scrape.pub.rsc import replace_rsc_img_chars
from ..scrape.clean import clean
from .markup import HtmlReader


# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


class CsspHtmlReader(HtmlReader):
    """Reader for ChemSpider SyntheticPages HTML documents."""

    # CSS selectors describing the page layout. They are not used directly in
    # this class; presumably the HtmlReader base class consumes them -- verify
    # against the base class if changing them.
    root_css = '.article-container'
    title_css = '.article-container > h2'
    heading_css = 'h3, h4, h5, h6'
    citation_css = '#csm-article-part-lead_ref > p, #csm-article-part-other_refs > p'

    def _parse_table_footnotes(self, fns, refs, specials):
        """Override to account for awkward RSC table footnotes.

        Each footnote element is parsed with ``self._parse_text`` and the first
        resulting element is kept. An empty ``Footnote`` carrying the ``id``
        attribute of the element's preceding sibling is then added onto it.

        :param fns: Iterable of footnote elements; each must provide ``getprevious()``.
        :param refs: Passed through unchanged to ``self._parse_text``.
        :param specials: Passed through unchanged to ``self._parse_text``.
        :returns: List containing one combined footnote per element of ``fns``.
        """
        footnotes = []
        for fn in fns:
            footnote = self._parse_text(fn, refs=refs, specials=specials, element_cls=Footnote)[0]
            # The id is read from the sibling just before the footnote element.
            # A footnote that is the first child has no such sibling, so guard
            # against None instead of failing with AttributeError.
            # NOTE(review): assumes Footnote accepts id=None -- confirm.
            previous = fn.getprevious()
            anchor_id = previous.get('id') if previous is not None else None
            footnote += Footnote('', id=anchor_id)
            footnotes.append(footnote)
        return footnotes

    def detect(self, fstring, fname=None):
        """Return whether the input looks like a ChemSpider SyntheticPages HTML page.

        :param fstring: Contents of the file, as a byte string or as text.
        :param fname: Optional filename. When given, it must end with ``.html``
            or ``.htm`` (case-sensitive), otherwise detection fails immediately.
        :returns: ``True`` if the ChemSpider SyntheticPages ``DC.Publisher``
            meta tag text occurs in ``fstring``, ``False`` otherwise.
        """
        if fname and not fname.endswith(('.html', '.htm')):
            return False
        marker = b'meta name="DC.Publisher" content="ChemSpider SyntheticPages"'
        # Searching text for a bytes marker raises TypeError on Python 3, so
        # match the marker's type to the input. type(u'') is the text type on
        # both Python 2 and Python 3.
        if isinstance(fstring, type(u'')):
            marker = marker.decode('ascii')
        return marker in fstring