# Source code for chemdataextractor.reader.cssp

# -*- coding: utf-8 -*-
"""
Readers for ChemSpider SyntheticPages.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging

from ..doc.text import Footnote
from ..scrape.pub.rsc import replace_rsc_img_chars
from ..scrape.clean import clean
from .markup import HtmlReader


# Module-level logger, named after this module's import path (__name__).
log = logging.getLogger(__name__)


class CsspHtmlReader(HtmlReader):
    """Reader for ChemSpider SyntheticPages HTML documents."""

    # CSS selectors overriding attributes of the HtmlReader base class.
    # NOTE(review): presumably these locate the document root, the title, the
    # section headings and the citation paragraphs respectively -- confirm
    # against HtmlReader, which is defined outside this file.
    root_css = '.article-container'
    title_css = '.article-container > h2'
    heading_css = 'h3, h4, h5, h6'
    citation_css = '#csm-article-part-lead_ref > p, #csm-article-part-other_refs > p'

    def _parse_table_footnotes(self, fns, refs, specials):
        """Override to account for awkward RSC table footnotes.

        Each element in ``fns`` is parsed with ``self._parse_text`` (using
        ``Footnote`` as the element class) and only the first parsed result is
        kept. An empty ``Footnote`` carrying the ``id`` attribute of the
        element immediately preceding the footnote element is then added to it.

        :param fns: Iterable of footnote elements to parse.
        :param refs: Passed through unchanged to ``self._parse_text``.
        :param specials: Passed through unchanged to ``self._parse_text``.
        :returns: List with one combined footnote per element in ``fns``.
        """
        footnotes = []
        for fn in fns:
            footnote = self._parse_text(fn, refs=refs, specials=specials, element_cls=Footnote)[0]
            # The id is read from the preceding sibling rather than from the
            # footnote element itself.
            # NOTE(review): this assumes every footnote element has a preceding
            # sibling; if getprevious() returns None this line raises
            # AttributeError -- confirm against real documents.
            footnote += Footnote('', id=fn.getprevious().get('id'))
            footnotes.append(footnote)
        return footnotes

    def detect(self, fstring, fname=None):
        """Return whether the input looks like a ChemSpider SyntheticPages HTML document.

        :param fstring: Raw document content. It is searched for a bytes
            marker, so it is assumed to be a byte string.
        :param fname: Optional filename. When given and non-empty, it must end
            with ``.html`` or ``.htm`` (case-sensitive) for detection to proceed.
        :returns: True if the filename check passes and the ChemSpider
            SyntheticPages ``DC.Publisher`` meta tag is present in ``fstring``,
            otherwise False.
        """
        # Reject on extension first so the content scan is skipped when possible.
        if fname and not fname.endswith(('.html', '.htm')):
            return False
        return b'meta name="DC.Publisher" content="ChemSpider SyntheticPages"' in fstring