# Source code for chemdataextractor.reader.nlm

# -*- coding: utf-8 -*-
"""
Readers for NLM Journal Archiving and Interchange DTD XML files. (i.e. from PubMed Central)

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from ..scrape.clean import clean
from ..scrape.pub.nlm import tidy_nlm_references, space_labels
from .markup import XmlReader


class NlmXmlReader(XmlReader):
    """Reader for NLM XML documents.

    The class body only sets configuration attributes (cleaners, CSS
    selectors and the set of inline tag names) and defines :meth:`detect`.
    How each attribute is consumed is decided by ``XmlReader``, which is not
    defined in this module.
    """

    # Callables imported from the scrape package; presumably applied to the
    # parsed document by the base reader before extraction -- confirm in
    # XmlReader.
    cleaners = [clean, tidy_nlm_references, space_labels]

    # CSS selectors naming the NLM/JATS elements of interest. The attribute
    # names indicate the intended role of each selector; the lookup itself
    # is presumably done by the base reader.
    root_css = 'article'
    title_css = 'front article-meta article-title'
    heading_css = 'title'
    table_css = 'table-wrap'
    table_caption_css = 'caption p'
    table_head_row_css = 'table thead tr'
    table_body_row_css = 'table tbody tr'
    table_footnote_css = 'table-wrap-foot p'
    figure_css = 'fig'
    figure_caption_css = 'caption p'
    reference_css = 'xref'
    citation_css = 'ref-list ref'
    ignore_css = 'xref[ref-type="bibr"], tex-math'

    # Tag names listed as inline: common HTML inline tags, NLM-specific
    # inline tags, and MathML tags written in ``{namespace}tag`` form.
    # NOTE(review): the MathML namespace URI is all lower-case here, so the
    # base reader presumably lower-cases tag names before the membership
    # test -- confirm before "correcting" the casing.
    inline_elements = {
        'b', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'cite', 'code', 'dfn', 'em', 'kbd', 'strong', 'samp', 'var',
        'a', 'bdo', 'br', 'img', 'map', 'object', 'q', 'script', 'span', 'sub', 'sup', 'button', 'input', 'label',
        'select', 'textarea', 'blink', 'font', 'marquee', 'nobr', 's', 'strike', 'u', 'wbr',
        'xref', 'underline', 'italic', 'bold', 'inline-formula', 'alternatives', 'tex-math',
        '{http://www.w3.org/1998/math/mathml}math', '{http://www.w3.org/1998/math/mathml}msubsup',
        '{http://www.w3.org/1998/math/mathml}mrow', '{http://www.w3.org/1998/math/mathml}mo',
        '{http://www.w3.org/1998/math/mathml}mi', '{http://www.w3.org/1998/math/mathml}mn'
    }

    def detect(self, fstring, fname=None):
        """Return True if the input looks like an NLM/JATS archiving XML file.

        :param fstring: Raw file contents. The markers searched for are byte
            strings, so this is expected to be a byte string as well.
        :param fname: Optional file name. When given and non-empty it must
            end in ``.xml`` or ``.nxml`` (case-sensitive), otherwise the
            input is rejected without inspecting ``fstring``.
        :returns: True when any JATS marker occurs in ``fstring``, else False.
        """
        # A file name with any other extension rules the file out up front.
        if fname and not fname.endswith(('.xml', '.nxml')):
            return False
        # Any one of these byte sequences is enough to accept the file.
        jats_markers = (
            b'xmlns="http://jats.nlm.nih.gov/ns/archiving',
            b'JATS-archivearticle1.dtd',
            b'-//NLM//DTD JATS',
        )
        return any(marker in fstring for marker in jats_markers)