Source code for chemdataextractor.reader.nlm

# -*- coding: utf-8 -*-
"""
Readers for NLM Journal Archiving and Interchange DTD XML files (i.e. from PubMed Central).

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from ..scrape.clean import clean
from ..scrape.pub.nlm import tidy_nlm_references, space_labels
from .markup import XmlReader


class NlmXmlReader(XmlReader):
    """Reader for NLM XML documents.

    This subclass only supplies configuration (cleaners, CSS selectors and the
    set of inline tag names) plus a :meth:`detect` heuristic. How each
    attribute is consumed is defined by ``XmlReader``, which is not part of
    this module.
    """

    # Cleaning callables imported from the scrape package.
    # NOTE(review): presumably applied in this order by the base reader -- confirm in XmlReader.
    cleaners = [clean, tidy_nlm_references, space_labels]

    # CSS selectors. The attribute names indicate which document part each
    # selector targets; their exact use is up to the base reader.
    root_css = 'article'
    title_css = 'front article-meta article-title'
    heading_css = 'title'
    table_css = 'table-wrap'
    table_caption_css = 'caption p'
    table_head_row_css = 'table thead tr'
    table_body_row_css = 'table tbody tr'
    table_footnote_css = 'table-wrap-foot p'
    figure_css = 'fig'
    figure_caption_css = 'caption p'
    reference_css = 'xref'
    citation_css = 'ref-list ref'
    # Matches xref elements with ref-type="bibr" and tex-math elements.
    ignore_css = 'xref[ref-type="bibr"], tex-math'

    # Tag names listed as inline: common HTML inline tags, NLM/JATS formatting
    # tags, and MathML tags written as ``{namespace}tag``.
    # NOTE(review): the official MathML namespace URI is mixed-case
    # ("http://www.w3.org/1998/Math/MathML") while the entries below are all
    # lower-case -- confirm that tag names are lower-cased before they are
    # compared against this set, otherwise the MathML entries never match.
    inline_elements = {
        'b', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'cite', 'code',
        'dfn', 'em', 'kbd', 'strong', 'samp', 'var', 'a', 'bdo', 'br', 'img',
        'map', 'object', 'q', 'script', 'span', 'sub', 'sup', 'button',
        'input', 'label', 'select', 'textarea', 'blink', 'font', 'marquee',
        'nobr', 's', 'strike', 'u', 'wbr', 'xref', 'underline', 'italic',
        'bold', 'inline-formula', 'alternatives', 'tex-math',
        '{http://www.w3.org/1998/math/mathml}math',
        '{http://www.w3.org/1998/math/mathml}msubsup',
        '{http://www.w3.org/1998/math/mathml}mrow',
        '{http://www.w3.org/1998/math/mathml}mo',
        '{http://www.w3.org/1998/math/mathml}mi',
        '{http://www.w3.org/1998/math/mathml}mn',
    }

    def detect(self, fstring, fname=None):
        """Return True if the input looks like an NLM/JATS XML document.

        :param fstring: Raw document contents. The markers searched for are
            byte strings, so this is expected to be a byte string.
        :param fname: Optional file name. When given and non-empty, it must
            end in ``.xml`` or ``.nxml``; otherwise detection fails without
            inspecting ``fstring``.
        :returns: True if any known JATS marker occurs in ``fstring``,
            False otherwise.
        :rtype: bool
        """
        # A falsy fname (None or '') skips the extension check entirely.
        if fname and not fname.endswith(('.xml', '.nxml')):
            return False
        signatures = (
            b'xmlns="http://jats.nlm.nih.gov/ns/archiving',  # JATS archiving namespace declaration
            b'JATS-archivearticle1.dtd',  # DTD file name
            b'-//NLM//DTD JATS',  # DTD public identifier prefix
        )
        return any(signature in fstring for signature in signatures)