# Source code for chemdataextractor.scrape.pub.nlm

# -*- coding: utf-8 -*-
"""
Tools for scraping documents from NLM Journal Archiving and Interchange DTD XML files.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from ...text.normalize import normalize
from ...text.processors import Chain, Discard
from ..clean import Cleaner
from ..entity import Entity
from ..fields import StringField, EntityField, UrlField, IntField

log = logging.getLogger(__name__)


#: XML stripper that kills reference links, footnote links, equations, footnotes
strip_pmc_xml = Cleaner(strip_xpath='.//*', kill_xpath='.//disp-formula|.//inline-formula|.//mml:math|.//xref[@ref-type="bibr"]|.//xref[@ref-type="p"]|.//xref[@ref-type="fn"]|.//fn|.//private-char', namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'})
#: XML stripper that also kills headings
strip_pmc_abstract_xml = Cleaner(strip_xpath='.//*', kill_xpath='.//disp-formula|.//inline-formula|.//mml:math|.//xref[@ref-type="bibr"]|.//xref[@ref-type="p"]|.//xref[@ref-type="fn"]|.//fn|.//private-char|.//title', namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'})
#: XML stripper that also kills tables and figures
strip_pmc_paragraph_xml = Cleaner(strip_xpath='.//*', kill_xpath='.//disp-formula|.//inline-formula|.//mml:math|.//xref[@ref-type="bibr"]|.//xref[@ref-type="p"]|.//xref[@ref-type="fn"]|.//fn|.//private-char|.//table-wrap|.//fig', namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'})


# Bold text that looks like a bracketed compound label, e.g. "(1)", "(12a)",
# "(L3):". Compiled once (case-insensitively) instead of on every iteration.
# TODO: Make this more permissive to match chemical_label in parser
_COMPOUND_LABEL_RE = re.compile(r'^\(L?\d\d?[a-z]?\):?$', re.I)


def space_labels(document):
    """Ensure space around bold compound labels.

    Every ``bold`` element whose text looks like a bracketed compound label
    is separated from its neighbouring text by a single space: the text
    immediately before the element is made to end with a space and the text
    immediately after it (its tail) is made to start with one.

    :param document: Element tree node exposing ``xpath``; the matched
        elements must expose ``text``, ``tail``, ``getparent`` and
        ``getprevious``.
    :returns: The same ``document`` object, modified in place.
    """
    for label in document.xpath('.//bold'):
        if not label.text or not _COMPOUND_LABEL_RE.match(label.text):
            continue

        # Text preceding the label lives either in the parent's ``text`` (when
        # the label is the first child) or in the previous sibling's ``tail``.
        previous = label.getprevious()
        if previous is None:
            parent = label.getparent()
            before = parent.text or ''
            if not before.endswith(' '):
                parent.text = before + ' '
        else:
            before = previous.tail or ''
            if not before.endswith(' '):
                previous.tail = before + ' '

        # Text following the label is its own ``tail``. The separating space
        # has to sit at the *start* of that text, directly after the label;
        # padding the end of the tail would leave label and text fused.
        after = label.tail or ''
        if not after.startswith(' '):
            label.tail = ' ' + after
    return document
def tidy_nlm_references(document):
    """Remove punctuation around references like brackets, commas, hyphens.

    For every ``xref`` element with ``ref-type="bibr"``:

    * an opening ``[`` or ``(`` at the end of the text directly before the
      reference is dropped (together with any whitespace after it);
    * when the next sibling is another bibliographic ``xref``, a tail made up
      solely of a comma, hyphen, en dash or minus sign is emptied;
    * a closing ``]`` or ``)`` at the start of the text directly after the
      reference is dropped (together with any whitespace before it).

    :param document: Element tree node exposing ``xpath``; the matched
        elements must expose ``tail``, ``getparent``, ``getprevious`` and
        ``getnext``.
    :returns: The same ``document`` object, modified in place.
    """

    def strip_preceding(text):
        # Drop a trailing opening bracket; otherwise leave the text untouched.
        stripped = text.rstrip()
        if stripped.endswith(('[', '(')):
            return stripped[:-1]
        return text

    def strip_between(text):
        # Drop a lone separator (comma, hyphen, en dash, minus sign).
        if text.strip() in {',', '-', '\u2013', '\u2212'}:
            return ''
        return text

    def strip_following(text):
        # Drop a leading closing bracket; otherwise leave the text untouched.
        stripped = text.lstrip()
        if stripped.startswith((']', ')')):
            return stripped[1:]
        return text

    for ref in document.xpath('.//xref[@ref-type="bibr"]'):
        previous = ref.getprevious()
        following = ref.getnext()  # not named ``next``: avoids shadowing the builtin

        # Text before the reference lives in the parent's ``text`` for a first
        # child, otherwise in the previous sibling's ``tail``.
        if previous is None:
            parent = ref.getparent()
            parent.text = strip_preceding(parent.text or '')
        else:
            previous.tail = strip_preceding(previous.tail or '')

        if (
            following is not None
            and following.tag == 'xref'
            and following.get('ref-type') == 'bibr'
        ):
            ref.tail = strip_between(ref.tail or '')
        ref.tail = strip_following(ref.tail or '')
    return document
class NlmXmlAuthor(Entity):
    """Author information from NLM XML file.

    All XPaths are relative to the element the entity is built from.
    """

    # Name parts are read from the nested <name> element.
    givennames = StringField('./name/given-names/text()', xpath=True)
    lastname = StringField('./name/surname/text()', xpath=True)
    # E-mail address, extracted with strip=True.
    email = StringField('./email/text()', xpath=True, strip=True)

    # NOTE(review): ``process_<field>`` attributes are presumably picked up by
    # name by ``Entity`` as post-processing hooks - confirm against Entity.
    process_givennames = normalize
    process_lastname = normalize
class NlmXmlImage(Entity):
    """Figure information from NLM XML file.

    All XPaths are relative to the element the entity is built from.
    """

    label = StringField('./label', xpath=True)
    caption = StringField('./caption', xpath=True)
    # The element's own ``id`` attribute, extracted with strip=True.
    reference = StringField('@id', xpath=True, strip=True)

    # Caption pipeline: tidy punctuation around bibliographic references, then
    # run the general PMC XML stripper, then normalize the text.
    # NOTE(review): ``clean_<field>`` / ``process_<field>`` are presumably
    # hooks that ``Entity`` looks up by name - confirm against Entity.
    clean_caption = Chain(tidy_nlm_references, strip_pmc_xml)
    process_caption = normalize
class NlmXmlTable(Entity):
    """Table information from NLM XML file.

    All XPaths are relative to the element the entity is built from.
    """

    label = StringField('./label', xpath=True)
    caption = StringField('./caption', xpath=True)
    # The element's own ``id`` attribute.
    reference = StringField('@id', xpath=True)
    # The element itself, requested with raw=True.
    src = StringField('.', xpath=True, strip=True, raw=True)

    # Caption pipeline: tidy punctuation around bibliographic references, then
    # run the general PMC XML stripper, then normalize the text.
    # NOTE(review): ``clean_<field>`` / ``process_<field>`` are presumably
    # hooks that ``Entity`` looks up by name - confirm against Entity.
    clean_caption = Chain(tidy_nlm_references, strip_pmc_xml)
    process_caption = normalize
class NlmXmlDocument(Entity):
    """Document information from a NLM XML file.

    Every field is addressed with an absolute XPath rooted at ``/article``.
    Bibliographic metadata comes from ``front/article-meta`` and journal-level
    metadata from ``front/journal-meta``.
    """

    # ui = StringField('/art/ui/text()', xpath=True, strip=True)

    # --- Article identifiers -------------------------------------------------
    doi = StringField(
        '/article/front/article-meta/article-id[@pub-id-type="doi"]/text()',
        xpath=True,
        lower=True,
    )
    pmid = IntField(
        '/article/front/article-meta/article-id[@pub-id-type="pmid"]/text()',
        xpath=True,
    )
    pmcid = IntField(
        '/article/front/article-meta/article-id[@pub-id-type="pmc"]/text()',
        xpath=True,
    )

    # --- Title and authors ---------------------------------------------------
    title = StringField('/article/front/article-meta//article-title', xpath=True)
    authors = EntityField(
        NlmXmlAuthor,
        '/article/front/article-meta//contrib[@contrib-type="author"]',
        xpath=True,
        all=True,
    )

    # --- Journal and citation details ----------------------------------------
    journal_title = StringField(
        '/article/front/journal-meta//journal-title/text()',
        xpath=True,
    )
    # Union of the "iso-abbrev" and "nlm-ta" journal ids.
    journal_abbreviation = StringField(
        '/article/front/journal-meta/journal-id[@journal-id-type="iso-abbrev"]/text()|/article/front/journal-meta/journal-id[@journal-id-type="nlm-ta"]/text()',
        xpath=True,
    )
    publisher = StringField(
        '/article/front/journal-meta//publisher-name/text()',
        xpath=True,
    )
    volume = StringField('/article/front/article-meta/volume/text()', xpath=True)
    firstpage = StringField('/article/front/article-meta/fpage/text()', xpath=True)
    lastpage = StringField('/article/front/article-meta/lpage/text()', xpath=True)
    issue = StringField('/article/front/article-meta/issue/text()', xpath=True)
    issn = StringField(
        '/article/front/journal-meta/issn/text()',
        xpath=True,
        all=True,
    )
    coden = StringField(
        '/article/front/journal-meta/journal-id[@journal-id-type="coden"]/text()',
        xpath=True,
        all=True,
    )

    # --- Abstract ------------------------------------------------------------
    abstract = StringField('/article/front/article-meta/abstract', xpath=True)

    # --- Electronic publication date (pub-type="epub") -----------------------
    online_year = IntField(
        '/article/front/article-meta/pub-date[@pub-type="epub"]/year/text()',
        xpath=True,
    )
    online_month = IntField(
        '/article/front/article-meta/pub-date[@pub-type="epub"]/month/text()',
        xpath=True,
    )
    online_day = IntField(
        '/article/front/article-meta/pub-date[@pub-type="epub"]/day/text()',
        xpath=True,
    )

    # --- Print publication date (pub-type="ppub") ----------------------------
    published_year = IntField(
        '/article/front/article-meta/pub-date[@pub-type="ppub"]/year/text()',
        xpath=True,
    )
    published_month = IntField(
        '/article/front/article-meta/pub-date[@pub-type="ppub"]/month/text()',
        xpath=True,
    )
    published_day = IntField(
        '/article/front/article-meta/pub-date[@pub-type="ppub"]/day/text()',
        xpath=True,
    )

    # --- History: accepted date ----------------------------------------------
    accepted_year = IntField(
        '/article/front/article-meta/history/date[@date-type="accepted"]/year/text()',
        xpath=True,
    )
    accepted_month = IntField(
        '/article/front/article-meta/history/date[@date-type="accepted"]/month/text()',
        xpath=True,
    )
    accepted_day = IntField(
        '/article/front/article-meta/history/date[@date-type="accepted"]/day/text()',
        xpath=True,
    )

    # --- History: received date ----------------------------------------------
    received_year = IntField(
        '/article/front/article-meta/history/date[@date-type="received"]/year/text()',
        xpath=True,
    )
    received_month = IntField(
        '/article/front/article-meta/history/date[@date-type="received"]/month/text()',
        xpath=True,
    )
    received_day = IntField(
        '/article/front/article-meta/history/date[@date-type="received"]/day/text()',
        xpath=True,
    )

    # --- Licence -------------------------------------------------------------
    # Union of the href on <license> itself and on any <ext-link> inside it.
    license = UrlField(
        '/article/front/article-meta/permissions/license/@xlink:href|/article/front/article-meta/permissions/license//ext-link/@xlink:href',
        xpath=True,
    )

    # --- Body-level fields (currently disabled) ------------------------------
    # figures = EntityField(NlmXmlImage, '/article/body//fig', xpath=True, all=True)
    # tables = EntityField(NlmXmlTable, '/article/body//table-wrap', xpath=True, all=True)
    # headings = StringField('/article/body//sec/title', xpath=True, all=True)
    # paragraphs = StringField('/article/body//sec/p', xpath=True, all=True)

    # NOTE(review): ``clean_<field>`` / ``process_<field>`` are presumably
    # hooks that ``Entity`` looks up by name - confirm against Entity.
    clean_title = strip_pmc_xml
    clean_abstract = strip_pmc_abstract_xml
    # clean_headings = Chain(tidy_nlm_references, strip_pmc_xml)
    # clean_paragraphs = Chain(tidy_nlm_references, strip_pmc_paragraph_xml)

    process_title = normalize
    process_publisher = normalize
    process_abstract = normalize
    # process_headings = normalize
    # process_paragraphs = Chain(normalize, Discard(''))