# -*- coding: utf-8 -*-
"""
Tools for scraping documents from NLM Journal Archiving and Interchange DTD XML files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re
from ...text.normalize import normalize
from ...text.processors import Chain, Discard
from ..clean import Cleaner
from ..entity import Entity
from ..fields import StringField, EntityField, UrlField, IntField
log = logging.getLogger(__name__)

# XPath alternatives passed as ``kill_xpath`` to every PMC cleaner below:
# display/inline formulae, MathML, bibliography/paragraph/footnote
# cross-references, footnotes and private-use characters. Kept in one place so
# the three cleaners cannot drift apart.
_PMC_KILL_XPATHS = (
    './/disp-formula',
    './/inline-formula',
    './/mml:math',
    './/xref[@ref-type="bibr"]',
    './/xref[@ref-type="p"]',
    './/xref[@ref-type="fn"]',
    './/fn',
    './/private-char',
)


def _pmc_cleaner(*extra_kill_xpaths):
    """Build a ``Cleaner`` with the shared PMC settings.

    :param extra_kill_xpaths: Additional XPath alternatives appended (joined
        with ``|``) to the shared ``kill_xpath`` expression.
    :returns: A new ``Cleaner`` with ``strip_xpath='.//*'``, the combined
        ``kill_xpath`` and the MathML ``mml`` namespace prefix bound.
    """
    return Cleaner(
        strip_xpath='.//*',
        kill_xpath='|'.join(_PMC_KILL_XPATHS + extra_kill_xpaths),
        # A fresh dict per cleaner, exactly as the three original literals were.
        namespaces={'mml': 'http://www.w3.org/1998/Math/MathML'},
    )


#: XML stripper that kills reference links, footnote links, equations, footnotes
strip_pmc_xml = _pmc_cleaner()

#: XML stripper that also kills headings
strip_pmc_abstract_xml = _pmc_cleaner('.//title')

#: XML stripper that also kills tables and figures
strip_pmc_paragraph_xml = _pmc_cleaner('.//table-wrap', './/fig')
def space_labels(document):
    """Ensure space around bold compound labels.

    Every ``<bold>`` descendant whose text is a parenthesised label such as
    ``(1)``, ``(12a)`` or ``(L3):`` (matched case-insensitively) gets a space
    guaranteed on both sides: the text immediately before it is made to end
    with a space and the text immediately after it is made to start with one.

    :param document: Element exposing ``xpath()``; the matched elements must
        expose ``getparent()``, ``getprevious()``, ``text`` and ``tail``
        (lxml-style element API).
    :returns: The same ``document`` object, modified in place.
    """
    # TODO: Make this more permissive to match chemical_label in parser
    # Raw string: '\(' and '\d' are not valid string escapes. Compiled once
    # rather than looked up on every loop iteration.
    label_pattern = re.compile(r'^\(L?\d\d?[a-z]?\):?$', re.I)
    for label in document.xpath('.//bold'):
        if not label.text or not label_pattern.match(label.text):
            continue
        # Text directly before the label lives either in the parent's ``text``
        # (label is the first child) or in the previous sibling's ``tail``.
        previous = label.getprevious()
        if previous is None:
            parent = label.getparent()
            text = parent.text or ''
            if not text.endswith(' '):
                parent.text = text + ' '
        else:
            text = previous.tail or ''
            if not text.endswith(' '):
                previous.tail = text + ' '
        # Text directly after the label is its ``tail``; the space belongs at
        # the START of it. (Checking/appending at the end of the tail, as was
        # done before, leaves "(1)was" without a separating space.)
        text = label.tail or ''
        if not text.startswith(' '):
            label.tail = ' ' + text
    return document
def tidy_nlm_references(document):
    """Remove punctuation around references like brackets, commas, hyphens.

    For every ``<xref ref-type="bibr">`` descendant of ``document``:

    * an opening ``[`` or ``(`` at the end of the text just before it is
      dropped (together with any whitespace after that bracket);
    * if the next sibling is also a bibliography ``xref`` and the text between
      them is only a comma, hyphen, en dash or minus sign, that text is removed;
    * a closing ``]`` or ``)`` at the start of the text just after it is
      dropped (together with any whitespace before that bracket).

    :param document: Element exposing ``xpath()``; the matched elements must
        expose ``getparent()``, ``getprevious()``, ``getnext()``, ``tag``,
        ``get()``, ``text`` and ``tail`` (lxml-style element API).
    :returns: The same ``document`` object, modified in place.
    """

    def strip_preceding(text):
        """Drop a trailing opening bracket; otherwise return text unchanged."""
        stext = text.rstrip()
        if stext.endswith(('[', '(')):
            return stext[:-1]
        return text

    def strip_between(text):
        """Blank out text that is only a list/range separator."""
        # ',', hyphen-minus, en dash (U+2013) and minus sign (U+2212).
        if text.strip() in {',', '-', '\u2013', '\u2212'}:
            return ''
        return text

    def strip_following(text):
        """Drop a leading closing bracket; otherwise return text unchanged."""
        stext = text.lstrip()
        if stext.startswith((']', ')')):
            return stext[1:]
        return text

    for ref in document.xpath('.//xref[@ref-type="bibr"]'):
        previous = ref.getprevious()
        # Named ``following`` rather than ``next`` to avoid shadowing the builtin.
        following = ref.getnext()
        # Text before the reference is the parent's ``text`` when the reference
        # is the first child, otherwise the previous sibling's ``tail``.
        if previous is None:
            parent = ref.getparent()
            parent.text = strip_preceding(parent.text or '')
        else:
            previous.tail = strip_preceding(previous.tail or '')
        if (following is not None and following.tag == 'xref'
                and following.get('ref-type') == 'bibr'):
            ref.tail = strip_between(ref.tail or '')
        ref.tail = strip_following(ref.tail or '')
    return document
class NlmXmlAuthor(Entity):
    """Author information from NLM XML file."""

    # XPaths are relative: they are evaluated against the element this entity
    # is built from (a ``contrib`` element when used via
    # ``NlmXmlDocument.authors``).
    givennames = StringField('./name/given-names/text()', xpath=True)
    lastname = StringField('./name/surname/text()', xpath=True)
    email = StringField('./email/text()', xpath=True, strip=True)

    # ``process_<field>`` hooks: presumably picked up by ``Entity`` and applied
    # to the scraped value of the matching field -- confirm against ``Entity``.
    process_givennames = normalize
    process_lastname = normalize
class NlmXmlImage(Entity):
    """Figure information from NLM XML file."""

    # XPaths are relative to the element this entity is built from
    # (presumably a ``fig`` element -- see the disabled ``figures`` field on
    # ``NlmXmlDocument``).
    label = StringField('./label', xpath=True)
    caption = StringField('./caption', xpath=True)
    # The element's ``id`` attribute.
    reference = StringField('@id', xpath=True, strip=True)

    # Caption clean-up: tidy the punctuation around bibliography references
    # first, then run the general PMC XML stripper. The order matters because
    # ``strip_pmc_xml`` kills the ``xref[@ref-type="bibr"]`` elements that
    # ``tidy_nlm_references`` looks for.
    clean_caption = Chain(tidy_nlm_references, strip_pmc_xml)
    process_caption = normalize
class NlmXmlTable(Entity):
    """Table information from NLM XML file."""

    # XPaths are relative to the element this entity is built from
    # (presumably a ``table-wrap`` element -- see the disabled ``tables``
    # field on ``NlmXmlDocument``).
    label = StringField('./label', xpath=True)
    caption = StringField('./caption', xpath=True)
    # The element's ``id`` attribute.
    # NOTE(review): ``NlmXmlImage.reference`` passes ``strip=True`` but this
    # one does not -- confirm whether the difference is intentional.
    reference = StringField('@id', xpath=True)
    # The selected element itself, requested with ``raw=True``.
    src = StringField('.', xpath=True, strip=True, raw=True)

    # Caption clean-up: tidy the punctuation around bibliography references
    # first, then run the general PMC XML stripper. The order matters because
    # ``strip_pmc_xml`` kills the ``xref[@ref-type="bibr"]`` elements that
    # ``tidy_nlm_references`` looks for.
    clean_caption = Chain(tidy_nlm_references, strip_pmc_xml)
    process_caption = normalize
class NlmXmlDocument(Entity):
    """Document information from a NLM XML file.

    All XPaths are absolute and rooted at the ``/article`` element.
    """

    # --- Identifiers -------------------------------------------------------
    # ui = StringField('/art/ui/text()', xpath=True, strip=True)
    doi = StringField('/article/front/article-meta/article-id[@pub-id-type="doi"]/text()', xpath=True, lower=True)
    pmid = IntField('/article/front/article-meta/article-id[@pub-id-type="pmid"]/text()', xpath=True)
    pmcid = IntField('/article/front/article-meta/article-id[@pub-id-type="pmc"]/text()', xpath=True)

    # --- Title and authors -------------------------------------------------
    # ``title`` selects the element (not ``text()``); it is cleaned by
    # ``clean_title`` below.
    title = StringField('/article/front/article-meta//article-title', xpath=True)
    authors = EntityField(NlmXmlAuthor, '/article/front/article-meta//contrib[@contrib-type="author"]', xpath=True, all=True)

    # --- Journal and citation details --------------------------------------
    journal_title = StringField('/article/front/journal-meta//journal-title/text()', xpath=True)
    # Union of the ``iso-abbrev`` and ``nlm-ta`` journal-id variants.
    journal_abbreviation = StringField('/article/front/journal-meta/journal-id[@journal-id-type="iso-abbrev"]/text()|/article/front/journal-meta/journal-id[@journal-id-type="nlm-ta"]/text()', xpath=True)
    publisher = StringField('/article/front/journal-meta//publisher-name/text()', xpath=True)
    volume = StringField('/article/front/article-meta/volume/text()', xpath=True)
    firstpage = StringField('/article/front/article-meta/fpage/text()', xpath=True)
    lastpage = StringField('/article/front/article-meta/lpage/text()', xpath=True)
    issue = StringField('/article/front/article-meta/issue/text()', xpath=True)
    issn = StringField('/article/front/journal-meta/issn/text()', xpath=True, all=True)
    coden = StringField('/article/front/journal-meta/journal-id[@journal-id-type="coden"]/text()', xpath=True, all=True)

    # --- Abstract ----------------------------------------------------------
    # Selects the element (not ``text()``); it is cleaned by
    # ``clean_abstract`` below.
    abstract = StringField('/article/front/article-meta/abstract', xpath=True)

    # --- Dates -------------------------------------------------------------
    # ``online_*`` read ``pub-date[@pub-type="epub"]``; ``published_*`` read
    # ``pub-date[@pub-type="ppub"]``.
    online_year = IntField('/article/front/article-meta/pub-date[@pub-type="epub"]/year/text()', xpath=True)
    online_month = IntField('/article/front/article-meta/pub-date[@pub-type="epub"]/month/text()', xpath=True)
    online_day = IntField('/article/front/article-meta/pub-date[@pub-type="epub"]/day/text()', xpath=True)
    published_year = IntField('/article/front/article-meta/pub-date[@pub-type="ppub"]/year/text()', xpath=True)
    published_month = IntField('/article/front/article-meta/pub-date[@pub-type="ppub"]/month/text()', xpath=True)
    published_day = IntField('/article/front/article-meta/pub-date[@pub-type="ppub"]/day/text()', xpath=True)
    # ``accepted_*`` / ``received_*`` read the matching ``history/date`` entry.
    accepted_year = IntField('/article/front/article-meta/history/date[@date-type="accepted"]/year/text()', xpath=True)
    accepted_month = IntField('/article/front/article-meta/history/date[@date-type="accepted"]/month/text()', xpath=True)
    accepted_day = IntField('/article/front/article-meta/history/date[@date-type="accepted"]/day/text()', xpath=True)
    received_year = IntField('/article/front/article-meta/history/date[@date-type="received"]/year/text()', xpath=True)
    received_month = IntField('/article/front/article-meta/history/date[@date-type="received"]/month/text()', xpath=True)
    received_day = IntField('/article/front/article-meta/history/date[@date-type="received"]/day/text()', xpath=True)

    # --- Licence -----------------------------------------------------------
    # Either the ``xlink:href`` on ``license`` itself or on an ``ext-link``
    # nested inside it.
    # NOTE(review): no namespace map is passed here, so the ``xlink`` prefix
    # must be resolved by the field/XPath machinery -- confirm.
    license = UrlField('/article/front/article-meta/permissions/license/@xlink:href|/article/front/article-meta/permissions/license//ext-link/@xlink:href', xpath=True)

    # --- Body content (currently disabled) ---------------------------------
    # figures = EntityField(NlmXmlImage, '/article/body//fig', xpath=True, all=True)
    # tables = EntityField(NlmXmlTable, '/article/body//table-wrap', xpath=True, all=True)
    # headings = StringField('/article/body//sec/title', xpath=True, all=True)
    # paragraphs = StringField('/article/body//sec/p', xpath=True, all=True)

    # --- Cleaners ----------------------------------------------------------
    # The abstract uses the cleaner that additionally kills ``title`` elements.
    clean_title = strip_pmc_xml
    clean_abstract = strip_pmc_abstract_xml
    # clean_headings = Chain(tidy_nlm_references, strip_pmc_xml)
    # clean_paragraphs = Chain(tidy_nlm_references, strip_pmc_paragraph_xml)

    # --- Processors --------------------------------------------------------
    process_title = normalize
    process_publisher = normalize
    process_abstract = normalize
    # process_headings = normalize
    # process_paragraphs = Chain(normalize, Discard(''))