# -*- coding: utf-8 -*-
"""
Tools for scraping documents from Springer, Biomed Central and Chemistry Central XML files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
from ...text.normalize import normalize
from ...text.processors import Chain, LStrip, RStrip, Discard, RAdd
from ..clean import Cleaner
from ..entity import Entity, DocumentEntity
from ..fields import StringField, EntityField, UrlField, IntField
log = logging.getLogger(__name__)

#: Cleaner for general Springer XML content: strips the markup of every
#: descendant element and kills display/inline formulas, MathML ``math``
#: elements and ``abbrgrp`` reference groups.
strip_springer_xml = Cleaner(
    strip_xpath='.//*',
    kill_xpath='.//display-formula|.//inline-formula|.//m:math|.//abbrgrp',
    namespaces={'m': 'http://www.w3.org/1998/Math/MathML'}
)

#: Cleaner for abstracts: same as :data:`strip_springer_xml`, but additionally
#: kills ``st`` (section heading) elements.
strip_springer_abstract_xml = Cleaner(
    strip_xpath='.//*',
    kill_xpath='.//display-formula|.//inline-formula|.//m:math|.//abbrgrp|.//st',
    namespaces={'m': 'http://www.w3.org/1998/Math/MathML'}
)
def tidy_springer_references(document):
    """Remove punctuation around references like brackets, commas, hyphens.

    Every ``abbrgrp`` descendant of ``document`` is visited and the text around
    it is edited in place:

    * a ``[`` or ``(`` that ends the text immediately before the element is
      removed, together with any whitespace that followed it;
    * when the next sibling is also an ``abbrgrp``, a tail consisting solely of
      ``,``, ``-`` or an en dash (ignoring surrounding whitespace) is emptied;
    * a ``]`` or ``)`` that starts the element's tail is removed, together with
      any whitespace that preceded it.

    Text that does not match one of these patterns is left untouched, except
    that a missing (``None``) text/tail is normalised to an empty string.

    :param document: Element supporting the lxml element API used here
        (``xpath``, ``getparent``, ``getprevious``, ``getnext``, ``text``,
        ``tail``).
    :returns: The same ``document`` object, modified in place.
    """
    opening = ('[', '(')
    closing = (']', ')')
    separators = {',', '-', '\u2013'}

    def strip_preceding(text):
        """Drop an opening bracket (and trailing whitespace) that ends ``text``."""
        stripped = text.rstrip()
        if stripped.endswith(opening):
            return stripped[:-1]
        return text

    def strip_between(text):
        """Empty ``text`` if it is nothing but a separator between two references."""
        if text.strip() in separators:
            return ''
        return text

    def strip_following(text):
        """Drop a closing bracket (and leading whitespace) that starts ``text``."""
        stripped = text.lstrip()
        if stripped.startswith(closing):
            return stripped[1:]
        return text

    for ref in document.xpath('.//abbrgrp'):
        preceding = ref.getprevious()
        following = ref.getnext()
        if preceding is None:
            # No earlier sibling: the text before ``ref`` is the parent's ``text``.
            parent = ref.getparent()
            parent.text = strip_preceding(parent.text or '')
        else:
            # Otherwise it is the tail of the sibling just before ``ref``.
            preceding.tail = strip_preceding(preceding.tail or '')
        if following is not None and following.tag == 'abbrgrp':
            ref.tail = strip_between(ref.tail or '')
        ref.tail = strip_following(ref.tail or '')
    return document
class SpringerHtmlDocument(DocumentEntity):
    """Scraper for Springer HTML articles.

    Title and journal are read from ``<meta>`` tags using CSS-style selectors;
    the abstract is read from paragraphs selected with an XPath expression.
    """

    # TODO: Tables and Figures

    # Active fields.
    title = StringField('meta[property="og:title"]::attr("content")')
    abstract = StringField('//section[@class="Abstract"]/p | //div[@class="AbstractSection"]/p ', xpath=True)
    journal = StringField('meta[name="citation_journal_title"]::attr("content")')

    # Candidate fields that are not enabled yet.
    #copyright = StringField('//div[@class="ArticleCopyright"]', xpath=True)
    # headings = StringField('a[href^="#Sec"]::attr("title")', all=True)
    #paragraphs = StringField('//div[@class="content"]/p[@class="Para"]', xpath=True, all=True)
    #figures = EntityField(ElsevierXmlImage, 'figure', all=True)
    #citations = StringField('//li[@class="Citation"]', xpath=True, all=True)

    # NOTE(review): no ``html_url`` field is declared in this class, so this
    # presumably post-processes a field inherited from DocumentEntity by adding
    # '.html' on the right -- confirm against ..entity and RAdd.
    process_html_url = RAdd('.html')

    #clean_headings = clean_springer_whitespace
    #clean_paragraphs = clean_springer_whitespace
class SpringerXmlAuthor(Entity):
    """Author information from a Springer XML file.

    All XPath expressions are relative to the author element this entity is
    built from.
    """

    # Name parts, read from child elements of the author element.
    firstname = StringField('./fnm', xpath=True, strip=True)
    # Middle name may appear as either ``mnm`` or ``mi``.
    middlename = StringField('./mnm|./mi', xpath=True, strip=True)
    lastname = StringField('./snm', xpath=True, strip=True)
    suffix = StringField('./suf', xpath=True, strip=True)
    email = StringField('./email', xpath=True, strip=True)

    # NOTE(review): presumably drops e-mail values that are the empty string --
    # confirm against Discard in text.processors.
    process_email = Discard('')
class SpringerXmlImage(Entity):
    """Figure information from a Springer XML file.

    All XPath expressions are relative to the figure (or scheme) element this
    entity is built from.
    """

    label = StringField('./title', xpath=True, strip=True)
    caption = StringField('./text', xpath=True, strip=True)
    # The element's own ``id`` attribute.
    reference = StringField('@id', xpath=True, strip=True)

    # Caption hooks: the caption is cleaned with the general Springer XML
    # cleaner and its text is passed through ``normalize``.
    clean_caption = strip_springer_xml
    process_caption = normalize
class SpringerXmlTable(Entity):
    """Table information from a Springer XML file.

    All XPath expressions are relative to the table element this entity is
    built from.
    """

    label = StringField('./title', xpath=True, strip=True)
    caption = StringField('./caption', xpath=True, strip=True)
    # The element's own ``id`` attribute.
    reference = StringField('@id', xpath=True, strip=True)
    # The table element itself, requested with ``raw=True``.
    # NOTE(review): presumably yields the unprocessed source markup -- confirm
    # against StringField in ..fields.
    src = StringField('.', xpath=True, strip=True, raw=True)

    # Caption hooks: the caption is cleaned with the general Springer XML
    # cleaner and its text is passed through ``normalize``.
    clean_caption = strip_springer_xml
    process_caption = normalize
class SpringerXmlDocument(Entity):
    """Document information from a Springer XML file.

    All XPath expressions are absolute and rooted at the ``art`` element:
    bibliographic data and history come from ``/art/fm``, body content
    (figures, schemes, tables, headings, paragraphs) from ``/art/bdy``.
    """

    # Identifiers and bibliographic metadata (``/art/ui`` and ``/art/fm/bibl``).
    ui = StringField('/art/ui/text()', xpath=True, strip=True)
    doi = StringField('/art/fm/bibl/xrefbib//pubid[@idtype="doi"]/text()', xpath=True, lower=True)
    title = StringField('/art/fm/bibl/title', xpath=True, strip=True)
    authors = EntityField(SpringerXmlAuthor, '/art/fm/bibl/aug/au', xpath=True, all=True)
    journal = StringField('/art/fm/bibl/source/text()', xpath=True, strip=True)
    firstpage = StringField('/art/fm/bibl/fpage/text()', xpath=True, strip=True)
    year = IntField('/art/fm/bibl/pubdate/text()', xpath=True)
    volume = StringField('/art/fm/bibl/volume/text()', xpath=True, strip=True)
    issue = StringField('/art/fm/bibl/issue/text()', xpath=True, strip=True)
    issn = StringField('/art/fm/bibl/issn/text()', xpath=True, strip=True)
    landing_url = UrlField('/art/fm/bibl/url/text()', xpath=True)

    # Abstract: paragraphs inside abstract sections, or the ``abs`` element itself.
    abstract = StringField('/art/fm/abs/sec/p|/art/fm/abs', xpath=True, strip=True)

    # Publication history (``/art/fm/history``): published, accepted, received.
    published_year = IntField('/art/fm/history/pub/date/year/text()', xpath=True)
    published_month = IntField('/art/fm/history/pub/date/month/text()', xpath=True)
    published_day = IntField('/art/fm/history/pub/date/day/text()', xpath=True)
    accepted_year = IntField('/art/fm/history/acc/date/year/text()', xpath=True)
    accepted_month = IntField('/art/fm/history/acc/date/month/text()', xpath=True)
    accepted_day = IntField('/art/fm/history/acc/date/day/text()', xpath=True)
    received_year = IntField('/art/fm/history/rec/date/year/text()', xpath=True)
    received_month = IntField('/art/fm/history/rec/date/month/text()', xpath=True)
    received_day = IntField('/art/fm/history/rec/date/day/text()', xpath=True)

    # License URL taken from the copyright note.
    license = UrlField('/art/fm/cpyrt/note/url/text()', xpath=True, strip=True)

    # Body content (``/art/bdy``). Schemes reuse the figure entity.
    figures = EntityField(SpringerXmlImage, '/art/bdy//fig', xpath=True, all=True)
    schemes = EntityField(SpringerXmlImage, '/art/bdy//scheme', xpath=True, all=True)
    tables = EntityField(SpringerXmlTable, '/art/bdy//tbl|/art/bdy//table', xpath=True, all=True)
    headings = StringField('/art/bdy//st', xpath=True, strip=True, all=True)
    paragraphs = StringField('/art/bdy//sec/p', xpath=True, strip=True, all=True)

    # Cleaning hooks. ``tidy_springer_references`` is listed before the XML
    # cleaners because it works on the ``abbrgrp`` elements that those
    # cleaners' ``kill_xpath`` targets.
    clean_title = strip_springer_xml
    clean_abstract = Chain(tidy_springer_references, strip_springer_abstract_xml)
    clean_headings = strip_springer_xml
    clean_paragraphs = Chain(tidy_springer_references, strip_springer_xml)

    # Text post-processing hooks.
    process_abstract = normalize
    process_headings = normalize
    # NOTE(review): presumably normalises each paragraph and then drops the
    # ones that end up empty -- confirm against Chain/Discard in text.processors.
    process_paragraphs = Chain(normalize, Discard(''))
    # NOTE(review): presumably removes a '(' prefix and a ')' suffix around
    # the license URL -- confirm against LStrip/RStrip.
    process_license = Chain(LStrip('('), RStrip(')'))