# Source code for chemdataextractor.scrape.pub.springer

# -*- coding: utf-8 -*-
"""
Tools for scraping documents from Springer, Biomed Central and Chemistry Central XML files.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging

from ...text.normalize import normalize
from ...text.processors import Chain, LStrip, RStrip, Discard, RAdd
from ..clean import Cleaner
from ..entity import Entity, DocumentEntity
from ..fields import StringField, EntityField, UrlField, IntField

log = logging.getLogger(__name__)


#: Elements killed by both cleaners: display/inline formulas, MathML and ``abbrgrp`` reference groups.
_SPRINGER_KILL_XPATH = './/display-formula|.//inline-formula|.//m:math|.//abbrgrp'
#: Namespace URI bound to the ``m`` prefix used in the kill XPaths.
_MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'

#: XML stripper that also kills equations/formulas.
strip_springer_xml = Cleaner(
    strip_xpath='.//*',
    kill_xpath=_SPRINGER_KILL_XPATH,
    namespaces={'m': _MATHML_NAMESPACE},
)
#: XML stripper for abstracts that additionally kills ``st`` headings.
strip_springer_abstract_xml = Cleaner(
    strip_xpath='.//*',
    kill_xpath=_SPRINGER_KILL_XPATH + '|.//st',
    namespaces={'m': _MATHML_NAMESPACE},
)


def tidy_springer_references(document):
    """Remove punctuation around references like brackets, commas, hyphens.

    Every ``abbrgrp`` element found below ``document`` is inspected and the
    text immediately around it is edited in place:

    * an opening ``[`` or ``(`` that ends the text directly before the element
      is removed (together with any whitespace that followed the bracket);
    * when the next sibling is another ``abbrgrp``, a tail consisting only of
      ``,``, ``-`` or an en dash (plus whitespace) is emptied;
    * a closing ``]`` or ``)`` that starts the text directly after the element
      is removed (together with any whitespace that preceded the bracket).

    The ``abbrgrp`` elements themselves are left in the tree.

    :param document: Element exposing ``xpath()`` whose descendants expose
        ``getparent()``, ``getprevious()``, ``getnext()``, ``tag``, ``text``
        and ``tail`` (the lxml element API).
    :returns: The same ``document`` object, modified in place.
    """

    def strip_preceding(text):
        # Drop a trailing opening bracket; otherwise return the text untouched
        # (including its original trailing whitespace).
        stripped = text.rstrip()
        if stripped.endswith(('[', '(')):
            return stripped[:-1]
        return text

    def strip_between(text):
        # Text made up solely of a list/range separator becomes empty.
        if text.strip() in {',', '-', '\u2013'}:
            return ''
        return text

    def strip_following(text):
        # Drop a leading closing bracket; otherwise return the text untouched.
        stripped = text.lstrip()
        if stripped.startswith((']', ')')):
            return stripped[1:]
        return text

    for ref in document.xpath('.//abbrgrp'):
        previous = ref.getprevious()
        following = ref.getnext()

        # The text in front of the reference lives either in the parent's
        # ``text`` (reference is the first child) or in the previous sibling's
        # ``tail``.
        if previous is None:
            parent = ref.getparent()
            parent.text = strip_preceding(parent.text or '')
        else:
            previous.tail = strip_preceding(previous.tail or '')

        tail = ref.tail or ''
        if following is not None and following.tag == 'abbrgrp':
            tail = strip_between(tail)
        ref.tail = strip_following(tail)
    return document


class SpringerHtmlDocument(DocumentEntity):
    """Scraper for Springer HTML articles.

    Only the title, abstract and journal name are extracted. The title and
    journal come from ``<meta>`` tags; the abstract comes from the paragraphs
    of the ``Abstract`` section / ``AbstractSection`` div.
    """
    # TODO: Tables and Figures
    title = StringField('meta[property="og:title"]::attr("content")')
    abstract = StringField('//section[@class="Abstract"]/p | //div[@class="AbstractSection"]/p ', xpath=True)
    journal = StringField('meta[name="citation_journal_title"]::attr("content")')
    # The commented-out fields below are disabled drafts. Some of them refer to
    # names (ElsevierXmlImage, clean_springer_whitespace) that are not defined
    # in the visible part of this module.
    #copyright = StringField('//div[@class="ArticleCopyright"]', xpath=True)
    # headings = StringField('a[href^="#Sec"]::attr("title")', all=True)
    #paragraphs = StringField('//div[@class="content"]/p[@class="Para"]', xpath=True, all=True)
    #figures = EntityField(ElsevierXmlImage, 'figure', all=True)
    #citations = StringField('//li[@class="Citation"]', xpath=True, all=True)

    # NOTE(review): no ``html_url`` field is declared in this class; presumably
    # it is inherited from DocumentEntity and RAdd appends '.html' to it --
    # confirm against ..entity and ...text.processors.
    process_html_url = RAdd('.html')
    #clean_headings = clean_springer_whitespace
    #clean_paragraphs = clean_springer_whitespace


class SpringerXmlAuthor(Entity):
    """Author information from a Springer XML file.

    All selectors are XPaths relative to the element the entity is built from
    (``/art/fm/bibl/aug/au`` when used by ``SpringerXmlDocument``).
    """
    firstname = StringField('./fnm', xpath=True, strip=True)
    # The middle name is taken from either an ``mnm`` or an ``mi`` child.
    middlename = StringField('./mnm|./mi', xpath=True, strip=True)
    lastname = StringField('./snm', xpath=True, strip=True)
    suffix = StringField('./suf', xpath=True, strip=True)
    email = StringField('./email', xpath=True, strip=True)

    # NOTE(review): presumably drops e-mail values that are empty strings --
    # confirm against Discard in ...text.processors.
    process_email = Discard('')


class SpringerXmlImage(Entity):
    """Figure information from a Springer XML file.

    Selectors are XPaths relative to the figure element: the label is read
    from its ``title`` child, the caption from its ``text`` child and the
    reference from its ``id`` attribute.
    """
    label = StringField('./title', xpath=True, strip=True)
    caption = StringField('./text', xpath=True, strip=True)
    reference = StringField('@id', xpath=True, strip=True)

    # The caption is run through the module-level Springer XML stripper.
    clean_caption = strip_springer_xml

    process_caption = normalize


class SpringerXmlTable(Entity):
    """Table information from a Springer XML file.

    Selectors are XPaths relative to the table element: the label is read
    from its ``title`` child, the caption from its ``caption`` child and the
    reference from its ``id`` attribute.
    """
    label = StringField('./title', xpath=True, strip=True)
    caption = StringField('./caption', xpath=True, strip=True)
    reference = StringField('@id', xpath=True, strip=True)
    # Selects the table element itself.
    # NOTE(review): ``raw=True`` presumably keeps the markup rather than the
    # text content -- confirm against StringField in ..fields.
    src = StringField('.', xpath=True, strip=True, raw=True)

    # The caption is run through the module-level Springer XML stripper.
    clean_caption = strip_springer_xml

    process_caption = normalize


class SpringerXmlDocument(Entity):
    """Document information from a Springer XML file.

    Every selector is an absolute XPath starting at the ``art`` root element.
    Bibliographic data is read from ``/art/fm/bibl``, dates from
    ``/art/fm/history`` and figures, schemes, tables, headings and paragraphs
    from ``/art/bdy``.
    """
    # Identifiers and bibliographic metadata.
    ui = StringField('/art/ui/text()', xpath=True, strip=True)
    doi = StringField('/art/fm/bibl/xrefbib//pubid[@idtype="doi"]/text()', xpath=True, lower=True)
    title = StringField('/art/fm/bibl/title', xpath=True, strip=True)
    authors = EntityField(SpringerXmlAuthor, '/art/fm/bibl/aug/au', xpath=True, all=True)
    journal = StringField('/art/fm/bibl/source/text()', xpath=True, strip=True)
    firstpage = StringField('/art/fm/bibl/fpage/text()', xpath=True, strip=True)
    year = IntField('/art/fm/bibl/pubdate/text()', xpath=True)
    volume = StringField('/art/fm/bibl/volume/text()', xpath=True, strip=True)
    issue = StringField('/art/fm/bibl/issue/text()', xpath=True, strip=True)
    issn = StringField('/art/fm/bibl/issn/text()', xpath=True, strip=True)
    landing_url = UrlField('/art/fm/bibl/url/text()', xpath=True)
    # The abstract matches either the section paragraphs or the whole ``abs``
    # element.
    abstract = StringField('/art/fm/abs/sec/p|/art/fm/abs', xpath=True, strip=True)
    # Publication history: published (pub), accepted (acc) and received (rec)
    # dates, each split into year / month / day.
    published_year = IntField('/art/fm/history/pub/date/year/text()', xpath=True)
    published_month = IntField('/art/fm/history/pub/date/month/text()', xpath=True)
    published_day = IntField('/art/fm/history/pub/date/day/text()', xpath=True)
    accepted_year = IntField('/art/fm/history/acc/date/year/text()', xpath=True)
    accepted_month = IntField('/art/fm/history/acc/date/month/text()', xpath=True)
    accepted_day = IntField('/art/fm/history/acc/date/day/text()', xpath=True)
    received_year = IntField('/art/fm/history/rec/date/year/text()', xpath=True)
    received_month = IntField('/art/fm/history/rec/date/month/text()', xpath=True)
    received_day = IntField('/art/fm/history/rec/date/day/text()', xpath=True)
    license = UrlField('/art/fm/cpyrt/note/url/text()', xpath=True, strip=True)
    # Body content.
    figures = EntityField(SpringerXmlImage, '/art/bdy//fig', xpath=True, all=True)
    schemes = EntityField(SpringerXmlImage, '/art/bdy//scheme', xpath=True, all=True)
    tables = EntityField(SpringerXmlTable, '/art/bdy//tbl|/art/bdy//table', xpath=True, all=True)
    headings = StringField('/art/bdy//st', xpath=True, strip=True, all=True)
    paragraphs = StringField('/art/bdy//sec/p', xpath=True, strip=True, all=True)

    # NOTE(review): ``clean_<field>`` / ``process_<field>`` are presumably
    # picked up by the Entity machinery and applied to the field of the same
    # name -- confirm against ..entity.
    clean_title = strip_springer_xml
    # Abstract and paragraphs first have the punctuation around references
    # tidied, then the XML is stripped; the abstract variant also kills ``st``.
    clean_abstract = Chain(tidy_springer_references, strip_springer_abstract_xml)
    clean_headings = strip_springer_xml
    clean_paragraphs = Chain(tidy_springer_references, strip_springer_xml)

    process_abstract = normalize
    process_headings = normalize
    process_paragraphs = Chain(normalize, Discard(''))
    # NOTE(review): presumably removes a '(' prefix and ')' suffix around the
    # license URL -- confirm against LStrip/RStrip in ...text.processors.
    process_license = Chain(LStrip('('), RStrip(')'))