Source code for chemdataextractor.reader.elsevier

# -*- coding: utf-8 -*-
"""
Elsevier XML reader

.. codeauthor:: Callum Court <[email protected]>


Readers for Elsevier XML files.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import six
from ..scrape.clean import clean, Cleaner
from ..doc.table import Cell, Table
from ..doc.text import Caption
from ..doc.meta import MetaData
from .markup import XmlReader
from lxml import etree
import re

[docs]def remove_if_reference(el): text = el.text check_regex = re.compile('\[\d') if check_regex.match(text): return None return el
# XML stripper that removes the tags around numbers in chemical formulas strip_els_xml = Cleaner(strip_xpath='.//ce:inf | .//ce:italic | .//ce:bold | .//ce:formula | .//mml:* | .//ce:sup | .//ce:table//ce:sup', kill_xpath='.//ce:cross-ref//ce:sup | .//ce:note-para | .//ce:cross-ref | .//ce:cross-refs', process_xpaths={'.//ce:cross-ref//ce:sup | .//ce:cross-ref | .//ce:cross-refs': remove_if_reference})
[docs]def fix_elsevier_xml_whitespace(document): """ Fix tricky xml tags""" # space hsp and refs correctly for el in document.xpath('.//ce:hsp'): parent = el.getparent() previous = el.getprevious() if parent is None: continue # Append the text to previous tail (or parent text if no previous), ensuring newline if block level if el.text and isinstance(el.tag, six.string_types): if previous is None: if parent.text: if parent.text.endswith(' '): parent.text = (parent.text or '') + '' + el.text else: parent.text = (parent.text or '') + ' ' + el.text else: if previous.tail: if previous.tail.endswith(' '): previous.tail = (previous.tail or '') + '' + el.text else: previous.tail = (previous.tail or '') + ' ' + el.text # Append the tail to last child tail, or previous tail, or parent text, ensuring newline if block level if el.tail: if len(el): last = el[-1] last.tail = (last.tail or '') + el.tail elif previous is None: if el.tail.startswith(' '): parent.text = (parent.text or '') + '' + el.tail else: parent.text = (parent.text or '') + ' ' + el.tail else: if el.tail.startswith(' '): previous.tail = (previous.tail or '') + '' + el.tail else: previous.tail = (previous.tail or '') + ' ' + el.tail index = parent.index(el) parent[index:index + 1] = el[:] return document
[docs]def els_xml_whitespace(document): """ Remove whitespace in xml.text or xml.tails for all elements, if it is only whitespace """ # selects all tags and checks if the text or tail are spaces for el in document.xpath('//*'): if str(el.text).isspace(): el.text = '' if str(el.tail).isspace(): el.tail = '' # debug, check the document #print(etree.tostring(document, pretty_print=True)) # sys.exit() return document
[docs]class ElsevierXmlReader(XmlReader): """Reader for Elsevier XML documents.""" cleaners = [clean, fix_elsevier_xml_whitespace, els_xml_whitespace, strip_els_xml] etree.FunctionNamespace("http://www.elsevier.com/xml/svapi/article/dtd").prefix = 'default' etree.FunctionNamespace("http://www.elsevier.com/xml/bk/dtd").prefix = 'bk' etree.FunctionNamespace("http://www.elsevier.com/xml/common/cals/dtd").prefix = 'cals' etree.FunctionNamespace("http://www.elsevier.com/xml/common/dtd").prefix = 'ce' etree.FunctionNamespace("http://www.elsevier.com/xml/ja/dtd").prefix = 'ja' etree.FunctionNamespace("http://www.w3.org/1998/Math/MathML").prefix = 'mml' etree.FunctionNamespace("http://www.elsevier.com/xml/common/struct-aff/dtd").prefix = 'sa' etree.FunctionNamespace("http://www.elsevier.com/xml/common/struct-bib/dtd").prefix = 'sb' etree.FunctionNamespace("http://www.elsevier.com/xml/common/table/dtd").prefix = 'tb' etree.FunctionNamespace("http://www.w3.org/1999/xlink").prefix = 'xlink' etree.FunctionNamespace("http://www.elsevier.com/xml/xocs/dtd").prefix = 'xocs' etree.FunctionNamespace("http://purl.org/dc/elements/1.1/").prefix = 'dc' etree.FunctionNamespace("http://purl.org/dc/terms/").prefix = 'dcterms' etree.FunctionNamespace("http://prismstandard.org/namespaces/basic/2.0/").prefix = 'prism' etree.FunctionNamespace("http://www.w3.org/2001/XMLSchema-instance").prefix = 'xsi' root_css = 'default|full-text-retrieval-response' title_css = 'dc|title' heading_css = 'ce|section-title' table_css = 'ce|table' table_caption_css = 'ce|table ce|caption' table_head_row_css = 'cals|thead cals|row' table_body_row_css = 'cals|tbody cals|row' table_cell_css = 'ce|entry' table_footnote_css = 'table-wrap-foot p' figure_css = 'ce|figure' figure_caption_css = 'ce|figure ce|caption' figure_label_css = 'ce|figure ce|label' figure_download_link_css = '' reference_css = 'ce|cross-ref, ce|cross-refs' citation_css = 'ce|bib-reference' metadata_css = 'xocs|meta' metadata_title_css = 'xocs|normalized-article-title' metadata_author_css = 'xocs|normalized-first-auth-surname' metadata_journal_css = 'xocs|srctitle' metadata_volume_css = 'xocs|vol-first, xocs|volume-list xocs|volume' metadata_issue_css = 'xocs|issns xocs|issn-primary-formatted' metadata_publisher_css = 'xocs|copyright-line' metadata_date_css = 'xocs|available-online-date, xocs|orig-load-date' metadata_firstpage_css = 'xocs|first-fp' metadata_lastpage_css = 'xocs|last-lp' metadata_doi_css = 'xocs|doi, xocs|eii' metadata_pii_css = 'xocs|pii-unformatted' # ce|cross-ref may need to return ignore_css = 'ce|bibliography, ce|acknowledgment, ce|correspondence, ce|author, ce|doi, ja|jid, ja|aid, ce|pii, xocs|oa-sponsor-type, xocs|open-access, default|openaccess,'\ 'default|openaccessArticle, dc|format, dc|creator, dc|identifier,'\ 'default|eid, default|pii, xocs|meta, xocs|ref-info, default|scopus-eid,'\ 'xocs|normalized-srctitle,' \ 'xocs|eid, xocs|hub-eid, xocs|normalized-first-auth-surname,' \ 'xocs|normalized-first-auth-initial, xocs|refkeys,' \ 'xocs|attachment-eid, xocs|attachment-type,' \ 'ja|jid, ce|given-name, ce|surname, ce|affiliation,' \ 'ce|grant-sponsor, ce|grant-number, prism|copyright,' \ 'xocs|pii-unformatted, xocs|ucs-locator, ce|copyright,' \ 'prism|publisher, prism|*, xocs|copyright-line, xocs|cp-notice,' \ 'dc|description, xocs|document-subtype, ce|keywords, default|openaccessType,'\ 'default|openArchiveArticle, default|openaccessSponsorName, default|openaccessSponsorType, default|openaccessUserLicense, dcterms|subject,'\ 'ce|dochead, ce|label, default|pubType' url_prefix = 'https://sciencedirect.com/science/article/pii/'
[docs] def detect(self, fstring, fname=None): """Elsevier document detection based on string found in xml""" if fname and not fname.endswith('.xml'): return False if b'xmlns="http://www.elsevier.com/xml/svapi/article/dtd"' in fstring: return True return False
def _parse_metadata(self, el, refs, specials): title = self._css(self.metadata_title_css, el) authors = self._css(self.metadata_author_css,el) publisher = self._css(self.metadata_publisher_css,el) journal = self._css(self.metadata_journal_css,el) date = self._css(self.metadata_date_css,el) language = self._css(self.metadata_language_css,el) volume = self._css(self.metadata_volume_css,el) issue = self._css(self.metadata_issue_css,el) firstpage =self._css(self.metadata_firstpage_css,el) lastpage=self._css(self.metadata_lastpage_css,el) doi = self._css(self.metadata_doi_css,el) pii = self._css(self.metadata_pii_css, el) pdf_url = self._css(self.metadata_pdf_url_css,el) html_url = self._css(self.metadata_html_url_css,el) metadata = { '_title': title[0].text if title else None, '_authors': [i.text for i in authors] if authors else None, '_publisher': publisher[0].text if publisher else None, '_journal': journal[0].text if journal else None, '_date': date[0].text if date else None, '_language': language[0].text if language else None, '_volume': volume[0].text if volume else None, '_issue': issue[0].text if issue else None, '_firstpage': firstpage[0].text if firstpage else None, '_lastpage': lastpage[0].text if lastpage else None, '_doi': doi[0].text if doi else None, '_pdf_url': self.url_prefix + pdf_url[0].text if pdf_url else None, '_html_url': self.url_prefix + html_url[0].text if html_url else self.url_prefix + pii[0].text } meta = MetaData(metadata) return [meta] def _parse_table_rows(self, els, refs, specials): hdict = {} for row, tr in enumerate(els): colnum = 0 for td in self._css(self.table_cell_css, tr): cell = self._parse_text(td, refs=refs, specials=specials, element_cls=Cell) namest = int([i for i in td.get('namest', '1').split('col') if i][0]) nameend = int([i for i in td.get('nameend', '1').split('col') if i][0]) colspan = (nameend - namest) + 1 rowspan = int(td.get('morerows', '0')) + 1 for i in range(colspan): for j in range(rowspan): rownum = row + j if not rownum in hdict: hdict[rownum] = {} while colnum in hdict[rownum]: colnum += 1 hdict[rownum][colnum] = cell[0] colnum += 1 rows = [] for row in sorted(hdict): rows.append([]) for col in sorted(hdict[row]): rows[-1].append(hdict[row][col]) for r in rows: r.extend([Cell('')] * (len(max(rows, key=len)) - len(r))) rows = [r for r in rows if any(r)] return rows def _parse_figure_links(self, el): """Parse awkward elsevier figure links """ figure_link_css = self._css('ce|link', el) figure_link_locator = figure_link_css[0].get('locator', '') if figure_link_css else None links = [] # find the locator id in the objects objects = self._css('default|object', self.root) for obj in objects: ref = obj.get('ref', '0') if ref == figure_link_locator: links.append(obj.text) return links