# -*- coding: utf-8 -*-
"""
Readers for documents from Springer.
.. codeauthor:: Callum Court
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from lxml import etree
import six
from lxml.html import HTMLParser
from ..text import get_encoding
from .markup import HtmlReader, XmlReader
from ..scrape.clean import clean, Cleaner, strip_html
from ..scrape.pub.springer import tidy_springer_references
# Cleaner instance shared by the Springer readers: whitespace fixing is
# enabled and <sub>, <em> and <strong> elements are selected via strip_xpath
# (the exact stripping semantics are defined by Cleaner in scrape.clean).
clean_springer_html = Cleaner(
    fix_whitespace=True,
    strip_xpath='.//sub | .//em | .//strong',
)
class SpringerMaterialsHtmlReader(HtmlReader):
    """HTML reader tailored to pages served by SpringerMaterials."""

    # Cleaning callables and CSS selectors configured for this reader
    # (presumably consumed by the HtmlReader base class -- confirm there).
    cleaners = [clean, clean_springer_html]

    root_css = 'html'
    citation_css = 'span[class="CitationRef"]'
    title_css = 'title'
    heading_css = 'h2, h3, h4, h5, h6, .title1, span.title2, span.title3'

    table_css = 'div[class="Table"]'
    table_caption_css = 'div[class="Table"] p'
    table_head_row_css = 'thead'
    table_body_row_css = 'tbody'
    table_cell_css = 'th, td'

    ignore_css = (
        'sub, sup, em[class^="EmphasisTypeItalic "], '
        'li[class="article-metrics__item"], div[class="CitationContent"]'
    )

    def detect(self, fstring, fname=None):
        """Report whether the input looks like a SpringerMaterials HTML page.

        :param fstring: Raw file content; it is searched for bytes markers,
            so it is expected to be a byte string.
        :param fname: Optional file name. When given, it must end in
            ``.html`` or ``.htm`` for detection to succeed.
        :returns: ``True`` if the extension check passes and one of the
            Springer marker snippets occurs in ``fstring``, else ``False``.
        """
        if fname and not fname.endswith(('.html', '.htm')):
            return False
        signatures = (
            b'<a class="footer-copyright_link" href="http://www.springernature.com"',
            b'<meta content="SpringerLink"',
        )
        return any(signature in fstring for signature in signatures)

    def _make_tree(self, fstring):
        """Parse ``fstring`` with lxml's HTML parser and return the root.

        The parser encoding is taken from ``get_encoding`` with ``'utf-8'``
        supplied as the guess.
        """
        encoding = get_encoding(fstring, guesses='utf-8', is_html=True)
        return etree.fromstring(fstring, parser=HTMLParser(encoding=encoding))
def springer_html_whitespace(document):
    """Blank out whitespace-only ``text`` and ``tail`` values in a tree.

    Every element matched by the XPath ``//*`` is visited. A ``text`` or
    ``tail`` value that consists solely of whitespace is replaced with the
    empty string; values that are ``None``, empty, or contain any
    non-whitespace character are left untouched.

    :param document: Parsed tree exposing an ``xpath`` method (presumably an
        lxml element -- confirm against the cleaner pipeline).
    :returns: The same ``document`` object, modified in place.
    """
    for el in document.xpath('//*'):
        # Test the value itself instead of str(value): on Python 2, str() of
        # non-ASCII unicode text raises UnicodeEncodeError. The None guard
        # replaces the old reliance on str(None) == 'None' not being space.
        if el.text is not None and el.text.isspace():
            el.text = ''
        if el.tail is not None and el.tail.isspace():
            el.tail = ''
    return document
def fix_springer_table_whitespace(document):
    """Strip leading and trailing whitespace from paragraph text in tables.

    For each ``<p>`` element found beneath a ``<table>``, a non-empty
    ``text`` value is replaced by its stripped form. Element tails and
    elements whose ``text`` is ``None`` or empty are left unchanged.

    :param document: Parsed tree exposing an ``xpath`` method (presumably an
        lxml element -- confirm against the cleaner pipeline).
    :returns: The same ``document`` object, modified in place.
    """
    # NOTE(review): both operands of this union are identical, so it selects
    # the same nodes as './/table//p' alone. One side was possibly meant to
    # target a different tag -- confirm before changing the expression.
    for el in document.xpath('.//table//p | .//table//p'):
        if el.text:
            # Strip the value directly; wrapping it in str() raises
            # UnicodeEncodeError on Python 2 for non-ASCII text.
            el.text = el.text.strip()
    return document
class SpringerHtmlReader(HtmlReader):
    """Reader for HTML article pages from Springer (SpringerLink)."""

    # Cleaning callables, listed in the order given here; the last one
    # trims paragraph text inside tables.
    cleaners = [
        clean,
        springer_html_whitespace,
        clean_springer_html,
        strip_html,
        tidy_springer_references,
        fix_springer_table_whitespace,
    ]

    # CSS selectors configured for this reader (presumably consumed by the
    # HtmlReader base class -- confirm there).
    root_css = 'html'
    title_css = 'h1[class^="ArticleTitle"]'
    heading_css = 'h2, h3, h4'
    table_css = 'div[class="Table"]'
    table_caption_css = 'div[class^="Caption"] p'
    table_head_row_css = 'thead tr'
    table_body_row_css = 'tbody tr'
    table_cell_css = 'td, th'
    figure_css = 'figure'
    figure_caption_css = 'figcaption'
    figure_label_css = 'figcaption span[class^="CaptionNumber"]'
    # citation_css = 'ce|bib-reference'

    # Page chrome (navigation, banners, footer, sharing widgets), author
    # and bibliography blocks, and equation markup. The selector text is
    # preserved exactly as before, including a few repeated entries; it is
    # assembled from adjacent literals rather than backslash continuations
    # so that source indentation can never leak into the value.
    ignore_css = (
        'a[class="skip-to__link pseudo-focus"], div[class="nojs-banner u-interface"], a[class="skip-to__link skip-to__link--contents pseudo-focus"],'
        'p[class="leaderboard__label"], div[class="u-screenreader-only"], label[for="search-springerlink"], span[class="search-button__title"],'
        'span[class="u-overflow-ellipsis"], span[class="u-overflow-ellipsis"], a[class="c-button c-button--blue c-button__icon-right gtm-pdf-link"],'
        'div[class="leaderboard u-hide"], title, li[class="article-metrics__item"], aside[class="section section--collapsible"], a[class="gtm-cite-link"],'
        'span[class="u-screenreader-only"], div[class="authors__list"], a[class="gtm-tab-authorsandaffiliations"], ol[class="BibliographyWrapper"],'
        'h2[id="copyrightInformation"], div[class="content authors-affiliations u-interface"], p[class="footer__copyright"], p[class="footer__user-access-info"],'
        'span[class="u-screenreader-only"], a[href="/contactus"], a[class="gtm-footer-accessibility"], ul[class="footer__nav"], div[class="footer__aside-wrapper"],'
        'aside[class="main-sidebar-right u-interface"], a[class="c-button share-this gtm-shareby-sharelink-link test-shareby-sharelink-link"],'
        'a[class="gtm-export-citation"], ul[class="citations__content"], h3[data-role="button-dropdown__title"],'
        'div[class="section section--collapsible uptodate-recommendations gtm-recommendations"], span[class="InlineEquation"], div[class="EquationContent"],'
        'div[class="EquationNumber"], footer'
    )

    def detect(self, fstring, fname=None):
        """Report whether the input looks like a Springer HTML article.

        :param fstring: Raw file content; it is searched for bytes markers,
            so it is expected to be a byte string.
        :param fname: Optional file name. When given, it must end in
            ``.html`` or ``.htm`` for detection to succeed.
        :returns: ``True`` if the extension check passes and either the
            "Springer US" publisher meta tag or the "SpringerLink" meta tag
            occurs in ``fstring``, else ``False``.
        """
        if fname and not fname.endswith(('.html', '.htm')):
            return False
        # Detection is a pure predicate: the former debug print to stdout
        # on a positive match has been removed.
        if (b'<meta content="Springer US" name="citation_publisher"' in fstring
                or b'<meta content="SpringerLink"' in fstring):
            return True
        return False

    def _make_tree(self, fstring):
        """Parse ``fstring`` with lxml's HTML parser and return the root.

        The parser encoding is taken from ``get_encoding`` with ``'utf-8'``
        supplied as the guess.
        """
        encoding = get_encoding(fstring, guesses='utf-8', is_html=True)
        return etree.fromstring(fstring, parser=HTMLParser(encoding=encoding))