# -*- coding: utf-8 -*-
"""
Abstract base classes that define the interface for Scrapers, Fields, Crawlers, etc.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import ABCMeta, abstractproperty, abstractmethod
import logging
import requests
import six
log = logging.getLogger(__name__)
class BaseScraper(six.with_metaclass(ABCMeta)):
    """Abstract Scraper class from which all Scrapers inherit."""

    #: CSS selector or XPath expression that returns the root of each entity.
    root = None
    #: Whether the root is an XPath expression instead of a CSS selector.
    root_xpath = False

    def __init__(self):
        """Initialize this scraper with a shared HTTP session."""
        # A single session is reused for every request made by this scraper.
        self.http = self.create_session()

    def create_session(self):
        """Override to set up default data (e.g. headers, authentication) on each request.

        :rtype: requests.Session
        """
        return requests.Session()

    def name(self):
        """A unique name for this scraper, derived from the class name.

        CamelCase class names become underscore-separated lowercase
        (e.g. ``MyScraper`` -> ``my_scraper``).
        """
        return ''.join('_%s' % c if c.isupper() else c for c in self.__class__.__name__).strip('_').lower()

    @abstractproperty
    def entity(self):
        """The Entity to scrape."""
        pass

    def process_entity(self, entity):
        """Override to process each entity."""
        return entity

    @abstractmethod
    def make_request(self, url, data):
        """Make a HTTP request.

        :param url: The URL to get.
        :param data: Query data.
        :returns: The response to the request.
        :rtype: requests.Response
        """
        return

    @abstractmethod
    def process_response(self, response):
        """Return a Selector for the given response.

        :param requests.Response response: The response object.
        :rtype: Selector
        """
        return

    def get_roots(self, selector):
        """Yield the root node of each entity within the given selector."""
        if not self.root:
            # No root configured: treat the entire document as one entity.
            yield selector
        elif self.root_xpath:
            for node in selector.xpath(self.root):
                yield node
        else:
            for node in selector.css(self.root):
                yield node
class BaseRequester(six.with_metaclass(ABCMeta)):
    """Abstract Requester interface: a strategy for performing HTTP requests."""

    @abstractmethod
    def make_request(self, url, data):
        """Make a HTTP request.

        :param url: The URL to get.
        :param data: Query data.
        :returns: The response to the request.
        :rtype: requests.Response
        """
        return
class BaseEntityProcessor(six.with_metaclass(ABCMeta)):
    """Abstract EntityProcessor class from which all EntityProcessors inherit."""

    @abstractmethod
    def process_entity(self, entity):
        """Process an Entity. Return None to filter the Entity from the pipeline.

        :param chemdataextractor.scrape.entity.Entity entity: The Entity to process.
        :returns: The processed Entity.
        :rtype: Entity or None
        """
        # Default implementation passes the entity through unchanged.
        return entity
class BaseEntity(six.with_metaclass(ABCMeta)):
    """Abstract Entity class from which all Entities inherit."""
    # Marker base class only: concrete behavior is supplied by subclasses.
    pass
class BaseField(six.with_metaclass(ABCMeta)):
    """Base class for all fields.

    A field is a descriptor that knows how to scrape its value from a
    Selector, store it on an Entity instance, and serialize it.
    """

    # Assigned by EntityMeta to match the attribute name on the Entity.
    name = None

    def __init__(self, selection, xpath=False, re=None, all=False, default=None, null=False, raw=False):
        """
        :param string selection: The CSS selector or XPath expression used to select the content to scrape.
        :param bool xpath: (Optional) Whether selection is an XPath expression instead of a CSS selector. Default False.
        :param re: (Optional) Regular expression to apply to scraped content.
        :param bool all: (Optional) Whether to scrape all occurrences instead of just the first. Default False.
        :param default: (Optional) The default value for this field if none is set.
        :param bool null: (Optional) Include in serialized output even if value is None. Default False.
        :param bool raw: (Optional) Whether to scrape the raw HTML/XML instead of the text contents. Default False.
        """
        self.selection = selection
        self.xpath = xpath
        self.re = re
        self.all = all
        self.default = default
        self.null = null
        self.raw = raw

    def __get__(self, instance, owner):
        """Descriptor for retrieving a value from a field in an Entity."""
        # Check if Entity class is being accessed, rather than an Entity instance
        if instance is None:
            return self
        # Get value from Entity instance if available
        value = instance._values.get(self.name)
        # If value is None, empty list or empty string, return the default value if set
        if value in (None, [], '') and self.default is not None:
            return self.default
        # Otherwise if value is None and all, return empty list
        if self.all and value is None:
            return []
        return value

    def __set__(self, instance, value):
        """Descriptor for assigning a value to a field in an Entity."""
        instance._values[self.name] = value

    def _post_scrape(self, value, processor=None):
        """Apply processing to the scraped value and reduce it to its final form.

        :param list value: The raw extracted values.
        :param processor: (Optional) Callable applied to each value; returning
            None from it filters that value out.
        :returns: A list of values if ``all`` is set, otherwise the first value
            (or None if nothing was scraped).
        """
        # Pass each value through the field's process method, dropping Nones
        value = [v for v in (self.process(v) for v in value) if v is not None]
        # Pass each value through processors defined on the entity, dropping Nones
        if processor:
            value = [v for v in (processor(v) for v in value) if v is not None]
        # Take first unless all is specified
        if not self.all:
            value = value[0] if value else None
        # Lazy %-args: the message is only formatted when DEBUG logging is enabled
        log.debug('Scraped %s: %s from %s', self.name, value, self.selection)
        return value

    def scrape(self, selector, cleaner=None, processor=None):
        """Scrape the value for this field from the selector.

        :param Selector selector: The selector to scrape from.
        :param cleaner: (Optional) Cleaner passed to ``extract``.
        :param processor: (Optional) Per-value processor applied after scraping.
        """
        # Apply CSS or XPath expression to the selector
        selected = selector.xpath(self.selection) if self.xpath else selector.css(self.selection)
        # Extract the value and apply regular expression if specified
        value = selected.re(self.re) if self.re else selected.extract(raw=self.raw, cleaner=cleaner)
        return self._post_scrape(value, processor=processor)

    def serialize(self, value):
        """Serialize this field's value, delegating to the value's own ``serialize`` if present."""
        if hasattr(value, 'serialize'):
            return value.serialize()
        else:
            return value

    def process(self, value):
        """Override to perform custom processing of a value."""
        return value