Source code for chemdataextractor.scrape.base

# -*- coding: utf-8 -*-
"""
Abstract base classes that define the interface for Scrapers, Fields, Crawlers, etc.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import ABCMeta, abstractproperty, abstractmethod
import logging

import requests
import six

log = logging.getLogger(__name__)


[docs]class BaseScraper(six.with_metaclass(ABCMeta)): """Abstract Scraper class from which all Scrapers inherit.""" #: CSS selector or XPath expression that returns the root of each entity. root = None #: Whether the root is an XPath expression instead of a CSS selector. root_xpath = False
[docs] def __init__(self): """""" # Create a HTTP session for all requests self.http = self.create_session()
[docs] def create_session(self): """Override to set up default data (e.g. headers, authentication) on each request.""" http = requests.Session() return http
[docs] def name(self): """A unique name for this scraper.""" return ''.join('_%s' % c if c.isupper() else c for c in self.__class__.__name__).strip('_').lower()
@abstractproperty def entity(self): """The Entity to scrape.""" pass
[docs] def process_entity(self, entity): """Override to process each entity.""" return entity
[docs] @abstractmethod def make_request(self, url, data): """Make a HTTP request. :param url: The URL to get. :param data: Query data. :returns: The response to the request. :rtype: requests.Response """ return
[docs] @abstractmethod def process_response(self, response): """Return a Selector for the given response. :param requests.Response response: The response object. :rtype: Selector """ return
[docs] def get_roots(self, selector): """""" if not self.root: yield selector elif self.root_xpath: for root in selector.xpath(self.root): yield root else: for root in selector.css(self.root): yield root
[docs]class BaseFormat(six.with_metaclass(ABCMeta)): """"""
[docs] @abstractmethod def process_response(self, response): """Return a Selector for the given response. :param requests.Response response: The response object. :rtype: Selector """ return
[docs]class BaseRequester(six.with_metaclass(ABCMeta)): """"""
[docs] @abstractmethod def make_request(self, url, data): """Make a HTTP request. :param url: The URL to get. :param data: Query data. :returns: The response to the request. :rtype: requests.Response """ return
[docs]class BaseEntityProcessor(six.with_metaclass(ABCMeta)): """Abstract EntityProcessor class from which all EntityProcessors inherit."""
[docs] @abstractmethod def process_entity(self, entity): """Process an Entity. Return None to filter Entity from the pipeline. :param chemdataextractor.scrape.entity.Entity entity: The Entity to process. :returns: The processed Entity. :rtype: Entity or None """ return entity
[docs]class BaseEntity(six.with_metaclass(ABCMeta)): """Abstract Entity class from which all Entities inherit.""" pass
[docs]class EntityMeta(ABCMeta): """Metaclass for Entity.""" def __new__(mcs, name, bases, attrs): fields = {} for attr_name, attr_value in six.iteritems(attrs): if isinstance(attr_value, BaseField): # Set the name attribute on the field to the attribute name on the Entity attr_value.name = six.text_type(attr_name) fields[attr_name] = attr_value #attrs['fields'] = fields # Set default _meta values, then update with any custom definitions from meta #attrs['_meta'] = {'root': None} #attrs['_meta'].update(attrs.pop('meta', {})) cls = super(EntityMeta, mcs).__new__(mcs, name, bases, attrs) cls.fields = cls.fields.copy() cls.fields.update(fields) return cls
[docs]class BaseField(six.with_metaclass(ABCMeta)): """Base class for all fields.""" # This is assigned by EntityMeta to match the attribute on the Entity name = None
[docs] def __init__(self, selection, xpath=False, re=None, all=False, default=None, null=False, raw=False): """ :param string selection: The CSS selector or XPath expression used to select the content to scrape. :param bool xpath: (Optional) Whether selection is an XPath expression instead of a CSS selector. Default False. :param re: (Optional) Regular expression to apply to scraped content. :param bool all: (Optional) Whether to scrape all occurrences instead of just the first. Default False. :param default: (Optional) The default value for this field if none is set. :param bool null: (Optional) Include in serialized output even if value is None. Default False. :param bool raw: (Optional) Whether to scrape the raw HTML/XML instead of the text contents. Default False. """ self.selection = selection self.xpath = xpath self.re = re self.all = all self.default = default self.null = null self.raw = raw
def __get__(self, instance, owner): """Descriptor for retrieving a value from a field in an Entity.""" # Check if Entity class is being called, rather than Entity instance if instance is None: return self # Get value from Entity instance if available value = instance._values.get(self.name) # If value is None, empty list or empty string return the default value if set if value in [None, [], ''] and self.default is not None: return self.default # Otherwise if value is None and all, return empty list if self.all and value is None: return [] return value def __set__(self, instance, value): """Descriptor for assigning a value to a field in a Entity.""" instance._values[self.name] = value def _post_scrape(self, value, processor=None): """Apply processing to the scraped value.""" # Pass each value through the field's clean method value = [self.process(v) for v in value] # Filter None values value = [v for v in value if v is not None] # Pass each value through processors defined on the entity if processor: value = [processor(v) for v in value] value = [v for v in value if v is not None] # Take first unless all is specified if not self.all: value = value[0] if value else None log.debug('Scraped %s: %s from %s' % (self.name, value, self.selection)) return value
[docs] def scrape(self, selector, cleaner=None, processor=None): """Scrape the value for this field from the selector.""" # Apply CSS or XPath expression to the selector selected = selector.xpath(self.selection) if self.xpath else selector.css(self.selection) # Extract the value and apply regular expression if specified value = selected.re(self.re) if self.re else selected.extract(raw=self.raw, cleaner=cleaner) return self._post_scrape(value, processor=processor)
[docs] def serialize(self, value): """Serialize this field.""" if hasattr(value, 'serialize'): return value.serialize() else: return value
[docs] def process(self, value): """Override to perform custom processing of a value.""" return value