Source code for chemdataextractor.scrape.entity

# -*- coding: utf-8 -*-
"""
An entity to extract.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import json
import logging

try:
    # The container ABCs live in collections.abc; the aliases in ``collections``
    # were removed in Python 3.10.
    from collections.abc import Sequence
except ImportError:  # Python 2 has no collections.abc
    from collections import Sequence

import six

from .base import BaseEntity, EntityMeta
from .fields import StringField, DateTimeField, UrlField
from ..text.normalize import normalize


log = logging.getLogger(__name__)


class Entity(six.with_metaclass(EntityMeta, BaseEntity)):
    """Base class for an entity whose field values are scraped from a selector.

    ``fields`` maps each field name to its field object. Field values can be
    read and written either as attributes or dictionary-style
    (``entity['name']``). Iterating an entity yields the keys of its internal
    ``_values`` store.

    NOTE(review): ``fields`` is presumably populated by ``EntityMeta`` and the
    field objects presumably store assigned values in ``_values`` -- neither is
    visible here, confirm in ``.base`` / ``.fields``.
    """

    fields = {}

    def __init__(self, selector):
        """Scrape every declared field from ``selector`` and assign the result.

        For a field called ``name``, the optional ``clean_name`` and
        ``process_name`` attributes are handed to the field's ``scrape`` call,
        and an optional ``finalize_name`` attribute is called on the scraped
        value before it is assigned.

        :param Selector selector: The selector to scrape.
        """
        self._values = {}
        # Iterate all defined fields
        for field_name, field in six.iteritems(self.fields):
            # Scrape field values from selector
            cleaner = getattr(self, 'clean_%s' % field_name, None)
            processor = getattr(self, 'process_%s' % field_name, None)
            value = field.scrape(selector, cleaner=cleaner, processor=processor)
            # Finalize value using finalize_* method on scrape, if it exists
            if hasattr(self, 'finalize_%s' % field_name):
                value = getattr(self, 'finalize_%s' % field_name)(value)
            # Pass arguments to the logger so formatting only happens when
            # debug logging is actually enabled.
            log.debug('Assigning %s: %s', field_name, value)
            setattr(self, field_name, value)

    def __eq__(self, other):
        """Entities are equal when ``other`` is an instance of this class with equal values."""
        if isinstance(other, self.__class__):
            return self._values == other._values
        return False

    def __ne__(self, other):
        """Inverse of ``__eq__``; Python 2 does not derive ``!=`` from ``__eq__``."""
        return not self == other

    def __len__(self):
        """Return the number of stored values."""
        return len(self._values)

    def __iter__(self):
        """Iterate over the names of the stored values."""
        return iter(self._values)

    def __delattr__(self, attr):
        """Handle deletion of field values by setting to default if specified."""
        # Set to default value
        if attr in self.fields:
            setattr(self, attr, self.fields[attr].default)
        else:
            super(Entity, self).__delattr__(attr)

    def __getitem__(self, key):
        """Redirect dictionary-style field access to attribute-style.

        :raises KeyError: If ``key`` is not a declared field or the attribute lookup fails.
        """
        try:
            if key in self.fields:
                return getattr(self, key)
        except AttributeError:
            pass
        raise KeyError(key)

    def __setitem__(self, key, value):
        """Redirect dictionary-style field setting to attribute-style.

        :raises KeyError: If ``key`` is not a declared field.
        """
        if key not in self.fields:
            raise KeyError(key)
        return setattr(self, key, value)

    def __contains__(self, name):
        """Return True if attribute ``name`` exists on this entity and is not None."""
        try:
            val = getattr(self, name)
            return val is not None
        except AttributeError:
            return False

    def __repr__(self):
        return '%s()' % self.__class__.__name__

    @classmethod
    def scrape(cls, selector, root, xpath=False):
        """Return EntityList for the given selector.

        :param selector: The selector to search for entity roots.
        :param root: Expression locating the root of each entity.
        :param bool xpath: Treat ``root`` as an XPath expression instead of a CSS selector.
        :returns: An ``EntityList`` with one entity per matched root.
        """
        log.debug('Called scrape classmethod with root: %s', root)
        roots = selector.xpath(root) if xpath else selector.css(root)
        results = [cls(r) for r in roots]
        return EntityList(*results)

    def serialize(self):
        """Convert Entity to python dictionary.

        Each stored value is passed through its field's ``serialize`` (item by
        item when ``field.all`` is set). Empty values are left out unless the
        field has ``null`` set.
        """
        # Serialize fields to a dict
        data = {}
        for field_name in self:
            value = self._values.get(field_name)
            field = self.fields.get(field_name)
            if value is not None:
                if field.all:
                    value = [field.serialize(v) for v in value]
                else:
                    value = field.serialize(value)
            # Skip empty fields unless field.null.
            # Compare with ``is``/``==`` rather than set membership: a set lookup
            # hashes ``value`` and raises TypeError for an unhashable serialized
            # value such as a dict or list.
            if field.all:
                is_empty = value == []
            else:
                is_empty = value is None or value == ''
            if is_empty and not field.null:
                continue
            data[field.name] = value
        return data

    def to_json(self, *args, **kwargs):
        """Convert Entity to JSON; extra arguments are forwarded to ``json.dumps``."""
        return json.dumps(self.serialize(), *args, **kwargs)


class EntityList(Sequence):
    """Wrapper around a list of Entities to facilitate operations on all at once.

    Behaves as a read-only sequence over the wrapped entities; the underlying
    list is available as ``entities``.
    """

    def __init__(self, *entities):
        """Wrap the given entities, keeping their order.

        :param entities: The entities to wrap, passed as positional arguments.
        """
        self.entities = list(entities)

    def __getitem__(self, index):
        """Return the entity at ``index``; a slice yields a plain list of entities."""
        return self.entities[index]

    def __len__(self):
        """Return the number of wrapped entities."""
        return len(self.entities)

    def serialize(self):
        """Serialize to a list of python dictionaries."""
        return [e.serialize() for e in self.entities]

    def to_json(self, *args, **kwargs):
        """Convert EntityList to JSON; extra arguments are forwarded to ``json.dumps``."""
        return json.dumps(self.serialize(), *args, **kwargs)


class DocumentEntity(Entity):
    """Generic document entity.

    Every field is located with an XPath expression. Most read the ``content``
    attribute of ``<meta>`` tags, trying several ``name`` spellings
    (``citation_*``, ``dc.*``/``DC.*``, ``prism.*``) joined with ``|``; the
    license is read from an ``<a rel="license">`` link instead.
    """
    # Identification and bibliographic metadata
    doi = StringField('//meta[@name="citation_doi"]/@content | //meta[@name="dc.identifier"]/@content | //meta[@name="DC.identifier"]/@content | //meta[@name="DC.Identifier"]/@content | //meta[@name="dc.Identifier"]/@content', xpath=True, lower=True)
    title = StringField('//meta[@name="citation_title"]/@content | //meta[@name="dc.title"]/@content | //meta[@name="DC.title"]/@content | //meta[@name="DC.Title"]/@content | //meta[@name="dc.Title"]/@content | //meta[@name="title"]/@content', xpath=True, strip=True)
    authors = StringField('//meta[@name="citation_author"]/@content | //meta[@name="dc.creator"]/@content | //meta[@name="DC.creator"]/@content | //meta[@name="DC.Creator"]/@content | //meta[@name="dc.Creator"]/@content', xpath=True, all=True)
    published_date = DateTimeField('//meta[@name="citation_publication_date"]/@content | //meta[@name="prism.publicationDate"]/@content | //meta[@name="citation_date"]/@content | //meta[@name="dc.date"]/@content | //meta[@name="DC.date"]/@content | //meta[@name="DC.Date"]/@content | //meta[@name="dc.Date"]/@content', xpath=True)
    online_date = DateTimeField('//meta[@name="citation_online_date"]/@content', xpath=True)
    journal = StringField('//meta[@name="citation_journal_title"]/@content | //meta[@name="citation_journal_abbrev"]/@content | //meta[@name="prism.publicationName"]/@content | //meta[@name="dc.source"]/@content | //meta[@name="DC.source"]/@content | //meta[@name="DC.Source"]/@content', xpath=True, strip=True)
    volume = StringField('//meta[@name="citation_volume"]/@content | //meta[@name="prism.volume"]/@content', xpath=True)
    issue = StringField('//meta[@name="citation_issue"]/@content | //meta[@name="prism.number"]/@content | //meta[@name="citation_technical_report_number"]/@content', xpath=True)
    firstpage = StringField('//meta[@name="citation_firstpage"]/@content | //meta[@name="prism.startingPage"]/@content', xpath=True)
    lastpage = StringField('//meta[@name="citation_lastpage"]/@content', xpath=True)
    abstract = StringField('//meta[@name="citation_abstract"]/@content', xpath=True, strip=True)
    publisher = StringField('//meta[@name="citation_publisher"]/@content | //meta[@name="dc.publisher"]/@content | //meta[@name="DC.publisher"]/@content | //meta[@name="dc.Publisher"]/@content | //meta[@name="DC.Publisher"]/@content', xpath=True)
    issn = StringField('//meta[@name="citation_issn"]/@content | //meta[@name="prism.issn"]/@content', xpath=True)
    # The DC.language alternative selects /@content like its siblings, so the
    # attribute value is matched rather than the <meta> element itself.
    language = StringField('//meta[@name="citation_language"]/@content | //meta[@name="dc.language"]/@content | //meta[@name="DC.language"]/@content | //meta[@name="DC.Language"]/@content', xpath=True)
    # Rights information
    copyright = StringField('//meta[@name="dc.copyright"]/@content | //meta[@name="DC.copyright"]/@content | //meta[@name="DC.Copyright"]/@content | //meta[@name="prism.copyright"]/@content', xpath=True)
    license = UrlField('//a[@rel="license"]/@href', xpath=True)
    # Links to the document in its various forms
    html_url = UrlField('//meta[@name="citation_fulltext_html_url"]/@content', xpath=True)
    pdf_url = UrlField('//meta[@name="citation_pdf_url"]/@content', xpath=True)
    landing_url = UrlField('//meta[@name="citation_abstract_html_url"]/@content', xpath=True)

    # ``process_<field>`` attributes are looked up by Entity.__init__ and passed
    # to the field's scrape call as its processor.
    process_title = normalize
    process_journal = normalize
    process_publisher = normalize
    process_authors = normalize
    process_abstract = normalize

    # TODO: Abbreviations: <abbr title="Australia">AU</abbr>