Source code for chemdataextractor.scrape.entity

# -*- coding: utf-8 -*-
"""
An entity to extract.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import json
import logging

try:
    # The ABC aliases in ``collections`` were removed in Python 3.10; the
    # canonical location since Python 3.3 is ``collections.abc``.
    from collections.abc import Sequence
except ImportError:  # Python 2 fallback
    from collections import Sequence

import six

from .base import BaseEntity, EntityMeta
from .fields import StringField, DateTimeField, UrlField
from ..text.normalize import normalize


log = logging.getLogger(__name__)


class Entity(six.with_metaclass(EntityMeta, BaseEntity)):
    """An entity whose declared fields are scraped from a selector.

    On construction every field in ``fields`` is scraped from the given
    selector and assigned as an attribute. Optional per-field hook methods
    named ``clean_<field>``, ``process_<field>`` and ``finalize_<field>`` are
    looked up on the instance and used when present.

    Entities also support dictionary-style access (``entity['name']``),
    iteration and ``len()`` over the stored values, and serialization to a
    plain dict or JSON.
    """

    # Mapping of field name -> field object.
    # NOTE(review): presumably populated by EntityMeta from the class
    # attributes of subclasses -- confirm against .base.
    fields = {}

    def __init__(self, selector):
        """
        :param Selector selector: The selector to scrape.
        """
        # NOTE(review): backing store for field values; presumably written to
        # by the field descriptors when attributes are set -- confirm.
        self._values = {}
        # Iterate all defined fields
        for field_name, field in six.iteritems(self.fields):
            # Scrape field values from selector
            cleaner = getattr(self, 'clean_%s' % field_name, None)
            processor = getattr(self, 'process_%s' % field_name, None)
            value = field.scrape(selector, cleaner=cleaner, processor=processor)
            # Finalize value using finalize_* method on scrape, if it exists
            finalizer_name = 'finalize_%s' % field_name
            if hasattr(self, finalizer_name):
                value = getattr(self, finalizer_name)(value)
            # Lazy logging args: the message is only formatted if DEBUG is on.
            log.debug('Assigning %s: %s', field_name, value)
            setattr(self, field_name, value)

    def __eq__(self, other):
        """Entities are equal when they share a class and stored values."""
        if isinstance(other, self.__class__):
            return self._values == other._values
        return False

    def __len__(self):
        """Return the number of stored values."""
        return len(self._values)

    def __iter__(self):
        """Iterate over the names of the stored values."""
        return iter(self._values)

    def __delattr__(self, attr):
        """Handle deletion of field values by setting to default if specified."""
        # Set to default value
        if attr in self.fields:
            setattr(self, attr, self.fields[attr].default)
        else:
            super(Entity, self).__delattr__(attr)

    def __getitem__(self, key):
        """Redirect dictionary-style field access to attribute-style.

        :raises KeyError: If ``key`` is not a declared field, or the
            attribute lookup for it fails.
        """
        try:
            if key in self.fields:
                return getattr(self, key)
        except AttributeError:
            pass
        raise KeyError(key)

    def __setitem__(self, key, value):
        """Redirect dictionary-style field setting to attribute-style.

        :raises KeyError: If ``key`` is not a declared field.
        """
        if key not in self.fields:
            raise KeyError(key)
        return setattr(self, key, value)

    def __contains__(self, name):
        """Return True if attribute ``name`` exists and is not None."""
        try:
            val = getattr(self, name)
            return val is not None
        except AttributeError:
            return False

    def __repr__(self):
        return '%s()' % self.__class__.__name__

    @classmethod
    def scrape(cls, selector, root, xpath=False):
        """Return EntityList for the given selector.

        :param selector: The selector to search within.
        :param root: Expression matching the root element of each entity.
        :param bool xpath: Treat ``root`` as XPath if True, otherwise as CSS.
        :returns: An EntityList holding one entity per matched root.
        """
        log.debug('Called scrape classmethod with root: %s', root)
        roots = selector.xpath(root) if xpath else selector.css(root)
        results = [cls(r) for r in roots]
        return EntityList(*results)

    def serialize(self):
        """Convert Entity to python dictionary.

        Empty values (``None``/``''`` for single-valued fields, ``[]`` for
        ``all`` fields) are omitted unless the field has ``null`` set.
        """
        # Serialize fields to a dict
        data = {}
        for field_name in self:
            value = self._values.get(field_name)
            field = self.fields.get(field_name)
            if value is not None:
                if field.all:
                    value = [field.serialize(v) for v in value]
                else:
                    value = field.serialize(value)
            # Skip empty fields unless field.null
            if not field.null:
                if field.all:
                    is_empty = value == []
                else:
                    # Compare explicitly rather than via set membership: a
                    # serialized value may be unhashable (e.g. a dict), and
                    # ``value in {None, ''}`` would then raise TypeError.
                    is_empty = value is None or value == ''
                if is_empty:
                    continue
            data[field.name] = value
        return data

    def to_json(self, *args, **kwargs):
        """Convert Entity to JSON.

        Extra arguments are passed through to :func:`json.dumps`.
        """
        return json.dumps(self.serialize(), *args, **kwargs)
class EntityList(Sequence):
    """Wrapper around a list of Entities to facilitate operations on all at once."""

    def __init__(self, *entities):
        """Store the given entities, in order, in ``self.entities``."""
        collected = []
        collected.extend(entities)
        self.entities = collected

    def __getitem__(self, index):
        """Return the entity (or list slice) at ``index``."""
        members = self.entities
        return members[index]

    def __len__(self):
        """Return the number of wrapped entities."""
        members = self.entities
        return len(members)

    def serialize(self):
        """Serialize to a list of python dictionaries."""
        serialized = []
        for entity in self.entities:
            serialized.append(entity.serialize())
        return serialized

    def to_json(self, *args, **kwargs):
        """Convert EntityList to JSON.

        Extra arguments are passed through to :func:`json.dumps`.
        """
        payload = self.serialize()
        return json.dumps(payload, *args, **kwargs)
class DocumentEntity(Entity):
    """Generic document entity.

    Each field is scraped from HTML ``<meta>`` tags (``citation_*``, Dublin
    Core ``dc.*``/``DC.*`` and ``prism.*`` names), except ``license`` which
    is taken from an ``<a rel="license">`` link. Where a field lists several
    XPath alternatives they are combined with the XPath union operator ``|``.
    """

    doi = StringField(
        '//meta[@name="citation_doi"]/@content | '
        '//meta[@name="dc.identifier"]/@content | '
        '//meta[@name="DC.identifier"]/@content | '
        '//meta[@name="DC.Identifier"]/@content | '
        '//meta[@name="dc.Identifier"]/@content',
        xpath=True, lower=True)
    title = StringField(
        '//meta[@name="citation_title"]/@content | '
        '//meta[@name="dc.title"]/@content | '
        '//meta[@name="DC.title"]/@content | '
        '//meta[@name="DC.Title"]/@content | '
        '//meta[@name="dc.Title"]/@content | '
        '//meta[@name="title"]/@content',
        xpath=True, strip=True)
    authors = StringField(
        '//meta[@name="citation_author"]/@content | '
        '//meta[@name="dc.creator"]/@content | '
        '//meta[@name="DC.creator"]/@content | '
        '//meta[@name="DC.Creator"]/@content | '
        '//meta[@name="dc.Creator"]/@content',
        xpath=True, all=True)
    published_date = DateTimeField(
        '//meta[@name="citation_publication_date"]/@content | '
        '//meta[@name="prism.publicationDate"]/@content | '
        '//meta[@name="citation_date"]/@content | '
        '//meta[@name="dc.date"]/@content | '
        '//meta[@name="DC.date"]/@content | '
        '//meta[@name="DC.Date"]/@content | '
        '//meta[@name="dc.Date"]/@content',
        xpath=True)
    online_date = DateTimeField('//meta[@name="citation_online_date"]/@content', xpath=True)
    journal = StringField(
        '//meta[@name="citation_journal_title"]/@content | '
        '//meta[@name="citation_journal_abbrev"]/@content | '
        '//meta[@name="prism.publicationName"]/@content | '
        '//meta[@name="dc.source"]/@content | '
        '//meta[@name="DC.source"]/@content | '
        '//meta[@name="DC.Source"]/@content',
        xpath=True, strip=True)
    volume = StringField(
        '//meta[@name="citation_volume"]/@content | '
        '//meta[@name="prism.volume"]/@content',
        xpath=True)
    issue = StringField(
        '//meta[@name="citation_issue"]/@content | '
        '//meta[@name="prism.number"]/@content | '
        '//meta[@name="citation_technical_report_number"]/@content',
        xpath=True)
    firstpage = StringField(
        '//meta[@name="citation_firstpage"]/@content | '
        '//meta[@name="prism.startingPage"]/@content',
        xpath=True)
    lastpage = StringField('//meta[@name="citation_lastpage"]/@content', xpath=True)
    abstract = StringField('//meta[@name="citation_abstract"]/@content', xpath=True, strip=True)
    publisher = StringField(
        '//meta[@name="citation_publisher"]/@content | '
        '//meta[@name="dc.publisher"]/@content | '
        '//meta[@name="DC.publisher"]/@content | '
        '//meta[@name="dc.Publisher"]/@content | '
        '//meta[@name="DC.Publisher"]/@content',
        xpath=True)
    issn = StringField(
        '//meta[@name="citation_issn"]/@content | '
        '//meta[@name="prism.issn"]/@content',
        xpath=True)
    # The DC.language alternative previously lacked "/@content", so it matched
    # the <meta> element itself rather than its content attribute.
    language = StringField(
        '//meta[@name="citation_language"]/@content | '
        '//meta[@name="dc.language"]/@content | '
        '//meta[@name="DC.language"]/@content | '
        '//meta[@name="DC.Language"]/@content',
        xpath=True)
    copyright = StringField(
        '//meta[@name="dc.copyright"]/@content | '
        '//meta[@name="DC.copyright"]/@content | '
        '//meta[@name="DC.Copyright"]/@content | '
        '//meta[@name="prism.copyright"]/@content',
        xpath=True)
    license = UrlField('//a[@rel="license"]/@href', xpath=True)
    html_url = UrlField('//meta[@name="citation_fulltext_html_url"]/@content', xpath=True)
    pdf_url = UrlField('//meta[@name="citation_pdf_url"]/@content', xpath=True)
    landing_url = UrlField('//meta[@name="citation_abstract_html_url"]/@content', xpath=True)

    # process_<field> hooks: Entity.__init__ looks these up by name and passes
    # them to field.scrape() as the ``processor`` argument.
    process_title = normalize
    process_journal = normalize
    process_publisher = normalize
    process_authors = normalize
    process_abstract = normalize
# TODO: Abbreviations: <abbr title="Australia">AU</abbr>