# Source code for chemdataextractor.doc.element

# -*- coding: utf-8 -*-
"""
Document elements.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import ABCMeta, abstractproperty
import json
import operator

import six

from ..utils import python_2_unicode_compatible
import logging
log = logging.getLogger(__name__)


@python_2_unicode_compatible
class BaseElement(six.with_metaclass(ABCMeta)):
    """
    Abstract base class for a Document Element.

    :ivar id: (Optional) An identifier for this Element.
    :type id: Any or None
    :ivar list[chemdataextractor.models.BaseModel] models: A list of models that this element will parse
    """

    def __init__(self, document=None, references=None, id=None, models=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param Document document: (Optional) The document containing this element.
        :param list[Citation] references: (Optional) Any references contained in the element.
        :param Any id: (Optional) An identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a
            :class:`chemdataextractor.doc.document.Document`, this is set automatically to be the same as
            that of the containing element, unless manually set otherwise.
        :param kwargs: Any further keyword arguments are accepted and ignored by this base class.
        """
        #: The containing Document
        self._document = document
        self.id = id
        self.references = references if references is not None else []
        # Assign through the ``models`` property so that any setter overridden
        # by a subclass runs. A falsy value (None or an empty list) is replaced
        # by a fresh empty list.
        self.models = models if models else []
        # Reset explicitly as well, in case a subclass overrides the setter
        # without clearing the cache.
        self._streamlined_models_list = None

    def __repr__(self):
        return '<%s>' % (self.__class__.__name__,)

    def __str__(self):
        return '<%s>' % (self.__class__.__name__,)

    @property
    def document(self):
        """
        The :class:`chemdataextractor.doc.document.Document` that this element belongs to.
        """
        return self._document

    @document.setter
    def document(self, document):
        # Subclasses may need to override this and also assign the document to sub-elements
        self._document = document
        # If we have problems with garbage collection, use a weakref to document to avoid circular references:
        # try:
        #     self._document = weakref.proxy(document)
        # except TypeError:
        #     self._document = document

    @abstractproperty
    def records(self):
        """All records found in this Element, as a :class:`chemdataextractor.model.base.ModelList` of :class:`chemdataextractor.model.base.BaseModel`."""
        return []

    # @abstractmethod
    # TODO: Put this back?
    # def serialize(self):
    #     """Convert Element to python dictionary."""
    #     return []

    def add_models(self, models):
        """
        Add ``models`` to the models this element will parse.

        The element's current model list is extended in place and then
        re-assigned through the :attr:`models` property, so that the setter
        (including any subclass override) runs and the streamlined-model cache
        is invalidated.

        :param list[chemdataextractor.models.BaseModel] models: The models to append.
        """
        # Lazy logging arguments: ``str(self)`` is only evaluated when DEBUG
        # logging is actually enabled.
        log.debug("Setting models on %s", self)
        self._streamlined_models_list = None
        self.models.extend(models)
        # Deliberate self-assignment: it triggers the ``models`` setter.
        self.models = self.models

    @property
    def models(self):
        """The list of models that this element will parse."""
        return self._models

    @models.setter
    def models(self, value):
        self._models = value
        # The cached flattened list depends on ``_models``; drop it.
        self._streamlined_models_list = None

    @property
    def _streamlined_models(self):
        """
        The de-duplicated models for this element, sorted by ``__name__``.

        The list is built by collecting the result of
        ``model.flatten(include_inferred=False)`` for every entry of
        :attr:`models` into a set, and is cached in
        ``_streamlined_models_list`` until the models change. Every parser in
        each resulting model's ``parsers`` has its ``model`` attribute set to
        that model.
        """
        if self._streamlined_models_list is None:
            flattened = set()
            log.debug(self.models)
            for model in self.models:
                flattened.update(model.flatten(include_inferred=False))
            self._streamlined_models_list = sorted(flattened, key=operator.attrgetter('__name__'))
        # NOTE(review): the flattened source this was recovered from does not
        # show whether this loop belongs inside the cache-miss branch above.
        # It is run on every access here; confirm against the upstream file.
        for model in self._streamlined_models_list:
            for parser in model.parsers:
                parser.model = model
        return self._streamlined_models_list

    def to_json(self, *args, **kwargs):
        """
        Convert element to JSON string. The content of the JSON will be equivalent
        to that of :meth:`serialize`.

        ``serialize`` is not defined on this base class in this module, so the
        concrete element class must provide it.

        :param args: Positional arguments forwarded to :func:`json.dumps`.
        :param kwargs: Keyword arguments forwarded to :func:`json.dumps`.
        :returns: The JSON representation of ``self.serialize()``.
        :rtype: str
        """
        return json.dumps(self.serialize(), *args, **kwargs)

    @property
    def elements(self):
        """
        A list of child elements. Returns None by default.
        """
        return None
@python_2_unicode_compatible
class CaptionedElement(BaseElement):
    """
    Document Element with a caption.

    :ivar BaseElement caption: The caption for this element.
    """

    def __init__(self, caption, label=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param BaseElement caption: The caption for the element.
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a
            :class:`~chemdataextractor.doc.document.Document`, this is set automatically to be the same as
            that of the containing element, unless manually set otherwise.
        """
        # The caption must be in place before the base initialiser runs: the
        # base initialiser assigns ``models``, and this class's ``models``
        # setter forwards the value to the caption.
        self.caption = caption
        self.label = label
        super(CaptionedElement, self).__init__(**kwargs)
        self.caption.document = self.document

    def __repr__(self):
        return '%s(id=%r, references=%r, caption=%r)' % (
            self.__class__.__name__, self.id, self.references, self.caption.text
        )

    def __str__(self):
        return self.caption.text

    @property
    def document(self):
        """
        The :class:`~chemdataextractor.doc.document.Document` that this element belongs to.
        """
        return self._document

    @document.setter
    def document(self, document):
        # Keep the caption attached to the same document as its owner.
        self._document = document
        self.caption.document = document

    @property
    def records(self):
        """All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
        # This just passes the caption records. Subclasses may wish to extend this.
        return self.caption.records

    @property
    def abbreviation_definitions(self):
        """
        A list of all abbreviation definitions in this Document. Each abbreviation is in the form
        (:class:`str` abbreviation, :class:`str` long form of abbreviation, :class:`str` ner_tag)
        """
        return self.caption.abbreviation_definitions

    @property
    def ner_tags(self):
        """
        A list of all Named Entity Recognition tags in the caption for this element.
        If a word was found not to be a named entity, the named entity tag is None,
        and if it was found to be a named entity, it can have either a tag of 'B-CM'
        for a beginning of a mention of a chemical or 'I-CM' for the continuation of a mention.
        """
        # TODO: Delete this method?
        return self.caption.ner_tags

    @property
    def cems(self):
        """
        A list of all Chemical Entity Mentions in this document as :class:`~chemdataextractor.doc.text.Span`
        """
        return self.caption.cems

    @property
    def definitions(self):
        """
        Return a list of all specifier definitions in the caption

        Returns:
            list -- The specifier definitions
        """
        return self.caption.definitions

    @property
    def chemical_definitions(self):
        """The ``chemical_definitions`` of the caption."""
        return self.caption.chemical_definitions

    @property
    def models(self):
        """The list of models that this element (and its caption) will parse."""
        return self._models

    @models.setter
    def models(self, value):
        self._models = value
        # Invalidate the cached streamlined-model list, as the base-class
        # setter does; without this a direct ``element.models = ...`` would
        # leave a stale cache behind.
        self._streamlined_models_list = None
        self.caption.models = value

    def serialize(self):
        """
        Convert self to a dictionary. The key 'type' will contain
        the name of the class being serialized, and the key 'caption' will contain
        a serialized representation of :attr:`caption`, which is a
        :class:`~chemdataextractor.doc.element.BaseElement`

        :returns: A dictionary with the keys ``'type'`` and ``'caption'``.
        :rtype: dict
        """
        data = {'type': self.__class__.__name__, 'caption': self.caption.serialize()}
        return data

    @property
    def elements(self):
        """The child elements of this element: a list holding only the caption."""
        return [self.caption]