# -*- coding: utf-8 -*-
"""
Document elements.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import ABCMeta, abstractproperty
import json
import operator
import six
from ..utils import python_2_unicode_compatible
import logging
log = logging.getLogger(__name__)
[docs]@python_2_unicode_compatible
class BaseElement(six.with_metaclass(ABCMeta)):
"""
Abstract base class for a Document Element.
:ivar id: (Optional) An identifier for this Element.
:type id: Any or None
:ivar list[chemdataextractor.models.BaseModel] models: A list of models that this element will parse
"""
[docs] def __init__(self, document=None, references=None, id=None, models=None, **kwargs):
"""
.. note::
If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
an element should either be initialized with a reference to its containing document,
or its :attr:`document` attribute should be set as soon as possible.
If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
to initialise it, the :attr:`document` attribute is automatically set
during the initialisation of the document, so the user does not need to worry about this.
:param Document document: (Optional) The document containing this element.
:param list[Citation] references: (Optional) Any references contained in the element.
:param Any id: (Optional) An identifier for this element. Must be equatable.
:param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`chemdataextractor.doc.document.Document`,
this is set automatically to be the same as that of the containing element, unless manually set otherwise.
"""
#: The containing Document
self._document = document
self.id = id
self.references = references if references is not None else []
if models:
self.models = models
else:
self.models = []
self._streamlined_models_list = None
def __repr__(self):
return '<%s>' % (self.__class__.__name__,)
def __str__(self):
return '<%s>' % (self.__class__.__name__,)
@property
def document(self):
""" The :class:`chemdataextractor.doc.document.Document` that this element belongs to. """
return self._document
@document.setter
def document(self, document):
# Subclasses may need to override this and also assign the document to sub-elements
self._document = document
# If we have problems with garbage collection, use a weakref to document to avoid circular references:
# try:
# self._document = weakref.proxy(document)
# except TypeError:
# self._document = document
@abstractproperty
def records(self):
"""All records found in this Element, as a :class:`chemdataextractor.model.base.ModelList` of :class:`chemdataextractor.model.base.BaseModel`."""
return []
# @abstractmethod # TODO: Put this back?
# def serialize(self):
# """Convert Element to python dictionary."""
# return []
[docs] def add_models(self, models):
"""Set all models on this element
"""
# print(models)
log.debug("Setting models on %s" % self)
self._streamlined_models_list = None
self.models.extend(models)
self.models = self.models
@property
def models(self):
return self._models
@models.setter
def models(self, value):
self._models = value
self._streamlined_models_list = None
@property
def _streamlined_models(self):
if self._streamlined_models_list is None:
models = set()
log.debug(self.models)
for model in self.models:
models.update(model.flatten(include_inferred=False))
self._streamlined_models_list = sorted(list(models),
key=operator.attrgetter('__name__'))
for model in self._streamlined_models_list:
for parser in model.parsers:
parser.model = model
return self._streamlined_models_list
[docs] def to_json(self, *args, **kwargs):
"""Convert element to JSON string. The content of the JSON will be equivalent
to that of :meth:`serialize`."""
return json.dumps(self.serialize(), *args, **kwargs)
@property
def elements(self):
"""
A list of child elements. Returns None by default.
"""
return None
[docs]@python_2_unicode_compatible
class CaptionedElement(BaseElement):
"""
Document Element with a caption.
:ivar BaseElement caption: The caption for this element.
"""
[docs] def __init__(self, caption, label=None, **kwargs):
"""
.. note::
If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
an element should either be initialized with a reference to its containing document,
or its :attr:`document` attribute should be set as soon as possible.
If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
to initialise it, the :attr:`document` attribute is automatically set
during the initialisation of the document, so the user does not need to worry about this.
:param BaseElement caption: The caption for the element.
:param Document document: (Optional) The document containing this element.
:param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
:param Any id: (Optional) Some identifier for this element. Must be equatable.
:param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
this is set automatically to be the same as that of the containing element, unless manually set otherwise.
"""
# TODO: docs for label
self.caption = caption
self.label = label
super(CaptionedElement, self).__init__(**kwargs)
self.caption.document = self.document
def __repr__(self):
return '%s(id=%r, references=%r, caption=%r)' % (self.__class__.__name__, self.id, self.references, self.caption.text)
def __str__(self):
return self.caption.text
@property
def document(self):
""" The :class:`~chemdataextractor.doc.document.Document` that this element belongs to. """
return self._document
@document.setter
def document(self, document):
self._document = document
self.caption.document = document
@property
def records(self):
"""All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
# This just passes the caption records. Subclasses may wish to extend this.
return self.caption.records
@property
def abbreviation_definitions(self):
"""
A list of all abbreviation definitions in this Document. Each abbreviation is in the form
(:class:`str` abbreviation, :class:`str` long form of abbreviation, :class:`str` ner_tag)
"""
return self.caption.abbreviation_definitions
@property
def ner_tags(self):
"""
A list of all Named Entity Recognition tags in the caption for this element.
If a word was found not to be a named entity, the named entity tag is None,
and if it was found to be a named entity, it can have either a tag of 'B-CM' for a beginning of a
mention of a chemical or 'I-CM' for the continuation of a mention.
"""
# TODO: Delete this method?
return self.caption.ner_tags
@property
def cems(self):
"""
A list of all Chemical Entity Mentions in this document as :class:`~chemdataextractor.doc.text.Span`
"""
return self.caption.cems
@property
def definitions(self):
"""Return a list of all specifier definitions in the caption
Returns:
list-- The specifier definitions
"""
return self.caption.definitions
@property
def chemical_definitions(self):
return self.caption.chemical_definitions
@property
def models(self):
return self._models
@models.setter
def models(self, value):
self._models = value
self.caption.models = value
[docs] def serialize(self):
"""
Convert self to a dictionary. The key 'type' will contain
the name of the class being serialized, and the key 'caption' will contain
a serialized representation of :attr:`caption`, which is a :class:`~chemdataextractor.doc.element.BaseElement`
"""
data = {'type': self.__class__.__name__, 'caption': self.caption.serialize()}
return data
@property
def elements(self):
return [self.caption]