Source code for chemdataextractor.doc.figure

# -*- coding: utf-8 -*-
"""
Figure document elements.
:codeauthor:: Callum Court ([email protected])
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging

from .element import CaptionedElement
from ..model import ModelList, Compound
log = logging.getLogger(__name__)


[docs]class Figure(CaptionedElement):
[docs] def __init__(self, caption, label=None, links=None, models=None, **kwargs): """ Create a new Figure element, to interface with FDE """ super(Figure, self).__init__(caption=caption, label=label, models=models, **kwargs) self.links = links self.caption_tokens = [] for sent in self.caption.sentences: self.caption_tokens.extend(sent.tagged_tokens)
@property def records(self): """ Return FigureData records Returns: [type] -- [description] """ records = ModelList() seen_labels = set() p = None for model in self._streamlined_models: for parser in model.parsers: parser_records = [] if hasattr(parser, 'parse_caption'): for record in parser.parse_caption(self.caption_tokens, self.label, self.links): parser_records.append(record) elif hasattr(parser, 'parse_sentence'): for record in parser.parse_sentence(self.caption_tokens): parser_records.append(record) else: continue if not parser_records: continue for record in parser_records: p = record.serialize() if not p: # TODO: Potential performance issues? continue # Skip duplicate records if record in records: continue # Skip just labels that have already been seen (bit of a hack) if (isinstance(record, Compound) and 'Compound' in p.keys() and all(k in {'labels', 'roles'} for k in p['Compound'].keys()) and set(record.labels).issubset(seen_labels)): continue if isinstance(record, Compound): seen_labels.update(record.labels) # This could be super slow if we find lots of things found = False for seen_record in records: if (isinstance(seen_record, Compound) and (not set(record.names).isdisjoint(seen_record.names) or not set(record.labels).isdisjoint(seen_record.labels))): seen_record.names = sorted(list(set(seen_record.names).union(record.names))) seen_record.labels = sorted(list(set(seen_record.labels).union(record.labels))) seen_record.roles = sorted(list(set(seen_record.roles).union(record.roles))) found = True if found: continue elif hasattr(record, 'compound') and record.compound is not None: seen_labels.update(record.compound.labels) records.append(record) i = 0 length = len(records) while i < length: j = 0 while j < length: if i != j: records[j].merge_all(records[i]) j += 1 i += 1 return records def _repr_html_(self): html_lines = ['<figure>', self.caption._repr_html_(), '</figure>'] # TODO: img element with figure URL return '\n'.join(html_lines)