# Source code for chemdataextractor.doc.figure

# -*- coding: utf-8 -*-
"""
Figure document elements.
:codeauthor:: Callum Court ([email protected])
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging

from .element import CaptionedElement
from ..model import ModelList, Compound
log = logging.getLogger(__name__)


class Figure(CaptionedElement):
    """Figure document element: a caption with an optional label and links."""

    def __init__(self, caption, label=None, links=None, models=None, **kwargs):
        """
        Create a new Figure element, to interface with FDE.

        :param caption: Caption of the figure; forwarded to ``CaptionedElement``.
        :param label: Optional label of the figure; forwarded to ``CaptionedElement``.
        :param links: Optional links for the figure. Stored on ``self.links`` and
            passed to every parser that implements ``parse_caption``.
        :param models: Optional models; forwarded to ``CaptionedElement``.
        :param kwargs: Any further keyword arguments, forwarded to ``CaptionedElement``.
        """
        super(Figure, self).__init__(caption=caption, label=label, models=models, **kwargs)
        self.links = links
        # Flattened tokens of all caption sentences; filled in by ``records``.
        self.caption_tokens = None

    @property
    def records(self):
        """
        Return the records extracted from this figure's caption.

        Every parser of every model in ``self._streamlined_models`` is run,
        except those listed in the document's ``skip_parsers``. Parsers that
        provide ``parse_caption`` receive the caption, label and links; parsers
        that only provide ``parse_sentence`` are run on each subsentence of the
        caption. Records that serialize to nothing, duplicates, and compounds
        consisting only of already-seen labels are dropped. Compounds sharing a
        name or label with an already collected compound are folded into it.
        Afterwards every record is merged with every other one, cleaned, and
        kept only if ``noncontextual_required_fulfilled`` holds.

        :returns: The surviving records, ordered by ``total_confidence()`` from
            highest to lowest; a confidence of ``None`` is ranked as ``-10000``.
        :rtype: ModelList
        """
        records = ModelList()
        seen_labels = set()
        skip_parsers = self.document.skip_parsers if self.document is not None else []

        # Cache the caption tokens the first time records are requested.
        if not self.caption_tokens:
            self.caption_tokens = []
            for sent in self.caption.sentences:
                self.caption_tokens.extend(sent.tokens)

        for model in self._streamlined_models:
            for parser in model.parsers:
                if parser in skip_parsers:
                    continue

                # Materialise the parser output before filtering it, so the
                # parser has finished before any record is inspected or merged.
                if hasattr(parser, 'parse_caption'):
                    parser_records = list(
                        parser.parse_caption(self.caption, self.label, self.links)
                    )
                elif hasattr(parser, 'parse_sentence'):
                    parser_records = [
                        record
                        for caption_sentence in self.caption.sentences
                        for subsentence in caption_sentence.subsentences
                        for record in parser.parse_sentence(subsentence)
                    ]
                else:
                    continue

                for record in parser_records:
                    serialized = record.serialize()
                    if not serialized:  # TODO: Potential performance issues?
                        continue
                    # Skip duplicate records
                    if record in records:
                        continue

                    is_compound = isinstance(record, Compound)
                    # Skip just labels that have already been seen (bit of a hack)
                    if (is_compound
                            and 'Compound' in serialized
                            and all(k in {'labels', 'roles'} for k in serialized['Compound'])
                            and set(record.labels).issubset(seen_labels)):
                        continue

                    if is_compound:
                        seen_labels.update(record.labels)
                        # Fold this compound into every collected compound that
                        # shares a name or a label with it.
                        # This could be super slow if we find lots of things
                        record_names = set(record.names)
                        record_labels = set(record.labels)
                        found = False
                        for seen_record in records:
                            if not isinstance(seen_record, Compound):
                                continue
                            if (record_names.isdisjoint(seen_record.names)
                                    and record_labels.isdisjoint(seen_record.labels)):
                                continue
                            seen_record.names = sorted(set(seen_record.names).union(record.names))
                            seen_record.labels = sorted(set(seen_record.labels).union(record.labels))
                            seen_record.roles = sorted(set(seen_record.roles).union(record.roles))
                            found = True
                        if found:
                            # Its information now lives in the matching record(s).
                            continue
                    elif hasattr(record, 'compound') and record.compound is not None:
                        seen_labels.update(record.compound.labels)
                    records.append(record)

        # Merge every record into every other record (source index outermost).
        length = len(records)
        for i in range(length):
            for j in range(length):
                if i != j:
                    records[j].merge_all(records[i])

        cleaned_records = []
        for record in records:
            record._clean(clean_contextual=False)
            if record.noncontextual_required_fulfilled:
                cleaned_records.append(record)

        def confidence_key(record):
            # Evaluate the confidence once per record; None is ranked as -10000.
            confidence = record.total_confidence()
            return confidence if confidence is not None else -10000

        return ModelList(*sorted(cleaned_records, key=confidence_key, reverse=True))

    def _repr_html_(self):
        """Return an HTML ``<figure>`` element wrapping the caption's HTML."""
        # TODO: img element with figure URL
        html_lines = ['<figure>', self.caption._repr_html_(), '</figure>']
        return '\n'.join(html_lines)