# Source code for chemdataextractor.doc.text

# -*- coding: utf-8 -*-
"""
Text-based document elements.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import abstractproperty
import collections
import logging
import re
from pprint import pprint
from deprecation import deprecated
import unicodedata

import six

from ..model.base import ModelList, sort_merge_candidates
from ..nlp.lexicon import ChemLexicon, Lexicon
from ..nlp.cem import IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS, CiDictCemTagger, CsDictCemTagger, CrfCemTagger
from ..nlp.new_cem import CemTagger
from ..nlp.abbrev import ChemAbbreviationDetector
from ..nlp.tag import NoneTagger, POS_TAG_TYPE, NER_TAG_TYPE
from ..nlp.pos import ChemCrfPosTagger, CrfPosTagger, ApPosTagger, ChemApPosTagger
# from ..nlp.tokenize import ChemSentenceTokenizer, ChemWordTokenizer, regex_span_tokenize, SentenceTokenizer, WordTokenizer, FineWordTokenizer, ChemTokWordTokenizer, SpacyTokenizer
from ..nlp.tokenize import BertWordTokenizer, ChemSentenceTokenizer, regex_span_tokenize, SentenceTokenizer, WordTokenizer
from ..nlp.subsentence import SubsentenceExtractor, NoneSubsentenceExtractor
from ..nlp.dependency import DependencyTagger, IndexTagger
from ..text import CONTROL_RE
from ..utils import memoized_property, python_2_unicode_compatible, first
from .element import BaseElement
from ..parse.definitions import specifier_definition
from ..parse.cem import chemical_name, cem_phrase
from ..parse.quantity import construct_quantity_re
from ..model.model import Compound, NmrSpectrum, IrSpectrum, UvvisSpectrum, MeltingPoint, GlassTransition
from ..model.contextual_range import SentenceRange



log = logging.getLogger(__name__)
cem_tagger = CemTagger()


@python_2_unicode_compatible
class BaseText(BaseElement):
    """
    Abstract base class for a text Document Element.

    Concrete subclasses are expected to provide :attr:`word_tokenizer`, :attr:`lexicon`,
    :attr:`tokens`, :attr:`tags`, :attr:`definitions` and :attr:`chemical_definitions`.
    """
    taggers = []
    """
    A list of :class:`~chemdataextractor.nlp.tag.BaseTagger` instances. This is a list of taggers
    that will be called by ChemDataExtractor to assign tags to each of the tokens in this element.
    """

    def __init__(self, text, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, taggers=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element. Must be a unicode string.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words.
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
        :param BaseTagger pos_tagger: (Optional) Accepted for backwards compatibility; this initialiser does not use it.
            Use ``taggers`` instead.
        :param BaseTagger ner_tagger: (Optional) Accepted for backwards compatibility; this initialiser does not use it.
            Use ``taggers`` instead.
        :param list taggers: (Optional) The taggers for this element. ``None`` or an empty list leaves the
            class-level :attr:`taggers` in effect.
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
            this is set automatically to be the same as that of the containing element, unless manually set otherwise.
        :raises TypeError: If ``text`` is not a unicode string.
        """
        if not isinstance(text, six.text_type):
            raise TypeError('Text must be a unicode string')
        # Everything not consumed here (document, label, id, models, ...) goes to the base element.
        super(BaseText, self).__init__(**kwargs)
        self._text = text
        # An explicit argument wins; otherwise keep whatever the (sub)class already defines.
        self.word_tokenizer = word_tokenizer if word_tokenizer is not None else self.word_tokenizer
        self.lexicon = lexicon if lexicon is not None else self.lexicon
        self.abbreviation_detector = abbreviation_detector if abbreviation_detector is not None else self.abbreviation_detector
        if taggers is not None and len(taggers) != 0:
            self.taggers = taggers

    def __repr__(self):
        return '%s(id=%r, references=%r, text=%r)' % (self.__class__.__name__, self.id, self.references, self._text)

    def __str__(self):
        return self._text

    @property
    def text(self):
        """The raw text :class:`str` for this passage of text."""
        return self._text

    @abstractproperty
    def word_tokenizer(self):
        """The :class:`~chemdataextractor.nlp.tokenize.WordTokenizer` used by this element."""
        return

    @abstractproperty
    def lexicon(self):
        """The :class:`~chemdataextractor.nlp.lexicon.Lexicon` used by this element."""
        return

    @property
    def pos_tagger(self):
        """
        The part of speech tagger used by this element. A subclass of :class:`~chemdataextractor.nlp.tag.BaseTagger`.

        This is the last entry of :attr:`taggers` that can produce part of speech tags,
        or ``None`` if there is no such tagger.
        """
        for tagger in reversed(self.taggers):
            if tagger.can_tag(POS_TAG_TYPE):
                return tagger

    @pos_tagger.setter
    @deprecated(deprecated_in="2.1", details="Deprecated due to the introduction of taggers API. Consult the migration guide for more information.")
    def pos_tagger(self, value):
        if not value.can_tag(POS_TAG_TYPE):
            value.tag_type = POS_TAG_TYPE
        # Rebind a copy instead of appending in place: ``self.taggers`` is usually the list
        # defined on the class, and mutating that would change every other element sharing it.
        self.taggers = list(self.taggers) + [value]

    @property
    def ner_tagger(self):
        """
        The named entity recognition tagger used by this element. A subclass of :class:`~chemdataextractor.nlp.tag.BaseTagger`.

        This is the last entry of :attr:`taggers` that can produce named entity tags,
        or ``None`` if there is no such tagger.
        """
        for tagger in reversed(self.taggers):
            if tagger.can_tag(NER_TAG_TYPE):
                return tagger

    @ner_tagger.setter
    @deprecated(deprecated_in="2.1", details="Deprecated due to the introduction of taggers API. Consult the migration guide for more information.")
    def ner_tagger(self, value):
        if not value.can_tag(NER_TAG_TYPE):
            value.tag_type = NER_TAG_TYPE
        # Same reasoning as the pos_tagger setter: never mutate a list that may be shared.
        self.taggers = list(self.taggers) + [value]

    @abstractproperty
    def tokens(self):
        """A list of :class:`RichToken` s for this object."""
        return

    @abstractproperty
    def tags(self):
        """
        A list of tags corresponding to each of the tokens in the object.
        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` and :attr:`pos_tagger` used for this class.
        """
        return

    @abstractproperty
    def definitions(self):
        """
        A list of all specifier definitions
        """
        return

    @abstractproperty
    def chemical_definitions(self):
        """
        A list of all chemical label definitions
        """
        return

    def serialize(self):
        """
        Convert self to a dictionary. The key 'type' will contain
        the name of the class being serialized, and the key 'content' will contain
        a serialized representation of :attr:`text`, which is a :class:`str`
        """
        data = {'type': self.__class__.__name__, 'content': self.text}
        return data

    def _repr_html_(self):
        # The text is returned as-is; no HTML escaping is applied here.
        return self.text


try:
    from collections.abc import Sequence as _Sequence
except ImportError:  # Python 2 keeps the ABCs directly in ``collections``
    from collections import Sequence as _Sequence


class Text(_Sequence, BaseText):
    """
    A passage of text, comprising one or more sentences.

    Behaves as a read-only sequence of its :class:`Sentence` objects.
    """

    sentence_tokenizer = ChemSentenceTokenizer()
    word_tokenizer = BertWordTokenizer()
    lexicon = ChemLexicon()
    abbreviation_detector = ChemAbbreviationDetector()
    taggers = [ChemCrfPosTagger(), cem_tagger, DependencyTagger()]
    subsentence_extractor = None

    def __init__(self, text, sentence_tokenizer=None, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, parsers=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element.
        :param SentenceTokenizer sentence_tokenizer: (Optional) Sentence tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.ChemSentenceTokenizer`.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.BertWordTokenizer`.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words. Default :class:`~chemdataextractor.nlp.lexicon.ChemLexicon`
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
            Default :class:`~chemdataextractor.nlp.abbrev.ChemAbbreviationDetector`.
        :param BaseTagger pos_tagger: (Optional) Forwarded unchanged to :class:`BaseText`.
        :param BaseTagger ner_tagger: (Optional) Forwarded unchanged to :class:`BaseText`.
        :param parsers: (Optional) Accepted but discarded; ``parsers=None`` is always forwarded.
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
            this is set automatically to be the same as that of the containing element, unless manually set otherwise.
        """
        # ``parsers`` is intentionally not passed on - presumably because manually setting
        # parsers is deprecated (see set_config); confirm before changing.
        super(Text, self).__init__(text, word_tokenizer=word_tokenizer, lexicon=lexicon, abbreviation_detector=abbreviation_detector, pos_tagger=pos_tagger, ner_tagger=ner_tagger, parsers=None, **kwargs)
        self.sentence_tokenizer = sentence_tokenizer if sentence_tokenizer is not None else self.sentence_tokenizer

    def __getitem__(self, index):
        return self.sentences[index]

    def __len__(self):
        return len(self.sentences)

    def set_config(self):
        """ Load settings from configuration file

        Does nothing when the element has no document.

        .. note:: Called when Document instance is created

        :raises DeprecationWarning: If the configuration contains a ``PARSERS`` entry.
        """
        if self.document is None:
            return

        c = self.document.config
        # NOTE(review): ``eval`` executes whatever expression the configuration holds. That is only
        # acceptable while configuration files are fully trusted; a name -> class lookup would be safer.
        if 'SENTENCE_TOKENIZER' in c.keys():
            self.sentence_tokenizer = eval(c['SENTENCE_TOKENIZER'])()
        if 'WORD_TOKENIZER' in c.keys():
            self.word_tokenizer = eval(c['WORD_TOKENIZER'])()
        if 'POS_TAGGER' in c.keys():
            self.pos_tagger = eval(c['POS_TAGGER'])()
        if 'NER_TAGGER' in c.keys():
            self.ner_tagger = eval(c['NER_TAGGER'])()
        if 'LEXICON' in c.keys():
            self.lexicon = eval(c['LEXICON'])()
        if 'PARSERS' in c.keys():
            # NOTE(review): this raises instead of warning, so a PARSERS entry aborts set_config even
            # though the message says the setting is merely ignored - confirm which is intended.
            raise DeprecationWarning('Manually setting parsers deprecated, any settings from config files for this will be ignored.')

    @memoized_property
    def sentences(self):
        """A list of :class:`Sentence` s that make up this text passage."""
        sents = self.sentence_tokenizer.get_sentences(self)
        for sent in sents:
            sent.document = self.document
        return sents

    @property
    def elements(self):
        """Alias for :attr:`sentences`."""
        return self.sentences

    def _sentences_from_spans(self, spans):
        """
        Build one :class:`Sentence` per ``(start, end)`` character span of :attr:`text`.

        Each sentence inherits this element's tokenizer, lexicon, abbreviation detector,
        taggers, models, document and subsentence extractor.

        :param spans: Iterable of indexable ``(start, end)`` offsets into :attr:`text`.
        :returns: The sentences, in the order of ``spans``.
        :rtype: list[Sentence]
        """
        sents = []
        for span in spans:
            sent = Sentence(
                text=self.text[span[0]:span[1]],
                start=span[0],
                end=span[1],
                word_tokenizer=self.word_tokenizer,
                lexicon=self.lexicon,
                abbreviation_detector=self.abbreviation_detector,
                pos_tagger=self.pos_tagger,
                ner_tagger=self.ner_tagger,
                document=self.document,
                models=self.models,
                taggers=self.taggers,
                subsentence_extractor=self.subsentence_extractor,
            )
            sents.append(sent)
        return sents

    @property
    def raw_sentences(self):
        """A list of :class:`str` for the sentences that make up this text passage."""
        return [sentence.text for sentence in self.sentences]

    @property
    def tokens(self):
        """A list, per sentence, of the tokens in that sentence."""
        return [sent.tokens for sent in self.sentences]

    @property
    def raw_tokens(self):
        """A list of :class:`str` representations for the tokens of each sentence in this text passage."""
        return [sent.raw_tokens for sent in self.sentences]

    @property
    def pos_tagged_tokens(self):
        """A list, per sentence, of that sentence's :attr:`Sentence.pos_tagged_tokens`."""
        return [sent.pos_tagged_tokens for sent in self.sentences]

    @property
    def pos_tags(self):
        """A list of :class:`str` part of speech tags for each sentence in this text passage."""
        return [sent.pos_tags for sent in self.sentences]

    @property
    def unprocessed_ner_tagged_tokens(self):
        """
        A list, per sentence, of that sentence's :attr:`Sentence.unprocessed_ner_tagged_tokens`.

        No corrections from abbreviation detection are performed.
        """
        return [sent.unprocessed_ner_tagged_tokens for sent in self.sentences]

    @property
    def unprocessed_ner_tags(self):
        """
        A list, per sentence, of the unprocessed named entity tags for the tokens in that sentence.

        No corrections from abbreviation detection are performed.
        """
        return [sent.unprocessed_ner_tags for sent in self.sentences]

    @property
    def ner_tagged_tokens(self):
        """
        A list, per sentence, of that sentence's :attr:`Sentence.ner_tagged_tokens`.
        """
        return [sent.ner_tagged_tokens for sent in self.sentences]

    @property
    def ner_tags(self):
        """
        A list, per sentence, of the named entity tags corresponding to each of the tokens.
        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` used for this object.
        """
        return [sent.ner_tags for sent in self.sentences]

    @property
    def cems(self):
        """
        A flat list of all Chemical Entity Mentions in this text as :class:`chemdataextractor.doc.text.Span`
        """
        return [cem for sent in self.sentences for cem in sent.cems]

    @property
    def definitions(self):
        """
        Return a flat list of the specifier definitions found in the sentences of this text passage
        """
        return [definition for sent in self.sentences for definition in sent.definitions]

    @property
    def chemical_definitions(self):
        """
        Return a flat list of the chemical definitions found in the sentences of this text passage
        """
        return [definition for sent in self.sentences for definition in sent.chemical_definitions]

    @property
    @deprecated(deprecated_in="2.1", details="Deprecated due to the introduction of RichTokens, and is now just an alias for .tokens.")
    def tagged_tokens(self):
        """
        A list of lists of :class:`~chemdataextractor.doc.text.RichToken` instances found in the text.
        """
        return [sent.tagged_tokens for sent in self.sentences]

    @property
    def tags(self):
        """A list, per sentence, of the tags for the tokens in that sentence."""
        return [sent.tags for sent in self.sentences]

    @property
    def abbreviation_definitions(self):
        """
        A flat list of all abbreviation definitions in this text passage. Each abbreviation is in the form
        (abbreviation tokens, long form tokens, :class:`str` ner_tag)
        """
        return [ab for sent in self.sentences for ab in sent.abbreviation_definitions]

    @property
    def records(self):
        """
        All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`.

        The records of each sentence are first merged contextually with the records of the
        other sentences, nearest sentences first, and subsets are removed from the result.
        """
        records_by_sentence = [sent.records for sent in self.sentences]
        num_sentences = len(records_by_sentence)

        for index, sent_records in enumerate(records_by_sentence):
            offset = 1
            max_offset = max(num_sentences - index, index)
            merge_candidates = []
            while offset <= max_offset:
                backwards_index = index - offset
                forwards_index = index + offset
                distance = offset * SentenceRange()
                # Both neighbours must be bounds-checked: a negative index would silently wrap
                # around to the end of the list and pair the record with the wrong sentence.
                if backwards_index >= 0:
                    merge_candidates.extend((distance, record) for record in records_by_sentence[backwards_index])
                if forwards_index < num_sentences:
                    merge_candidates.extend((distance, record) for record in records_by_sentence[forwards_index])
                offset += 1
            self._resolve_contextual(sent_records, sort_merge_candidates(merge_candidates))

        # Don't sort these records as this encodes where they were found in the paragraph
        records = ModelList(*[r for sentence_records in records_by_sentence for r in sentence_records])
        records.remove_subsets()
        return records

    def _resolve_contextual(self, parent_records, child_records):
        """
        Merge every ``(distance, record)`` candidate into every parent record.

        :param parent_records: Records that receive contextual information.
        :param child_records: Iterable of ``(distance, record)`` pairs to merge from.
        """
        for parent_record in parent_records:
            for distance, child_record in child_records:
                parent_record.merge_contextual(child_record, distance=distance)

    def __add__(self, other):
        """
        Concatenate two elements of exactly the same type into a new element.

        The result takes its tokenizers, lexicon, abbreviation detector and taggers from ``self``.
        Returns ``NotImplemented`` when the types differ.
        """
        if type(self) == type(other):
            merged = self.__class__(
                text=self.text + other.text,
                id=self.id or other.id,
                references=self.references + other.references,
                sentence_tokenizer=self.sentence_tokenizer,
                word_tokenizer=self.word_tokenizer,
                lexicon=self.lexicon,
                abbreviation_detector=self.abbreviation_detector,
                pos_tagger=self.pos_tagger,
                ner_tagger=self.ner_tagger,
            )
            return merged
        return NotImplemented


class Title(Text):
    """A :class:`Text` element for a title, displayed as an ``<h1>`` in HTML representations."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Title, self).__init__(text, **kwargs)
        # Assigned after the parent initialiser, so the element starts with an empty model list.
        self.models = []

    def _repr_html_(self):
        # The text is inserted as-is; no HTML escaping is applied here.
        return '<h1 class="cde-title">' + self.text + '</h1>'


class Heading(Text):
    """A :class:`Text` element for a heading, displayed as an ``<h2>`` in HTML representations."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Heading, self).__init__(text, **kwargs)
        # Assigned after the parent initialiser, so the element starts with an empty model list.
        self.models = []
        # default_parsers = [CompoundHeadingParser(), ChemicalLabelParser()]

    def _repr_html_(self):
        # The text is inserted as-is; no HTML escaping is applied here.
        return '<h2 class="cde-title">' + self.text + '</h2>'


class Paragraph(Text):
    """A :class:`Text` element for a paragraph, displayed as a ``<p>`` in HTML representations."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Paragraph, self).__init__(text, **kwargs)
        # default_parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(),
        #        TgParser(), ContextParser()]
        # Assigned after the parent initialiser, so the element starts with an empty model list.
        self.models = []

    def _repr_html_(self):
        # The text is inserted as-is; no HTML escaping is applied here.
        return '<p class="cde-paragraph">' + self.text + '</p>'


class Footnote(Text):
    """A :class:`Text` element for a footnote, displayed as a ``<p>`` in HTML representations."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Footnote, self).__init__(text, **kwargs)
        # default_parsers = [ContextParser(), CaptionContextParser()]
        # Assigned after the parent initialiser, so the element starts with an empty model list.
        self.models = []

    def _repr_html_(self):
        # The text is inserted as-is; no HTML escaping is applied here.
        return '<p class="cde-footnote">' + self.text + '</p>'


class Citation(Text):
    """
    A :class:`Text` element for a citation, displayed as a ``<p>`` in HTML representations.

    Compared with :class:`Text` it keeps part of speech tagging but swaps the named entity and
    dependency taggers for :class:`~chemdataextractor.nlp.tag.NoneTagger` instances, and it has
    no abbreviation detector.
    """
    taggers = [ChemCrfPosTagger(), NoneTagger(tag_type=NER_TAG_TYPE), NoneTagger(tag_type="dependency"), IndexTagger()]
    abbreviation_detector = None
    subsentence_extractor = NoneSubsentenceExtractor()
    # TODO: Citation parser
    # TODO: Store number/label

    def _repr_html_(self):
        # The text is inserted as-is; no HTML escaping is applied here.
        return '<p class="cde-citation">' + self.text + '</p>'


class Caption(Text):
    """A :class:`Text` element for a caption, displayed as a ``<caption>`` in HTML representations."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Caption, self).__init__(text, **kwargs)
        # Assigned after the parent initialiser, so the element starts with an empty model list.
        self.models = []
        # default_parsers = [CompoundParser(), ChemicalLabelParser(), CaptionContextParser()]

    def _repr_html_(self):
        # The text is inserted as-is; no HTML escaping is applied here.
        return '<caption class="cde-caption">' + self.text + '</caption>'

    @property
    def definitions(self):
        """A flat list of the specifier definitions found in the sentences of this caption."""
        return [definition for sent in self.sentences for definition in sent.definitions]


[docs]class Sentence(BaseText):
    """A single sentence within a text passage."""

    word_tokenizer = BertWordTokenizer()
    lexicon = ChemLexicon()
    abbreviation_detector = ChemAbbreviationDetector()
    subsentence_extractor = SubsentenceExtractor()
    taggers = [ChemCrfPosTagger(), cem_tagger, DependencyTagger()]
    specifier_definition = specifier_definition

[docs]    def __init__(self, text, start=0, end=None, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, specifier_definition=None, subsentence_extractor=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element.
        :param int start: (Optional) The starting index of the sentence within the containing element. Default 0.
        :param int end: (Optional) The end index of the sentence within the containing element. Defualt None
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.ChemWordTokenizer`.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurences of unique words and can provide
            Brown clusters for the words. Default :class:`~chemdataextractor.nlp.lexicon.ChemLexicon`
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
            Default :class:`~chemdataextractor.nlp.abbrev.ChemAbbreviationDetector`.
        :param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
            Default :class:`~chemdataextractor.nlp.pos.ChemCrfPosTagger`.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
            Default :class:`~chemdataextractor.nlp.cem.CemTagger`
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
            this is set automatically to be the same as that of the containing element, unless manually set otherwise.
        """
        self.models = []
        super(Sentence, self).__init__(text, word_tokenizer=word_tokenizer, lexicon=lexicon, abbreviation_detector=abbreviation_detector, pos_tagger=pos_tagger, ner_tagger=ner_tagger, **kwargs)
        #: The start index of this sentence within the text passage.
        self.start = start
        #: The end index of this sentence within the text passage.
        self.end = end if end is not None else len(text)
        if specifier_definition is not None:
            self.specifier_definition = specifier_definition
        if subsentence_extractor is not None:
            self.subsentence_extractor = subsentence_extractor

    def __repr__(self):
        """Show the class name followed by the text and the start and end offsets."""
        class_name = self.__class__.__name__
        return '{}({!r}, {!r}, {!r})'.format(class_name, self._text, self.start, self.end)

    @memoized_property
    def tokens(self):
        """
        The tokens produced by :attr:`word_tokenizer` for this sentence.

        Every character whose Unicode general category starts with ``C`` is removed
        from each token's text before the tokens are returned.
        """
        word_tokens = self.word_tokenizer.get_word_tokens(self)
        for tok in word_tokens:
            kept_chars = [char for char in tok.text if not unicodedata.category(char).startswith("C")]
            tok.text = "".join(kept_chars)
        return word_tokens

    def _tokens_for_spans(self, spans):
        """
        Build one :class:`RichToken` per ``(start, end)`` span of this sentence's text.

        Span offsets are relative to the sentence; the offsets stored on each token are
        shifted by :attr:`start`.
        """
        rich_tokens = []
        for span in spans:
            begin, stop = span[0], span[1]
            rich_tokens.append(RichToken(
                text=self.text[begin:stop],
                start=begin + self.start,
                end=stop + self.start,
                lexicon=self.lexicon,
                sentence=self
            ))
        return rich_tokens

    @property
    def raw_tokens(self):
        """The text of every token in this sentence, as a list of :class:`str`."""
        texts = []
        for tok in self.tokens:
            texts.append(tok.text)
        return texts

    @memoized_property
    def pos_tagged_tokens(self):
        """A list of (token text, part of speech tag) tuples, one per token in this sentence."""
        pairs = []
        for tok in self.tokens:
            pairs.append((tok.text, tok.pos_tag))
        return pairs

    @property
    def pos_tags(self):
        """The part of speech tag of every token in this sentence, in token order."""
        return [pos_tag for _, pos_tag in self.pos_tagged_tokens]

    @memoized_property
    def unprocessed_ner_tagged_tokens(self):
        """
        A list of (token text, named entity recognition tag) tuples, one per token.

        The tags are read straight from the tokens; no corrections from
        abbreviation detection are performed.
        """
        pairs = []
        for tok in self.tokens:
            pairs.append((tok.text, tok.ner_tag))
        return pairs

    @memoized_property
    def unprocessed_ner_tags(self):
        """
        The unprocessed named entity tag of every token in this sentence, in token order.

        No corrections from abbreviation detection are performed.
        """
        return [ner_tag for _, ner_tag in self.unprocessed_ner_tagged_tokens]

    @memoized_property
    def abbreviation_definitions(self):
        """
        A list of all abbreviation definitions detected in this sentence.

        Each entry is a tuple of (abbreviation tokens, long form tokens, entity type). The entity
        type is the shared named entity type of the long form when every one of its tokens is
        tagged with that single type, and ``None`` otherwise. The list is empty when the
        sentence has no abbreviation detector.
        """
        if not self.abbreviation_detector:
            return []

        found = []
        token_tags = self.unprocessed_ner_tags
        for abbr_span, long_span in self.abbreviation_detector.detect_spans(self.raw_tokens):
            short_form = self.raw_tokens[abbr_span[0]:abbr_span[1]]
            long_form = self.raw_tokens[long_span[0]:long_span[1]]
            # Strip the two-character prefix (e.g. ``B-``) to compare entity types only.
            span_tags = token_tags[long_span[0]:long_span[1]]
            entity_types = {span_tag[2:] for span_tag in span_tags if span_tag is not None}
            if None not in span_tags and len(entity_types) == 1:
                entity_type = span_tags[0][2:]
            else:
                entity_type = None
            found.append((short_form, long_form, entity_type))
        return found

    @memoized_property
    def ner_tagged_tokens(self):
        """
        A list of (token text, named entity recognition tag) tuples for this sentence,
        using the abbreviation-corrected :attr:`ner_tags`.
        """
        words = self.raw_tokens
        entity_tags = self.ner_tags
        return [(word, entity_tag) for word, entity_tag in zip(words, entity_tags)]

    @memoized_property
    def ner_tags(self):
        """
        A list of named entity tags corresponding to each of the tokens in the object.
        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` used for this object.

        Starting from :attr:`unprocessed_ner_tags`, every occurrence of a known abbreviation is
        re-tagged with the entity type of its long form (``B-``/``I-`` prefixed, or ``None`` when
        the long form has no single type), and a bracket directly surrounding the abbreviation
        is untagged. Abbreviation definitions come from the document when there is one,
        otherwise from this sentence alone.
        """
        # log.debug('Getting ner_tags')
        # Work on a copy: ``unprocessed_ner_tags`` is memoized, so editing it in place would leak
        # the abbreviation corrections into the supposedly uncorrected tags.
        ner_tags = list(self.unprocessed_ner_tags)
        # ``raw_tokens`` is rebuilt on every access, so read it once instead of inside the loops.
        raw_tokens = self.raw_tokens
        num_tokens = len(raw_tokens)
        abbrev_defs = self.document.abbreviation_definitions if self.document else self.abbreviation_definitions
        # Ensure abbreviation entity matches long entity
        for i in range(len(ner_tags)):
            for abbr, long_form, ner_tag in abbrev_defs:
                abbr_end = i + len(abbr)
                if abbr != raw_tokens[i:abbr_end]:
                    continue
                old_ner_tags = ner_tags[i:abbr_end]
                ner_tags[i] = 'B-%s' % ner_tag if ner_tag is not None else None
                ner_tags[i+1:abbr_end] = ['I-%s' % ner_tag if ner_tag is not None else None] * (len(abbr) - 1)
                # Remove ner tags from brackets surrounding abbreviation. The opening bracket may
                # be the very first token, and the closing bracket sits just past the whole
                # abbreviation, which is not necessarily at i+1.
                if i > 0 and raw_tokens[i-1] == '(':
                    ner_tags[i-1] = None
                if abbr_end < num_tokens and raw_tokens[abbr_end] == ')':
                    ner_tags[abbr_end] = None
                if old_ner_tags != ner_tags[i:abbr_end]:
                    log.debug('Correcting abbreviation tag: %s (%s): %s -> %s' % (' '.join(abbr), ' '.join(long_form), old_ner_tags, ner_tags[i:abbr_end]))
        # TODO: Ensure abbreviations in brackets at the end of an entity match are separated and the brackets untagged
        # Hydrogen Peroxide (H2O2)
        # Tungsten Carbide (WC)
        # TODO: Filter off alphanumerics from end (1h) (3) (I)
        # May need more intelligent
        return ner_tags

    @memoized_property
    def cems(self):
        """
        A list of all Chemical Entity Mentions in this text as :class:`~chemdataextractor.doc.text.Span`

        Every match of the ``chemical_name`` parser over :attr:`tokens` is post-processed:

        1. The first matching entry of ``IGNORE_PREFIX`` / ``IGNORE_SUFFIX`` is trimmed off.
        2. A pair of round or square brackets enclosing the whole remaining text is removed.
        3. The text is cut at separators and, if every component matches one of ``SPLITS``,
           each component becomes its own span.
        4. For the first ``SPECIALS`` pattern matching a span, the capture groups replace the span.

        :return: The spans, with offsets on the same scale as the token offsets.
        :rtype: list[Span]
        """
        spans = []
        for result in chemical_name.scan(self.tokens):
            # parser scan yields (result, startindex, endindex) - we just use the indexes here
            tokens = self.tokens[result[1]:result[2]]
            start = tokens[0].start
            end = tokens[-1].end
            # Adjust boundaries to exclude disallowed prefixes/suffixes
            currenttext = self.text[start-self.start:end-self.start].lower()
            for prefix in IGNORE_PREFIX:
                if currenttext.startswith(prefix):
                    start += len(prefix)
                    break
            for suffix in IGNORE_SUFFIX:
                if currenttext.endswith(suffix):
                    end -= len(suffix)
                    break
            # Adjust boundaries to exclude matching brackets at start and end
            currenttext = self.text[start-self.start:end-self.start]
            for bpair in [('(', ')'), ('[', ']')]:
                if len(currenttext) > 2 and currenttext[0] == bpair[0] and currenttext[-1] == bpair[1]:
                    level = 1
                    for k, char in enumerate(currenttext[1:]):
                        if char == bpair[0]:
                            level += 1
                        elif char == bpair[1]:
                            level -= 1
                        if level == 0:
                            # The leading bracket has just been closed. It encloses the whole
                            # text only if that happened on the final character; if it closed
                            # earlier (e.g. "(a)-(b)") the first and last brackets are not a
                            # pair and must be left alone.
                            if k == len(currenttext) - 2:
                                start += 1
                                end -= 1
                            break

            # If entity has been reduced to nothing by adjusting boundaries, skip it
            if start >= end:
                continue

            currenttext = self.text[start-self.start:end-self.start]

            # Do splits
            split_spans = []
            comps = list(regex_span_tokenize(currenttext, r'(-|\+|\)?-to-\(?|···|/|\s)'))
            if len(comps) > 1:
                for split in SPLITS:
                    if all(re.search(split, currenttext[comp[0]:comp[1]]) for comp in comps):
                        for comp in comps:
                            span = Span(text=currenttext[comp[0]:comp[1]], start=start+comp[0], end=start+comp[1])
                            split_spans.append(span)
                        break
                else:
                    # No split pattern matched every component: keep the entity whole
                    split_spans.append(Span(text=currenttext, start=start, end=end))
            else:
                split_spans.append(Span(text=currenttext, start=start, end=end))

            # Do specials
            for split_span in split_spans:
                for special in SPECIALS:
                    m = re.search(special, split_span.text)
                    if m:
                        # Each capture group of the first matching pattern becomes a span
                        for i in range(1, len(m.groups()) + 1):
                            span = Span(text=m.group(i), start=split_span.start+m.start(i), end=split_span.start+m.end(i))
                            spans.append(span)
                        break
                else:
                    spans.append(split_span)
        return spans

    @memoized_property
    def definitions(self):
        """
        Return specifier definitions from this sentence

        Each entry is a dictionary with the keys:
        a) ``definition`` -- The quantity being defined e.g. "Curie Temperature"
        b) ``specifier`` -- The symbol used to define the quantity e.g. "Tc"
        c) ``tokens`` -- The tokens covered by the definition
        d) ``start`` -- The index of the starting point of the definition
        e) ``end`` -- The index of the end point of the definition

        :return: list -- The specifier definitions
        """
        sentence_tokens = self.tokens
        found = []
        for match in self.specifier_definition.scan(sentence_tokens):
            # scan yields (parse tree, start index, end index)
            tree, begin, stop = match[0], match[1], match[2]
            found.append({
                'definition': first(tree.xpath('./phrase/text()')),
                'specifier': first(tree.xpath('./specifier/text()')),
                'tokens': sentence_tokens[begin:stop],
                'start': begin,
                'end': stop,
            })
        return found

    @memoized_property
    def chemical_definitions(self):
        """
        Return a list of chemical entity mentions and their associated label.

        Each entry is a dictionary with the keys ``name``, ``label``, ``start`` and ``end``, where
        the last two are the indexes reported by the ``cem_phrase`` scan. Matches that lack
        either a name or a label are left out.
        """
        found = []
        for match in cem_phrase.scan(self.tokens):
            tree, begin, stop = match[0], match[1], match[2]
            name = first(tree.xpath('./compound/names/text()'))
            label = first(tree.xpath('./compound/labels/text()'))
            if not (name and label):
                continue
            found.append({
                'name': name,
                'label': label,
                'start': begin,
                'end': stop
            })
        return found

    @memoized_property
    def tags(self):
        """
        The combined tag for every token: the NER tag where there is one, the POS tag otherwise.

        :return: One tag per token.
        :rtype: list
        """
        # Work on a copy so that the list handed back by ``pos_tags`` is never modified in place.
        tags = list(self.pos_tags)
        for i, tag in enumerate(self.ner_tags):
            if tag is not None:
                tags[i] = tag
        return tags

    @property
    @deprecated(deprecated_in="2.1", details="Deprecated due to the introduction of RichTokens, and is now just an alias for .tokens.")
    def tagged_tokens(self):
        """
        Deprecated alias for :attr:`tokens`.

        :return: The same list of :class:`~chemdataextractor.doc.text.RichToken` instances that :attr:`tokens` gives.
        """
        return self.tokens

    def _assign_tags(self, tag_type):
        """
        Assign tags for each token, with some intelligence with regards to which method to use for tagging.
        See :class:`~chemdataextractor.nlp.tag.BaseTagger` and :ref:`this guide<creating_taggers>` for more information.

        Only one tagger is used: the last one in :attr:`taggers` whose ``can_tag`` accepts ``tag_type``.
        Nothing happens when no tagger accepts it.

        :param tag_type: The tag type to produce; used as the key into each token's ``_tags``.
        """
        tagger = next((candidate for candidate in reversed(self.taggers) if candidate.can_tag(tag_type)), None)
        if tagger is None:
            return

        # ``tags`` stays None whenever tagging is delegated to the document, which is
        # then responsible for storing the results itself.
        tags = None
        if hasattr(tagger, "batch_tag_for_type") and tagger.can_batch_tag(tag_type) and self.document is not None:
            self.document._batch_assign_tags(tagger, tag_type)
        elif hasattr(tagger, "tag_for_type"):
            tags = tagger.tag_for_type(self.tokens, tag_type)
        elif hasattr(tagger, "batch_tag") and self.document is not None:
            self.document._batch_assign_tags(tagger, tag_type)
        elif hasattr(tagger, "tag"):
            tags = tagger.tag(self.tokens)
        else:
            tags = tagger.legacy_tag(self.raw_tokens)

        if tags is None:
            return
        # Each result is a (token, tag) pair; only the tag is stored.
        for position, tagged in enumerate(tags):
            self.tokens[position]._tags[tag_type] = tagged[1]

    @property
    def quantity_re(self):
        """The value of ``construct_quantity_re`` called with this element's streamlined models."""
        models = self._streamlined_models
        return construct_quantity_re(*models)

    @memoized_property
    def subsentences(self):
        """
        The :class:`Subsentence` instances built from the token groups that
        :attr:`subsentence_extractor` returns for this sentence.

        When there is exactly one, it is flagged as the only subsentence.
        """
        pieces = [
            Subsentence(self, piece_tokens)
            for piece_tokens in self.subsentence_extractor.subsentences(self)
        ]
        if len(pieces) == 1:
            pieces[0]._is_only_subsentence = True
        return pieces

    @memoized_property
    def full_subsentence(self):
        """A :class:`Subsentence` spanning all tokens of this sentence, marked as the full sentence."""
        return Subsentence(self, self.tokens, is_full_sentence=True)

    @property
    def records(self):
        """
        All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`.

        Records of the individual subsentences (plus those of the full sentence when there is not
        exactly one subsentence) are pooled, merged into each other, cleaned, filtered on their
        non-contextual requirements and returned in order of decreasing total confidence.
        """
        if len(self.subsentences) == 1:
            records = ModelList()
        else:
            records = self.full_subsentence.records

        for subsentence in self.subsentences:
            records.extend(subsentence.records)
        records.remove_subsets()

        # Let every record merge in every other record
        total = len(records)
        for source_index in range(total):
            for target_index in range(total):
                if source_index != target_index:
                    records[target_index].merge_all(records[source_index])

        kept = []
        for record in records:
            record._clean(clean_contextual=False)
            if record.noncontextual_required_fulfilled:
                kept.append(record)
        kept = ModelList(*kept)

        if self.document and self.document._should_remove_subrecord_if_merged_in:
            kept._remove_used_subrecords()

        def confidence_key(record):
            # Records without a confidence sort after every record that has one
            if record.total_confidence() is None:
                return -10000
            return record.total_confidence()

        return ModelList(*sorted(kept, key=confidence_key, reverse=True))

    def __add__(self, other):
        """
        Concatenate two elements of exactly the same type into a new element.

        The text and references of both operands are joined. The start offset, tokenizer, lexicon,
        abbreviation detector and taggers come from the left operand, and the id from whichever
        operand has one (left first). ``end`` is passed as ``None``.

        :return: The merged element, or ``NotImplemented`` if ``other`` has a different type.
        """
        if not type(self) == type(other):
            return NotImplemented
        return self.__class__(
            text=self.text + other.text,
            start=self.start,
            end=None,
            id=self.id or other.id,
            references=self.references + other.references,
            word_tokenizer=self.word_tokenizer,
            lexicon=self.lexicon,
            abbreviation_detector=self.abbreviation_detector,
            pos_tagger=self.pos_tagger,
            ner_tagger=self.ner_tagger,
        )


class Subsentence(Sentence):
    """
    A sub-sentence level logical division of text. Used to store clauses in CDE based on clause extraction as described
    in the paper Automated Construction of a Photocatalysis Dataset for Water-Splitting Applications
    (https://www.nature.com/articles/s41597-023-02511-6).
    An example of subsentences would be “A has quality α” and “A has quality β” from the sentence
    “A has quality α and quality β”. This enables rule-based and template-based parsing to adapt to a wider range
    of sentences.
    """
    # NOTE(review): presumably shadows the ``tokens`` attribute inherited from Sentence so that
    # ``__init__`` can assign the token list directly -- confirm against Sentence.
    tokens = []

    def __init__(self, parent_sentence, tokens, is_full_sentence=False):
        """
        :param Sentence parent_sentence: The sentence this subsentence belongs to; its models and document are reused.
        :param list tokens: The tokens of this subsentence. The text is their ``text`` joined with single spaces.
        :param bool is_full_sentence: Whether this subsentence stands for the whole parent sentence.
        """
        super().__init__(' '.join([token.text for token in tokens]))
        self.tokens = tokens
        self.models = parent_sentence.models
        self.parent_sentence = parent_sentence
        self.document = self.parent_sentence.document
        self.is_full_sentence = is_full_sentence
        # Set to True by the parent sentence when this is its one and only subsentence
        self._is_only_subsentence = False

    @property
    def records(self):
        """
        All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`.

        Every sentence parser of every streamlined model is run, except those listed in the
        document's ``skip_parsers``. Unless this is the only subsentence, a parser is only run when
        its ``parse_full_sentence`` flag equals :attr:`is_full_sentence`.
        """
        records = ModelList()
        seen_labels = set()
        skip_parsers = self.document.skip_parsers if self.document is not None else []

        for model in self._streamlined_models:
            for parser in model.parsers:
                if parser in skip_parsers or not hasattr(parser, 'parse_sentence'):
                    continue
                if parser.parse_full_sentence != self.is_full_sentence and not self._is_only_subsentence:
                    continue
                for record in parser.parse_sentence(self):
                    serialized = record.serialize()
                    if record.is_empty:  # TODO: Potential performance issues?
                        continue
                    # Skip duplicate records
                    if record in records:
                        continue
                    # Exact type check: subclasses of Compound take the ``compound`` branch below
                    is_compound = type(record) is Compound
                    # Skip just labels that have already been seen (bit of a hack)
                    if (is_compound and 'Compound' in serialized
                            and all(key in {'labels', 'roles'} for key in serialized['Compound'])
                            and set(record.labels).issubset(seen_labels)):
                        continue
                    if is_compound:
                        seen_labels.update(record.labels)
                        # This could be super slow if we find lots of things
                        merged_into_existing = False
                        for seen_record in records:
                            if not isinstance(seen_record, Compound):
                                continue
                            if (set(record.names).isdisjoint(seen_record.names)
                                    and set(record.labels).isdisjoint(seen_record.labels)):
                                continue
                            # Shares a name or a label with a compound we already have: fold it in
                            seen_record.names = sorted(set(seen_record.names).union(record.names))
                            seen_record.labels = sorted(set(seen_record.labels).union(record.labels))
                            seen_record.roles = sorted(set(seen_record.roles).union(record.roles))
                            merged_into_existing = True
                        if merged_into_existing:
                            continue
                    elif hasattr(record, 'compound') and record.compound is not None:
                        seen_labels.update(record.compound.labels)
                    records.append(record)

        # Let every record merge in every other record
        total = len(records)
        for source_index in range(total):
            for target_index in range(total):
                if source_index != target_index:
                    records[target_index].merge_all(records[source_index])
        return records


class Cell(Sentence):
    """Data cell for tables. One row of the category table"""
    # It appears that using different tokenizers/taggers is making the cem recognition worse.
    # This is also consistent with the use of the regular expressions etc we have defined so far.
    # word_tokenizer = FineWordTokenizer()
    # pos_tagger = NoneTagger()
    # ner_tagger = NoneTagger()
    subsentence_extractor = NoneSubsentenceExtractor()

    def __init__(self, *args, **kwargs):
        """
        Takes the same arguments as :class:`Sentence`. The table-specific attributes start out
        empty and are filled in by :meth:`from_tdecell`.
        """
        super(Cell, self).__init__(*args, **kwargs)
        self.data = None
        self.row_categories = None
        self.col_categories = None
        self.is_tde_cell = False
        self.data_sent = None
        self.row_categories_sents = None
        self.col_categories_sents = None

    @classmethod
    def from_tdecell(cls, tde_cell, document=None, **kwargs):
        """
        Create a cell from one entry of a category table.

        :param tde_cell: A sequence holding the data text, the list of row category texts and the
            list of column category texts, in that order.
        :param document: The document the cell belongs to, if any.
        :param kwargs: Passed on to the constructor.
        :return: The new cell, with its tokens assembled from those of the data and category sentences.
        :rtype: Cell
        """
        # Have the spacing between the cells contain characters that will never be found
        # so that the system doesn't become confused because it found some number in the heading
        # that it confuses as a power for a unit.
        separator = '🙃🙃🙃🙃'
        category_joiner = ';'
        text = ' '.join([tde_cell[0], separator, category_joiner.join(tde_cell[1]), separator, category_joiner.join(tde_cell[2])])
        cell = cls(text, document=document, **kwargs)
        cell.data = tde_cell[0]
        cell.row_categories = tde_cell[1]
        cell.col_categories = tde_cell[2]
        cell.data_sent = Sentence(cell.data)
        cell.row_categories_sents = [Sentence(cell_text) for cell_text in cell.row_categories]
        cell.col_categories_sents = [Sentence(cell_text) for cell_text in cell.col_categories]
        cell.is_tde_cell = True
        cell.document = document

        def separator_at(offset):
            """Create a separator token starting at the given offset."""
            return RichToken(separator, offset, offset + len(separator), cls.lexicon, cell)

        def append_categories(tokens, category_sents, offset):
            """Append the shifted tokens of each category sentence, with a separator token between sentences."""
            for category_sent in category_sents:
                for token in category_sent.tokens:
                    token.start += offset
                    token.end += offset
                    tokens.append(token)
                # NOTE(review): the offset is placed at the start of the separator appended next and
                # is not moved past it, so the following category is shifted onto that same offset.
                offset = tokens[-1].end + 1
                tokens.append(separator_at(offset))
            if category_sents:
                # Drop the separator that was appended after the last category
                tokens = tokens[:-1]
            return tokens, offset

        # Doing it this way means that a lot of RichTokens are shared between the sub-elements, resulting
        # in tagging being only done once per RichToken, which is faster.
        # NOTE(review): this is the very list returned by ``cell.data_sent.tokens``; it is extended in
        # place until a slice in ``append_categories`` replaces it with a new list.
        tokens = cell.data_sent.tokens
        # A cursor to help getting the span location correct when extending the token list.
        # Empty data yields no tokens, in which case the data is taken to end at offset 0.
        last_end = tokens[-1].end if tokens else 0
        span_offset = last_end + 1

        separator_token = separator_at(span_offset)
        span_offset = separator_token.end + 1
        tokens.append(separator_token)
        tokens, span_offset = append_categories(tokens, cell.row_categories_sents, span_offset)

        separator_token = separator_at(span_offset)
        span_offset = separator_token.end + 1
        tokens.append(separator_token)
        tokens, span_offset = append_categories(tokens, cell.col_categories_sents, span_offset)

        for token in tokens:
            token.sentence = cell

        cell._tokens = tokens

        return cell

    @memoized_property
    def abbreviation_definitions(self):
        """Empty list. Abbreviation detection is disabled within table cells."""
        return []

    @property
    def records(self):
        """Empty list. Individual cells don't provide records, this is handled by the parent Table."""
        return []

    @property
    def elements(self):
        """Empty list. The data and category sentences are currently not exposed as sub-elements."""
        elements = []
        # if self.data_sent is not None:
        #     elements.append(self.data_sent)
        #     elements.extend(self.row_categories_sents)
        #     elements.extend(self.col_categories_sents)
        return elements


@python_2_unicode_compatible
class Span(object):
    """A text span within a sentence."""

    def __init__(self, text, start, end):
        """
        :param str text: The text contained by this span.
        :param int start: The start offset of this span in the original text.
        :param int end: The end offset of this span in the original text.
        """
        self.text = text
        """The :class:`str` text content of this span."""
        self.start = start
        """The :class:`int` start offset of this span in the original text."""
        self.end = end
        """The :class:`int` end offset of this span in the original text."""

    def __repr__(self):
        return '%s(%r, %r, %r)' % (self.__class__.__name__, self.text, self.start, self.end)

    def __str__(self):
        return self.text

    def __eq__(self, other):
        """Span objects are equal if the source text is equal, and the start and end indices are equal."""
        # ``other`` has to be an instance of this object's own class (or a subclass of it)
        if not isinstance(other, self.__class__):
            return False
        return self.text == other.text and self.start == other.start and self.end == other.end

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Built from the same three fields that __eq__ compares
        return hash((self.text, self.start, self.end))

    @property
    def length(self):
        """The :class:`int` offset length of this span in the original text."""
        return self.end - self.start


class Token(Span):
    """A single token within a sentence. Corresponds to a word, character, punctuation etc."""

    def __init__(self, text, start, end, lexicon):
        """
        :param str text: The text contained by this token.
        :param int start: The start offset of this token in the original text.
        :param int end: The end offset of this token in the original text.
        :param Lexicon lexicon: The lexicon which contains this token.
        """
        super(Token, self).__init__(text, start, end)
        #: The lexicon for this token.
        self.lexicon = lexicon
        # Register the text with the lexicon; ``lex`` later looks the same text up in it.
        self.lexicon.add(text)

    @property
    def lex(self):
        """The corresponding :class:`chemdataextractor.nlp.lexicon.Lexeme` entry in the Lexicon for this token."""
        return self.lexicon[self.text]


class RichToken(Token):
    """
    :class:`~chemdataextractor.doc.text.RichToken` provides a flexible way to store properties related to tokens.
    :class:`~chemdataextractor.doc.text.RichToken` instances hold a reference to the parent sentence they come from, and if the user
    desires a certain tag, the parent sentence is called and its taggers used to tag
    the sentence on demand. This structure means that tokens are tagged *if and only if*
    the user requires them. These tags are then cached by the :class:`~chemdataextractor.doc.text.RichToken` so that any single token
    is only ever tagged once.

    Such tags can be accessed either via dot syntax (:python:`token.ner_tag`) or
    via dictionary syntax (:python:`token['ner_tag']`). To maintain compatibility with
    the return value for :meth:`~chemdataextractor.doc.text.Sentence.tagged_tokens` from previous
    versions of ChemDataExtractor, the keys of :python:`0` and :python:`1` are reserved for the
    text of the token and the combined NER and PoS tags, respectively. Furthermore, any properties
    included in the :class:`~chemdataextractor.doc.text.Token` class are reserved as well.

    .. note::

        By default, ChemDataExtractor provides, and assumes that calling :python:`.ner_tag` and
        :python:`.pos_tag` on a :class:`~chemdataextractor.doc.text.RichToken`
        will not fail, which should be taken into account when setting the :python:`taggers` property on any
        :class:`~chemdataextractor.doc.text.BaseText` subclasses.
    """

[docs]    def __init__(self, text, start, end, lexicon, sentence):
        super(RichToken, self).__init__(text, start, end, lexicon)
        self.sentence = sentence
        self._tags = {}

    @classmethod
    def _from_token(cls, token, sentence):
        """Create an instance carrying the text, offsets and lexicon of ``token``, attached to ``sentence``."""
        return cls(
            text=token.text,
            start=token.start,
            end=token.end,
            lexicon=token.lexicon,
            sentence=sentence,
        )

    @property
    def legacy_pos_tag(self):
        """The NER tag of this token if it has one other than ``"O"``, otherwise its POS tag."""
        pos = self[POS_TAG_TYPE]
        ner = self[NER_TAG_TYPE]
        if ner is None or ner == "O":
            return pos
        return ner

    def __getitem__(self, key):
        """
        Look up the token text (key ``0``), the combined legacy tag (key ``1``) or any tag by name.

        :raises IndexError: For any other key that is not a string.
        """
        if key == 0:
            return self.text
        if key == 1:
            return self.legacy_pos_tag
        if not isinstance(key, str):
            raise IndexError("Key" + str(key) + " is out of bounds for this token.")
        return self.__getattr__(key)

    def __getattr__(self, name):
        if name in self._tags.keys():
            return self._tags[name]
        else:
            self.sentence._assign_tags(name)
            if name not in self._tags.keys():
                raise AttributeError(name + " is not a supported tag type for the sentence: " + str(self.sentence) + str(self.sentence.taggers) + str(type(self.sentence))
                                     + str(self._tags) + str(self))
            return self._tags[name]