# -*- coding: utf-8 -*-
"""
Text-based document elements.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import abstractproperty
try:
    from collections.abc import Sequence
except ImportError:  # Python 2 fallback
    from collections import Sequence
import logging
import re
from deprecation import deprecated
import unicodedata
import six
from ..model.base import ModelList, sort_merge_candidates
from ..nlp.lexicon import ChemLexicon, Lexicon
from ..nlp.cem import IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS, CiDictCemTagger, CsDictCemTagger, CrfCemTagger
from ..nlp.new_cem import CemTagger
from ..nlp.abbrev import ChemAbbreviationDetector
from ..nlp.tag import NoneTagger, POS_TAG_TYPE, NER_TAG_TYPE
from ..nlp.pos import ChemCrfPosTagger, CrfPosTagger, ApPosTagger, ChemApPosTagger
# from ..nlp.tokenize import ChemSentenceTokenizer, ChemWordTokenizer, regex_span_tokenize, SentenceTokenizer, WordTokenizer, FineWordTokenizer, ChemTokWordTokenizer, SpacyTokenizer
from ..nlp.tokenize import BertWordTokenizer, ChemSentenceTokenizer, regex_span_tokenize, SentenceTokenizer, WordTokenizer
from ..nlp.subsentence import SubsentenceExtractor, NoneSubsentenceExtractor
from ..nlp.dependency import DependencyTagger, IndexTagger
from ..text import CONTROL_RE
from ..utils import memoized_property, python_2_unicode_compatible, first
from .element import BaseElement
from ..parse.definitions import specifier_definition
from ..parse.cem import chemical_name, cem_phrase
from ..parse.quantity import construct_quantity_re
from ..model.model import Compound, NmrSpectrum, IrSpectrum, UvvisSpectrum, MeltingPoint, GlassTransition
from ..model.contextual_range import SentenceRange
log = logging.getLogger(__name__)
cem_tagger = CemTagger()
@python_2_unicode_compatible
class BaseText(BaseElement):
"""Abstract base class for a text Document Element."""
taggers = []
"""
A list of :class:`~chemdataextractor.nlp.tag.BaseTagger` instances. This is a list of taggers
that will be called by ChemDataExtractor to assign tags to each of the tokens in this element.
"""
    def __init__(self, text, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, taggers=None, **kwargs):
"""
.. note::
If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
an element should either be initialized with a reference to its containing document,
or its :attr:`document` attribute should be set as soon as possible.
If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
to initialise it, the :attr:`document` attribute is automatically set
during the initialisation of the document, so the user does not need to worry about this.
:param str text: The text contained in this element.
:param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words.
:param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
:param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
:param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
:param Document document: (Optional) The document containing this element.
:param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
:param Any id: (Optional) Some identifier for this element. Must be equatable.
:param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
this is set automatically to be the same as that of the containing element, unless manually set otherwise.
"""
if not isinstance(text, six.text_type):
raise TypeError('Text must be a unicode string')
super(BaseText, self).__init__(**kwargs)
self._text = text
self.word_tokenizer = word_tokenizer if word_tokenizer is not None else self.word_tokenizer
self.lexicon = lexicon if lexicon is not None else self.lexicon
self.abbreviation_detector = abbreviation_detector if abbreviation_detector is not None else self.abbreviation_detector
if taggers is not None and len(taggers) != 0:
self.taggers = taggers
def __repr__(self):
return '%s(id=%r, references=%r, text=%r)' % (self.__class__.__name__, self.id, self.references, self._text)
def __str__(self):
return self._text
@property
def text(self):
"""The raw text :class:`str` for this passage of text."""
return self._text
@abstractproperty
def word_tokenizer(self):
"""The :class:`~chemdataextractor.nlp.tokenize.WordTokenizer` used by this element."""
return
@abstractproperty
def lexicon(self):
"""The :class:`~chemdataextractor.nlp.lexicon.Lexicon` used by this element."""
return
@property
def pos_tagger(self):
"""The part of speech tagger used by this element. A subclass of :class:`~chemdataextractor.nlp.tag.BaseTagger`"""
for tagger in reversed(self.taggers):
if tagger.can_tag(POS_TAG_TYPE):
return tagger
@pos_tagger.setter
@deprecated(deprecated_in="2.1", details="Deprecated due to the introduction of taggers API. Consult the migration guide for more information.")
def pos_tagger(self, value):
if not value.can_tag(POS_TAG_TYPE):
value.tag_type = POS_TAG_TYPE
self.taggers.append(value)
@property
def ner_tagger(self):
"""The named entity recognition tagger used by this element. A subclass of :class:`~chemdataextractor.nlp.tag.BaseTagger`"""
for tagger in reversed(self.taggers):
if tagger.can_tag(NER_TAG_TYPE):
return tagger
@ner_tagger.setter
@deprecated(deprecated_in="2.1", details="Deprecated due to the introduction of taggers API. Consult the migration guide for more information.")
def ner_tagger(self, value):
if not value.can_tag(NER_TAG_TYPE):
value.tag_type = NER_TAG_TYPE
self.taggers.append(value)
@abstractproperty
def tokens(self):
"""A list of :class:`RichToken` s for this object."""
return
@abstractproperty
def tags(self):
"""
A list of tags corresponding to each of the tokens in the object.
For information on what each of the tags can be, check the documentation on
the specific :attr:`ner_tagger` and :attr:`pos_tagger` used for this class.
"""
return
@abstractproperty
def definitions(self):
"""
A list of all specifier definitions
"""
return
@abstractproperty
def chemical_definitions(self):
"""A list of all chemical label definitiond
"""
return
    def serialize(self):
"""
Convert self to a dictionary. The key 'type' will contain
the name of the class being serialized, and the key 'content' will contain
a serialized representation of :attr:`text`, which is a :class:`str`
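
        Example (a minimal sketch; any subclass serializes the same way)::

            >>> Paragraph('Water boils at 100 °C.').serialize()
            {'type': 'Paragraph', 'content': 'Water boils at 100 °C.'}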
"""
data = {'type': self.__class__.__name__, 'content': self.text}
return data
def _repr_html_(self):
return self.text
class Text(Sequence, BaseText):
"""A passage of text, comprising one or more sentences."""
sentence_tokenizer = ChemSentenceTokenizer()
word_tokenizer = BertWordTokenizer()
lexicon = ChemLexicon()
abbreviation_detector = ChemAbbreviationDetector()
taggers = [ChemCrfPosTagger(), cem_tagger, DependencyTagger()]
subsentence_extractor = None
    def __init__(self, text, sentence_tokenizer=None, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, parsers=None, **kwargs):
"""
.. note::
If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
an element should either be initialized with a reference to its containing document,
or its :attr:`document` attribute should be set as soon as possible.
If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
to initialise it, the :attr:`document` attribute is automatically set
during the initialisation of the document, so the user does not need to worry about this.
:param str text: The text contained in this element.
:param SentenceTokenizer sentence_tokenizer: (Optional) Sentence tokenizer for this element.
Default :class:`~chemdataextractor.nlp.tokenize.ChemSentenceTokenizer`.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.BertWordTokenizer`.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words. Default :class:`~chemdataextractor.nlp.lexicon.ChemLexicon`.
:param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
Default :class:`~chemdataextractor.nlp.abbrev.ChemAbbreviationDetector`.
:param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
Default :class:`~chemdataextractor.nlp.pos.ChemCrfPosTagger`.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
            Default :class:`~chemdataextractor.nlp.new_cem.CemTagger`.
:param Document document: (Optional) The document containing this element.
:param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
:param Any id: (Optional) Some identifier for this element. Must be equatable.
:param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
this is set automatically to be the same as that of the containing element, unless manually set otherwise.
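
        Example (a minimal sketch; the sentence split shown assumes the default tokenizers)::

            >>> text = Text('The melting point of water is 0 °C. It boils at 100 °C.')
            >>> len(text.sentences)
            2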
"""
super(Text, self).__init__(text, word_tokenizer=word_tokenizer, lexicon=lexicon, abbreviation_detector=abbreviation_detector, pos_tagger=pos_tagger, ner_tagger=ner_tagger, parsers=None, **kwargs)
self.sentence_tokenizer = sentence_tokenizer if sentence_tokenizer is not None else self.sentence_tokenizer
def __getitem__(self, index):
return self.sentences[index]
def __len__(self):
return len(self.sentences)
    def set_config(self):
        """Load settings from configuration file.

        .. note:: Called when a Document instance is created.
        """
if self.document is None:
pass
else:
c = self.document.config
if 'SENTENCE_TOKENIZER' in c.keys():
self.sentence_tokenizer = eval(c['SENTENCE_TOKENIZER'])()
if 'WORD_TOKENIZER' in c.keys():
self.word_tokenizer = eval(c['WORD_TOKENIZER'])()
if 'POS_TAGGER' in c.keys():
self.pos_tagger = eval(c['POS_TAGGER'])()
if 'NER_TAGGER' in c.keys():
self.ner_tagger = eval(c['NER_TAGGER'])()
if 'LEXICON' in c.keys():
self.lexicon = eval(c['LEXICON'])()
if 'PARSERS' in c.keys():
                raise DeprecationWarning('Manually setting parsers is deprecated; any settings from config files for this will be ignored.')
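    # Example (a sketch): a document config mapping such as
    #     {'SENTENCE_TOKENIZER': 'SentenceTokenizer', 'POS_TAGGER': 'ChemApPosTagger'}
    # instantiates the named classes via eval() in this module's namespace, so only
    # trusted configuration files should be loaded.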
@memoized_property
def sentences(self):
"""A list of :class:`Sentence` s that make up this text passage."""
sents = self.sentence_tokenizer.get_sentences(self)
for sent in sents:
sent.document = self.document
return sents
@property
def elements(self):
return self.sentences
def _sentences_from_spans(self, spans):
sents = []
for span in spans:
sent = Sentence(
text=self.text[span[0]:span[1]],
start=span[0],
end=span[1],
word_tokenizer=self.word_tokenizer,
lexicon=self.lexicon,
abbreviation_detector=self.abbreviation_detector,
pos_tagger=self.pos_tagger,
ner_tagger=self.ner_tagger,
document=self.document,
models=self.models,
taggers=self.taggers,
subsentence_extractor=self.subsentence_extractor,
)
sents.append(sent)
return sents
@property
def raw_sentences(self):
"""A list of :class:`str` for the sentences that make up this text passage."""
return [sentence.text for sentence in self.sentences]
@property
def tokens(self):
return [sent.tokens for sent in self.sentences]
@property
def raw_tokens(self):
"""A list of :class:`str` representations for the tokens of each sentence in this text passage."""
return [sent.raw_tokens for sent in self.sentences]
@property
def pos_tagged_tokens(self):
"""A list of (:class:`Token` token, :class:`str` tag) tuples for each sentence in this text passage."""
return [sent.pos_tagged_tokens for sent in self.sentences]
@property
def pos_tags(self):
"""A list of :class:`str` part of speech tags for each sentence in this text passage."""
return [sent.pos_tags for sent in self.sentences]
@property
def unprocessed_ner_tagged_tokens(self):
"""
A list of (:class:`Token` token, :class:`str` named entity recognition tag)
from the text.
No corrections from abbreviation detection are performed.
"""
return [sent.unprocessed_ner_tagged_tokens for sent in self.sentences]
@property
def unprocessed_ner_tags(self):
"""
A list of :class:`str` unprocessed named entity tags for the tokens in this sentence.
No corrections from abbreviation detection are performed.
"""
return [sent.unprocessed_ner_tags for sent in self.sentences]
@property
def ner_tagged_tokens(self):
"""
A list of (:class:`Token` token, :class:`str` named entity recognition tag)
from the text.
"""
return [sent.ner_tagged_tokens for sent in self.sentences]
@property
def ner_tags(self):
"""
A list of named entity tags corresponding to each of the tokens in the object.
For information on what each of the tags can be, check the documentation on
the specific :attr:`ner_tagger` used for this object.
"""
return [sent.ner_tags for sent in self.sentences]
@property
def cems(self):
"""
        A list of all Chemical Entity Mentions in this text as :class:`~chemdataextractor.doc.text.Span`
"""
return [cem for sent in self.sentences for cem in sent.cems]
@property
def definitions(self):
"""
Return a list of tagged definitions for each sentence in this text passage
"""
return [definition for sent in self.sentences for definition in sent.definitions]
@property
def chemical_definitions(self):
"""
        Return a list of tagged chemical label definitions for each sentence in this text passage
"""
return [definition for sent in self.sentences for definition in sent.chemical_definitions]
@property
@deprecated(deprecated_in="2.1", details="Deprecated due to the introduction of RichTokens, and is now just an alias for .tokens.")
def tagged_tokens(self):
"""
A list of lists of :class:`~chemdataextractor.doc.text.RichToken` instances found in the text.
"""
return [sent.tagged_tokens for sent in self.sentences]
@property
def tags(self):
return [sent.tags for sent in self.sentences]
@property
def abbreviation_definitions(self):
"""
        A list of all abbreviation definitions in this text passage. Each abbreviation is in the form
(:class:`str` abbreviation, :class:`str` long form of abbreviation, :class:`str` ner_tag)
"""
return [ab for sent in self.sentences for ab in sent.abbreviation_definitions]
@property
def records(self):
"""All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
records_by_sentence = [sent.records for sent in self.sentences]
num_sentences = len(records_by_sentence)
for index, sent_records in enumerate(records_by_sentence):
offset = 1
max_offset = max(num_sentences - index, index)
merge_candidates = []
while offset <= max_offset:
backwards_index = index - offset
forwards_index = index + offset
distance = offset * SentenceRange()
                if backwards_index >= 0:
                    merge_candidates.extend((distance, record) for record in records_by_sentence[backwards_index])
if forwards_index < num_sentences:
merge_candidates.extend((distance, record) for record in records_by_sentence[forwards_index])
offset += 1
self._resolve_contextual(sent_records, sort_merge_candidates(merge_candidates))
# Don't sort these records as this encodes where they were found in the paragraph
records = ModelList(*[r for sentence_records in records_by_sentence for r in sentence_records])
records.remove_subsets()
return records
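    # Illustrative behaviour (hedged): given "Compound 1 was prepared." followed by
    # "Its melting point was 100 °C.", the melting point record from the second
    # sentence is offered to the first sentence's records as a merge candidate at a
    # distance of 1 * SentenceRange().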
def _resolve_contextual(self, parent_records, child_records):
for parent_record in parent_records:
for distance, child_record in child_records:
parent_record.merge_contextual(child_record, distance=distance)
def __add__(self, other):
if type(self) == type(other):
merged = self.__class__(
text=self.text + other.text,
id=self.id or other.id,
references=self.references + other.references,
sentence_tokenizer=self.sentence_tokenizer,
word_tokenizer=self.word_tokenizer,
lexicon=self.lexicon,
abbreviation_detector=self.abbreviation_detector,
pos_tagger=self.pos_tagger,
ner_tagger=self.ner_tagger,
)
return merged
return NotImplemented
class Title(Text):
    def __init__(self, text, **kwargs):
super(Title, self).__init__(text, **kwargs)
self.models = []
def _repr_html_(self):
return '<h1 class="cde-title">' + self.text + '</h1>'
class Heading(Text):
    def __init__(self, text, **kwargs):
super(Heading, self).__init__(text, **kwargs)
self.models = []
# default_parsers = [CompoundHeadingParser(), ChemicalLabelParser()]
def _repr_html_(self):
return '<h2 class="cde-title">' + self.text + '</h2>'
class Paragraph(Text):
    def __init__(self, text, **kwargs):
super(Paragraph, self).__init__(text, **kwargs)
# default_parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(),
# TgParser(), ContextParser()]
self.models = []
def _repr_html_(self):
return '<p class="cde-paragraph">' + self.text + '</p>'
class Citation(Text):
taggers = [ChemCrfPosTagger(), NoneTagger(tag_type=NER_TAG_TYPE), NoneTagger(tag_type="dependency"), IndexTagger()]
abbreviation_detector = None
subsentence_extractor = NoneSubsentenceExtractor()
# TODO: Citation parser
# TODO: Store number/label
def _repr_html_(self):
return '<p class="cde-citation">' + self.text + '</p>'
class Caption(Text):
    def __init__(self, text, **kwargs):
super(Caption, self).__init__(text, **kwargs)
self.models = []
# default_parsers = [CompoundParser(), ChemicalLabelParser(), CaptionContextParser()]
def _repr_html_(self):
return '<caption class="cde-caption">' + self.text + '</caption>'
@property
def definitions(self):
return [definition for sent in self.sentences for definition in sent.definitions]
class Sentence(BaseText):
"""A single sentence within a text passage."""
word_tokenizer = BertWordTokenizer()
lexicon = ChemLexicon()
abbreviation_detector = ChemAbbreviationDetector()
subsentence_extractor = SubsentenceExtractor()
taggers = [ChemCrfPosTagger(), cem_tagger, DependencyTagger()]
specifier_definition = specifier_definition
    def __init__(self, text, start=0, end=None, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, specifier_definition=None, subsentence_extractor=None, **kwargs):
"""
.. note::
If intended as part of a :class:`chemdataextractor.doc.document.Document`,
an element should either be initialized with a reference to its containing document,
or its :attr:`document` attribute should be set as soon as possible.
If the element is being passed in to a :class:`chemdataextractor.doc.document.Document`
to initialise it, the :attr:`document` attribute is automatically set
during the initialisation of the document, so the user does not need to worry about this.
:param str text: The text contained in this element.
:param int start: (Optional) The starting index of the sentence within the containing element. Default 0.
        :param int end: (Optional) The end index of the sentence within the containing element. Default None.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.BertWordTokenizer`.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words. Default :class:`~chemdataextractor.nlp.lexicon.ChemLexicon`.
:param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
Default :class:`~chemdataextractor.nlp.abbrev.ChemAbbreviationDetector`.
:param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
Default :class:`~chemdataextractor.nlp.pos.ChemCrfPosTagger`.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
            Default :class:`~chemdataextractor.nlp.new_cem.CemTagger`.
:param Document document: (Optional) The document containing this element.
:param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
:param Any id: (Optional) Some identifier for this element. Must be equatable.
:param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
this is set automatically to be the same as that of the containing element, unless manually set otherwise.
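
        Example (illustrative; tokenisation depends on the word tokenizer configured)::

            >>> sent = Sentence('The melting point is high.')
            >>> sent.raw_tokens
            ['The', 'melting', 'point', 'is', 'high', '.']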
"""
self.models = []
super(Sentence, self).__init__(text, word_tokenizer=word_tokenizer, lexicon=lexicon, abbreviation_detector=abbreviation_detector, pos_tagger=pos_tagger, ner_tagger=ner_tagger, **kwargs)
#: The start index of this sentence within the text passage.
self.start = start
#: The end index of this sentence within the text passage.
self.end = end if end is not None else len(text)
if specifier_definition is not None:
self.specifier_definition = specifier_definition
if subsentence_extractor is not None:
self.subsentence_extractor = subsentence_extractor
def __repr__(self):
return '%s(%r, %r, %r)' % (self.__class__.__name__, self._text, self.start, self.end)
@memoized_property
def tokens(self):
tokens = self.word_tokenizer.get_word_tokens(self)
for token in tokens:
token.text = "".join(ch for ch in token.text if unicodedata.category(ch)[0] != "C")
return tokens
def _tokens_for_spans(self, spans):
toks = [RichToken(
text=self.text[span[0]:span[1]],
start=span[0] + self.start,
end=span[1] + self.start,
lexicon=self.lexicon,
sentence=self
) for span in spans]
return toks
@property
def raw_tokens(self):
"""A list of :class:`str` representations for the tokens in the object."""
return [token.text for token in self.tokens]
@memoized_property
def pos_tagged_tokens(self):
"""A list of (:class:`Token` token, :class:`str` tag) tuples for each sentence in this sentence."""
# log.debug('Getting pos tags')
return [(token.text, token.pos_tag) for token in self.tokens]
@property
def pos_tags(self):
"""A list of :class:`str` part of speech tags for each sentence in this sentence."""
return [token[1] for token in self.pos_tagged_tokens]
@memoized_property
def unprocessed_ner_tagged_tokens(self):
"""
A list of (:class:`Token` token, :class:`str` named entity recognition tag)
from the text.
No corrections from abbreviation detection are performed.
"""
# log.debug('Getting unprocessed_ner_tags')
return [(token.text, token.ner_tag) for token in self.tokens]
@memoized_property
def unprocessed_ner_tags(self):
"""
A list of :class:`str` unprocessed named entity tags for the tokens in this sentence.
No corrections from abbreviation detection are performed.
"""
return [token[1] for token in self.unprocessed_ner_tagged_tokens]
@memoized_property
def abbreviation_definitions(self):
"""
        A list of all abbreviation definitions in this sentence. Each abbreviation is in the form
(:class:`str` abbreviation, :class:`str` long form of abbreviation, :class:`str` ner_tag)
"""
abbreviations = []
if self.abbreviation_detector:
# log.debug('Detecting abbreviations')
ners = self.unprocessed_ner_tags
for abbr_span, long_span in self.abbreviation_detector.detect_spans(self.raw_tokens):
abbr = self.raw_tokens[abbr_span[0]:abbr_span[1]]
long = self.raw_tokens[long_span[0]:long_span[1]]
# Check if long is entirely tagged as one named entity type
long_tags = ners[long_span[0]:long_span[1]]
unique_tags = set([tag[2:] for tag in long_tags if tag is not None])
tag = long_tags[0][2:] if None not in long_tags and len(unique_tags) == 1 else None
abbreviations.append((abbr, long, tag))
return abbreviations
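    # Example (illustrative; the exact entity tag depends on the NER model):
    #   Sentence('Tetrahydrofuran (THF) was used.').abbreviation_definitions
    #   might yield [(['THF'], ['Tetrahydrofuran'], 'CM')].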
@memoized_property
def ner_tagged_tokens(self):
"""
A list of (:class:`Token` token, :class:`str` named entity recognition tag)
from the sentence.
"""
return list(zip(self.raw_tokens, self.ner_tags))
@memoized_property
def ner_tags(self):
"""
A list of named entity tags corresponding to each of the tokens in the object.
For information on what each of the tags can be, check the documentation on
the specific :attr:`ner_tagger` used for this object.
"""
# log.debug('Getting ner_tags')
ner_tags = self.unprocessed_ner_tags
abbrev_defs = self.document.abbreviation_definitions if self.document else self.abbreviation_definitions
# Ensure abbreviation entity matches long entity
# TODO: This is potentially a performance bottleneck?
for i in range(0, len(ner_tags)):
for abbr, long, ner_tag in abbrev_defs:
if abbr == self.raw_tokens[i:i+len(abbr)]:
old_ner_tags = ner_tags[i:i+len(abbr)]
ner_tags[i] = 'B-%s' % ner_tag if ner_tag is not None else None
ner_tags[i+1:i+len(abbr)] = ['I-%s' % ner_tag if ner_tag is not None else None] * (len(abbr) - 1)
# Remove ner tags from brackets surrounding abbreviation
if i > 1 and self.raw_tokens[i-1] == '(':
ner_tags[i-1] = None
if i < len(self.raw_tokens) - 1 and self.raw_tokens[i+1] == ')':
ner_tags[i+1] = None
if not old_ner_tags == ner_tags[i:i+len(abbr)]:
log.debug('Correcting abbreviation tag: %s (%s): %s -> %s' % (' '.join(abbr), ' '.join(long), old_ner_tags, ner_tags[i:i+len(abbr)]))
# TODO: Ensure abbreviations in brackets at the end of an entity match are separated and the brackets untagged
# Hydrogen Peroxide (H2O2)
# Tungsten Carbide (WC)
# TODO: Filter off alphanumerics from end (1h) (3) (I)
        # May need a more intelligent approach
return ner_tags
@memoized_property
def cems(self):
"""
A list of all Chemical Entity Mentions in this text as :class:`~chemdataextractor.doc.text.Span`
"""
# log.debug('Getting cems')
spans = []
# print(self.text.encode('utf8'))
for result in chemical_name.scan(self.tokens):
# parser scan yields (result, startindex, endindex) - we just use the indexes here
tokens = self.tokens[result[1]:result[2]]
start = tokens[0].start
end = tokens[-1].end
# Adjust boundaries to exclude disallowed prefixes/suffixes
currenttext = self.text[start-self.start:end-self.start].lower()
for prefix in IGNORE_PREFIX:
if currenttext.startswith(prefix):
# print('%s removing %s' % (currenttext, prefix))
start += len(prefix)
break
for suffix in IGNORE_SUFFIX:
if currenttext.endswith(suffix):
# print('%s removing %s' % (currenttext, suffix))
end -= len(suffix)
break
# Adjust boundaries to exclude matching brackets at start and end
currenttext = self.text[start-self.start:end-self.start]
for bpair in [('(', ')'), ('[', ']')]:
if len(currenttext) > 2 and currenttext[0] == bpair[0] and currenttext[-1] == bpair[1]:
level = 1
for k, char in enumerate(currenttext[1:]):
if char == bpair[0]:
level += 1
elif char == bpair[1]:
level -= 1
if level == 0 and k == len(currenttext) - 2:
start += 1
end -= 1
break
# If entity has been reduced to nothing by adjusting boundaries, skip it
if start >= end:
continue
currenttext = self.text[start-self.start:end-self.start]
# Do splits
split_spans = []
            comps = list(regex_span_tokenize(currenttext, r'(-|\+|\)?-to-\(?|···|/|\s)'))
if len(comps) > 1:
for split in SPLITS:
if all(re.search(split, currenttext[comp[0]:comp[1]]) for comp in comps):
# print('%s splitting %s' % (currenttext, [currenttext[comp[0]:comp[1]] for comp in comps]))
for comp in comps:
span = Span(text=currenttext[comp[0]:comp[1]], start=start+comp[0], end=start+comp[1])
# print('SPLIT: %s - %s' % (currenttext, repr(span)))
split_spans.append(span)
break
else:
split_spans.append(Span(text=currenttext, start=start, end=end))
else:
split_spans.append(Span(text=currenttext, start=start, end=end))
# Do specials
for split_span in split_spans:
for special in SPECIALS:
m = re.search(special, split_span.text)
if m:
# print('%s special %s' % (split_span.text, m.groups()))
for i in range(1, len(m.groups()) + 1):
span = Span(text=m.group(i), start=split_span.start+m.start(i), end=split_span.start+m.end(i))
# print('SUBMATCH: %s - %s' % (currenttext, repr(span)))
spans.append(span)
break
else:
spans.append(split_span)
return spans
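    # Example (illustrative): for a name matched over '(H2O2)', the enclosing matched
    # brackets are stripped, yielding a Span for 'H2O2' whose offsets point into the
    # original text.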
@memoized_property
def definitions(self):
"""
Return specifier definitions from this sentence
A definition consists of:
a) A definition -- The quantitity being defined e.g. "Curie Temperature"
b) A specifier -- The symbol used to define the quantity e.g. "Tc"
c) Start -- The index of the starting point of the definition
d) End -- The index of the end point of the definition
:return: list -- The specifier definitions
"""
defs = []
tokens = self.tokens
for result in self.specifier_definition.scan(tokens):
definition = result[0]
start = result[1]
end = result[2]
new_def = {
'definition': first(definition.xpath('./phrase/text()')),
'specifier': first(definition.xpath('./specifier/text()')),
'tokens': tokens[start:end],
'start': start,
'end': end}
defs.append(new_def)
return defs
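    # Example (a sketch; actual matches depend on the definition grammar): a sentence
    # such as 'The Curie temperature, Tc, was measured.' might yield an entry like
    #   {'definition': 'Curie temperature', 'specifier': 'Tc',
    #    'tokens': [...], 'start': ..., 'end': ...}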
@memoized_property
def chemical_definitions(self):
"""Return a list of chemical entity mentions and their associated label
"""
cem_defs = []
tokens = self.tokens
for result in cem_phrase.scan(tokens):
tree = result[0]
start = result[1]
end = result[2]
name = first(tree.xpath('./compound/names/text()'))
label = first(tree.xpath('./compound/labels/text()'))
if name and label:
cem_def = {
'name': name,
'label': label,
'start': start,
'end': end
}
cem_defs.append(cem_def)
return cem_defs
@memoized_property
def tags(self):
tags = self.pos_tags
for i, tag in enumerate(self.ner_tags):
if tag is not None:
tags[i] = tag
return tags
@property
@deprecated(deprecated_in="2.1", details="Deprecated due to the introduction of RichTokens, and is now just an alias for .tokens.")
def tagged_tokens(self):
"""
A list of :class:`~chemdataextractor.doc.text.RichToken` instances found in the text.
"""
return self.tokens
def _assign_tags(self, tag_type):
"""
Assign tags for each token, with some intelligence with regards to which method to use for tagging.
See :class:`~chemdataextractor.nlp.tag.BaseTagger` and :ref:`this guide<creating_taggers>` for more information.
"""
for tagger in reversed(self.taggers):
if tagger.can_tag(tag_type):
tags = None
if hasattr(tagger, "batch_tag_for_type") and tagger.can_batch_tag(tag_type) and self.document is not None:
self.document._batch_assign_tags(tagger, tag_type)
elif hasattr(tagger, "tag_for_type"):
tags = tagger.tag_for_type(self.tokens, tag_type)
elif hasattr(tagger, "batch_tag") and self.document is not None:
self.document._batch_assign_tags(tagger, tag_type)
else:
if hasattr(tagger, "tag"):
tags = tagger.tag(self.tokens)
else:
tags = tagger.legacy_tag(self.raw_tokens)
if tags is not None:
for index, tag in enumerate(tags):
self.tokens[index]._tags[tag_type] = tag[1]
break
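    # A minimal custom tagger sketch (hypothetical names) showing the simplest
    # interface that _assign_tags can dispatch to:
    #
    #     from chemdataextractor.nlp.tag import BaseTagger
    #
    #     class AllCapsTagger(BaseTagger):
    #         tag_type = 'allcaps'
    #
    #         def tag(self, tokens):
    #             return [(token, token.text.isupper()) for token in tokens]
    #
    #     Sentence.taggers = Sentence.taggers + [AllCapsTagger()]
    #     Sentence('DNA was extracted.').tokens[0].allcaps  # -> True (assigned lazily)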
@property
def quantity_re(self):
return construct_quantity_re(*self._streamlined_models)
@memoized_property
def subsentences(self):
subsentence_tokens_list = self.subsentence_extractor.subsentences(self)
subsentences = []
for subsentence_tokens in subsentence_tokens_list:
subsentence = Subsentence(self, subsentence_tokens)
subsentences.append(subsentence)
if len(subsentences) == 1:
subsentences[0]._is_only_subsentence = True
return subsentences
@memoized_property
def full_subsentence(self):
subsentence_tokens = self.tokens
return Subsentence(self, subsentence_tokens, is_full_sentence=True)
@property
def records(self):
"""All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
records = ModelList()
if len(self.subsentences) != 1:
records = self.full_subsentence.records
for subsentence in self.subsentences:
records.extend(subsentence.records)
records.remove_subsets()
i = 0
length = len(records)
while i < length:
j = 0
while j < length:
if i != j:
records[j].merge_all(records[i])
j += 1
i += 1
cleaned_records = []
for record in records:
record._clean(clean_contextual=False)
if record.noncontextual_required_fulfilled:
cleaned_records.append(record)
cleaned_records = ModelList(*cleaned_records)
if self.document and self.document._should_remove_subrecord_if_merged_in:
cleaned_records._remove_used_subrecords()
sorted_records = ModelList(*sorted(cleaned_records, key=lambda el: el.total_confidence() if el.total_confidence() is not None else -10000, reverse=True))
return sorted_records
def __add__(self, other):
if type(self) == type(other):
merged = self.__class__(
text=self.text + other.text,
start=self.start,
end=None,
id=self.id or other.id,
references=self.references + other.references,
word_tokenizer=self.word_tokenizer,
lexicon=self.lexicon,
abbreviation_detector=self.abbreviation_detector,
pos_tagger=self.pos_tagger,
ner_tagger=self.ner_tagger,
)
return merged
return NotImplemented
class Subsentence(Sentence):
"""
A sub-sentence level logical division of text. Used to store clauses in CDE based on clause extraction as described
in the paper Automated Construction of a Photocatalysis Dataset for Water-Splitting Applications
(https://www.nature.com/articles/s41597-023-02511-6).
An example of subsentences would be “A has quality α” and “A has quality β” from the sentence
“A has quality α and quality β”. This enables rule-based and template-based parsing to adapt to a wider range
of sentences.
"""
tokens = []
    def __init__(self, parent_sentence, tokens, is_full_sentence=False):
        super(Subsentence, self).__init__(' '.join(token.text for token in tokens))
self.tokens = tokens
self.models = parent_sentence.models
self.parent_sentence = parent_sentence
self.document = self.parent_sentence.document
self.is_full_sentence = is_full_sentence
self._is_only_subsentence = False
@property
def records(self):
"""All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
records = ModelList()
seen_labels = set()
skip_parsers = self.document.skip_parsers if self.document is not None else []
for model in self._streamlined_models:
for parser in model.parsers:
if parser in skip_parsers:
continue
if hasattr(parser, 'parse_sentence'):
if (parser.parse_full_sentence != self.is_full_sentence) and not self._is_only_subsentence:
continue
for record in parser.parse_sentence(self):
p = record.serialize()
if record.is_empty: # TODO: Potential performance issues?
continue
# Skip duplicate records
if record in records:
continue
# Skip just labels that have already been seen (bit of a hack)
if (type(record) == Compound and 'Compound' in p.keys() and all(k in {'labels', 'roles'} for k in p['Compound'].keys())
and set(record.labels).issubset(seen_labels)):
continue
if type(record) == Compound:
seen_labels.update(record.labels)
# This could be super slow if we find lots of things
found = False
for seen_record in records:
if (isinstance(seen_record, Compound)
and (not set(record.names).isdisjoint(seen_record.names)
or not set(record.labels).isdisjoint(seen_record.labels))):
seen_record.names = sorted(list(set(seen_record.names).union(record.names)))
seen_record.labels = sorted(list(set(seen_record.labels).union(record.labels)))
seen_record.roles = sorted(list(set(seen_record.roles).union(record.roles)))
found = True
if found:
continue
elif hasattr(record, 'compound') and record.compound is not None:
seen_labels.update(record.compound.labels)
records.append(record)
i = 0
length = len(records)
while i < length:
j = 0
while j < length:
if i != j:
records[j].merge_all(records[i])
j += 1
i += 1
return records
class Cell(Sentence):
"""Data cell for tables. One row of the category table"""
# It appears that using different tokenizers/taggers is making the cem recognition worse.
# This is also consistent with the use of the regular expressions etc we have defined so far.
# word_tokenizer = FineWordTokenizer()
# pos_tagger = NoneTagger()
# ner_tagger = NoneTagger()
subsentence_extractor = NoneSubsentenceExtractor()
    def __init__(self, *args, **kwargs):
super(Cell, self).__init__(*args, **kwargs)
self.data = None
self.row_categories = None
self.col_categories = None
self.is_tde_cell = False
self.data_sent = None
self.row_categories_sents = None
self.col_categories_sents = None
    @classmethod
def from_tdecell(cls, tde_cell, document=None, **kwargs):
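        """Create a :class:`Cell` from a TableDataExtractor-style cell.

        :param tde_cell: An indexable triple of (data, row_categories, col_categories),
            where data is a string and the two category entries are lists of strings.
        :param Document document: (Optional) The document containing this element.

        Example (an illustrative sketch)::

            cell = Cell.from_tdecell(('5.2', ['Tc (K)'], ['Compound A']), document=doc)
        """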
        # Separate the cell components with characters that will never occur in real text,
        # so that e.g. a number in a heading is not mistaken for a unit's exponent.
separator = '🙃🙃🙃🙃'
cell_separator = ';'
text = ' '.join([tde_cell[0], separator, cell_separator.join(tde_cell[1]), separator, cell_separator.join(tde_cell[2])])
cell = cls(text, document=document, **kwargs)
cell.data = tde_cell[0]
cell.row_categories = tde_cell[1]
cell.col_categories = tde_cell[2]
cell.data_sent = Sentence(cell.data)
cell.row_categories_sents = [Sentence(cell_text) for cell_text in cell.row_categories]
cell.col_categories_sents = [Sentence(cell_text) for cell_text in cell.col_categories]
cell.is_tde_cell = True
cell.document = document
# Doing it this way means that a lot of RichTokens are shared between the sub-elements, resulting
# in tagging being only done once per RichToken, which is faster.
tokens = cell.data_sent.tokens
span_offset = tokens[-1].end + 1 # a cursor to help getting the span location correct when extending the token list
separator_token = RichToken(separator, span_offset, span_offset + 4, cls.lexicon, cell)
span_offset = separator_token.end + 1
tokens.append(separator_token)
for row_category_sent in cell.row_categories_sents:
for token in row_category_sent.tokens:
token.start = token.start + span_offset
token.end = token.end + span_offset
tokens.append(token)
span_offset = tokens[-1].end + 1
            category_separator_token = RichToken(separator, span_offset, span_offset + 4, cls.lexicon, cell)
            tokens.append(category_separator_token)
if cell.row_categories_sents:
tokens = tokens[:-1]
span_offset = tokens[-1].end + 1
separator_token = RichToken(separator, span_offset, span_offset + 4, cls.lexicon, cell)
span_offset = separator_token.end + 1
tokens.append(separator_token)
for col_category_sent in cell.col_categories_sents:
for token in col_category_sent.tokens:
token.start = token.start + span_offset
token.end = token.end + span_offset
tokens.append(token)
span_offset = tokens[-1].end + 1
            category_separator_token = RichToken(separator, span_offset, span_offset + 4, cls.lexicon, cell)
            tokens.append(category_separator_token)
if cell.col_categories_sents:
tokens = tokens[:-1]
for token in tokens:
token.sentence = cell
cell._tokens = tokens
return cell
@memoized_property
def abbreviation_definitions(self):
"""Empty list. Abbreviation detection is disabled within table cells."""
return []
@property
def records(self):
"""Empty list. Individual cells don't provide records, this is handled by the parent Table."""
return []
@property
def elements(self):
elements = []
# if self.data_sent is not None:
# elements.append(self.data_sent)
# elements.extend(self.row_categories_sents)
# elements.extend(self.col_categories_sents)
return elements
@python_2_unicode_compatible
class Span(object):
"""A text span within a sentence."""
    def __init__(self, text, start, end):
"""
:param str text: The text contained by this span.
:param int start: The start offset of this token in the original text.
        :param int end: The end offset of this token in the original text.
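
        Example (a sketch): spans are value objects, equal when text and offsets match::

            >>> Span('H2O', 0, 3) == Span('H2O', 0, 3)
            True
            >>> Span('H2O', 0, 3) == Span('H2O', 5, 8)
            False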
"""
self.text = text
"""The :class:`str` text content of this span."""
self.start = start
"""The :class:`int` start offset of this token in the original text."""
self.end = end
"""The :class:`int` end offset of this token in the original text."""
def __repr__(self):
return '%s(%r, %r, %r)' % (self.__class__.__name__, self.text, self.start, self.end)
def __str__(self):
return self.text
def __eq__(self, other):
"""Span objects are equal if the source text is equal, and the start and end indices are equal."""
if not isinstance(other, self.__class__):
return False
return self.text == other.text and self.start == other.start and self.end == other.end
def __ne__(self, other):
return not self == other
def __hash__(self):
return hash((self.text, self.start, self.end))
@property
def length(self):
"""The :class:`int` offset length of this span in the original text."""
return self.end - self.start
class Token(Span):
"""A single token within a sentence. Corresponds to a word, character, punctuation etc."""
    def __init__(self, text, start, end, lexicon):
"""
:param str text: The text contained by this token.
:param int start: The start offset of this token in the original text.
:param int end: The end offset of this token in the original text.
:param Lexicon lexicon: The lexicon which contains this token.
"""
super(Token, self).__init__(text, start, end)
#: The lexicon for this token.
self.lexicon = lexicon
self.lexicon.add(text)
@property
def lex(self):
"""The corresponding :class:`chemdataextractor.nlp.lexicon.Lexeme` entry in the Lexicon for this token."""
return self.lexicon[self.text]
class RichToken(Token):
"""
:class:`~chemdataextractor.doc.text.RichToken` provides a flexible way to store properties related to tokens.
:class:`~chemdataextractor.doc.text.RichToken` instances hold a reference to the parent sentence they come from, and if the user
desires a certain tag, the parent sentence is called and its taggers used to tag
the sentence on demand. This structure means that tokens are tagged *if and only if*
the user requires them. These tags are then cached by the :class:`~chemdataextractor.doc.text.RichToken` so that any single token
is only ever tagged once.

    Such tags can be accessed either via dot syntax (:python:`token.ner_tag`) or
via dictionary syntax (:python:`token['ner_tag']`). To maintain compatibility with
the return value for :meth:`~chemdataextractor.doc.text.Sentence.tagged_tokens` from previous
versions of ChemDataExtractor, the keys of :python:`0` and :python:`1` are reserved for the
text of the token and the combined NER and PoS tags, respectively. Furthermore, any properties
included in the :class:`~chemdataextractor.doc.text.Token` class are reserved as well.

    .. note::

        By default, ChemDataExtractor provides taggers for these tag types, and assumes
        that calling :python:`.ner_tag` and :python:`.pos_tag` on a
        :class:`~chemdataextractor.doc.text.RichToken` will not fail. This should be
        taken into account when setting the :python:`taggers` property on any
        :class:`~chemdataextractor.doc.text.BaseText` subclasses.
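
    Example (illustrative; tag values depend on the taggers configured)::

        >>> sent = Sentence('H2O2 was added.')
        >>> token = sent.tokens[0]
        >>> token[0] == token.text
        True
        >>> pos = token.pos_tag  # computed lazily on first access, then cached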
"""
    def __init__(self, text, start, end, lexicon, sentence):
super(RichToken, self).__init__(text, start, end, lexicon)
self.sentence = sentence
self._tags = {}
@classmethod
def _from_token(cls, token, sentence):
rich_token = cls(text=token.text,
start=token.start,
end=token.end,
lexicon=token.lexicon,
sentence=sentence)
return rich_token
@property
def legacy_pos_tag(self):
pos_tag = self[POS_TAG_TYPE]
ner_tag = self[NER_TAG_TYPE]
if ner_tag is not None and ner_tag != "O":
return ner_tag
else:
return pos_tag
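    # e.g. (a sketch): a token NER-tagged 'B-CM' reports 'B-CM' here, while a token
    # tagged 'O' (or with no NER tag) falls back to its PoS tag, such as 'NN'.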
def __getitem__(self, key):
if key == 0:
return self.text
elif key == 1:
return self.legacy_pos_tag
elif isinstance(key, str):
return self.__getattr__(key)
else:
raise IndexError("Key" + str(key) + " is out of bounds for this token.")
def __getattr__(self, name):
if name in self._tags.keys():
return self._tags[name]
else:
self.sentence._assign_tags(name)
if name not in self._tags.keys():
raise AttributeError(name + " is not a supported tag type for the sentence: " + str(self.sentence) + str(self.sentence.taggers) + str(type(self.sentence))
+ str(self._tags) + str(self))
return self._tags[name]