# -*- coding: utf-8 -*-
"""
Text-based document elements.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from abc import abstractproperty
import collections
import logging
import re
from pprint import pprint

try:
    from collections.abc import Sequence
except ImportError:  # Python 2 has no collections.abc
    from collections import Sequence

import six

from ..model.base import ModelList
from ..nlp.lexicon import ChemLexicon, Lexicon
from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS, CiDictCemTagger, CsDictCemTagger, CrfCemTagger
from ..nlp.abbrev import ChemAbbreviationDetector
from ..nlp.tag import NoneTagger
from ..nlp.pos import ChemCrfPosTagger, CrfPosTagger, ApPosTagger, ChemApPosTagger
from ..nlp.tokenize import ChemSentenceTokenizer, ChemWordTokenizer, regex_span_tokenize, SentenceTokenizer, WordTokenizer, FineWordTokenizer
from ..text import CONTROL_RE
from ..utils import memoized_property, python_2_unicode_compatible, first
from .element import BaseElement
from ..parse.definitions import specifier_definition
from ..parse.cem import chemical_name, cem_phrase
from ..parse.quantity import construct_quantity_re
from ..model.model import Compound, NmrSpectrum, IrSpectrum, UvvisSpectrum, MeltingPoint, GlassTransition
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
@python_2_unicode_compatible
class BaseText(BaseElement):
    """Abstract base class for a text Document Element."""

    def __init__(self, text, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element. Must be a unicode string.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words.
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
        :param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
            this is set automatically to be the same as that of the containing element, unless manually set otherwise.
        :raises TypeError: If ``text`` is not a unicode string.
        """
        if not isinstance(text, six.text_type):
            raise TypeError('Text must be a unicode string')
        super(BaseText, self).__init__(**kwargs)
        self._text = text
        # An explicitly supplied component wins; otherwise whatever the attribute
        # currently resolves to (normally the class-level default) is pinned onto
        # the instance. The assignment is unconditional in both cases.
        supplied_components = (
            ('word_tokenizer', word_tokenizer),
            ('lexicon', lexicon),
            ('abbreviation_detector', abbreviation_detector),
            ('pos_tagger', pos_tagger),
            ('ner_tagger', ner_tagger),
        )
        for attr_name, component in supplied_components:
            if component is None:
                component = getattr(self, attr_name)
            setattr(self, attr_name, component)

    def __repr__(self):
        details = (self.__class__.__name__, self.id, self.references, self._text)
        return '%s(id=%r, references=%r, text=%r)' % details

    def __str__(self):
        return self._text

    @property
    def text(self):
        """The raw text :class:`str` for this passage of text."""
        return self._text

    @abstractproperty
    def word_tokenizer(self):
        """The :class:`~chemdataextractor.nlp.tokenize.WordTokenizer` used by this element."""

    @abstractproperty
    def lexicon(self):
        """The :class:`~chemdataextractor.nlp.lexicon.Lexicon` used by this element."""

    @abstractproperty
    def pos_tagger(self):
        """The part of speech tagger used by this element. A subclass of :class:`~chemdataextractor.nlp.tag.BaseTagger`"""

    @abstractproperty
    def ner_tagger(self):
        """The named entity recognition tagger used by this element. A subclass of :class:`~chemdataextractor.nlp.tag.BaseTagger`"""

    @abstractproperty
    def tokens(self):
        """A list of :class:`Token` s for this object."""

    @abstractproperty
    def tags(self):
        """
        A list of tags corresponding to each of the tokens in the object.

        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` and :attr:`pos_tagger` used for this class.
        """

    @abstractproperty
    def definitions(self):
        """A list of all specifier definitions."""

    @abstractproperty
    def chemical_definitions(self):
        """A list of all chemical label definitions."""

    def serialize(self):
        """
        Convert self to a dictionary.

        The key 'type' holds the name of the class being serialized, and the key
        'content' holds :attr:`text`, which is a :class:`str`.
        """
        return {'type': self.__class__.__name__, 'content': self.text}

    def _repr_html_(self):
        # The raw text is returned as-is (no markup is added at this level).
        return self.text
class Text(Sequence, BaseText):
    """A passage of text, comprising one or more sentences.

    Behaves as a read-only sequence of :class:`Sentence` objects: indexing and
    ``len()`` are delegated to :attr:`sentences`.
    """

    sentence_tokenizer = ChemSentenceTokenizer()
    word_tokenizer = ChemWordTokenizer()
    lexicon = ChemLexicon()
    abbreviation_detector = ChemAbbreviationDetector()
    pos_tagger = ChemCrfPosTagger()  # ChemPerceptronTagger()
    ner_tagger = CemTagger()

    def __init__(self, text, sentence_tokenizer=None, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, parsers=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element.
        :param SentenceTokenizer sentence_tokenizer: (Optional) Sentence tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.ChemSentenceTokenizer`.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.ChemWordTokenizer`.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words. Default :class:`~chemdataextractor.nlp.lexicon.ChemLexicon`
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
            Default :class:`~chemdataextractor.nlp.abbrev.ChemAbbreviationDetector`.
        :param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
            Default :class:`~chemdataextractor.nlp.pos.ChemCrfPosTagger`.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
            Default :class:`~chemdataextractor.nlp.cem.CemTagger`
        :param parsers: Accepted for backward compatibility only; the supplied value is ignored
            and ``parsers=None`` is always forwarded to the base initializer.
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
            this is set automatically to be the same as that of the containing element, unless manually set otherwise.
        """
        # NOTE(review): ``parsers`` is deliberately not forwarded (None is passed
        # instead); this matches the deprecation message in set_config.
        super(Text, self).__init__(text, word_tokenizer=word_tokenizer, lexicon=lexicon, abbreviation_detector=abbreviation_detector, pos_tagger=pos_tagger, ner_tagger=ner_tagger, parsers=None, **kwargs)
        self.sentence_tokenizer = sentence_tokenizer if sentence_tokenizer is not None else self.sentence_tokenizer

    def __getitem__(self, index):
        return self.sentences[index]

    def __len__(self):
        return len(self.sentences)

    def set_config(self):
        """Load settings from the configuration of the containing document.

        Does nothing when :attr:`document` is ``None``. Otherwise each recognised
        configuration key replaces the corresponding component on this element
        with a freshly constructed instance.

        .. note:: Called when Document instance is created

        :raises DeprecationWarning: If the configuration contains a ``PARSERS`` key.
        """
        if self.document is None:
            return
        c = self.document.config
        configurable = (
            ('SENTENCE_TOKENIZER', 'sentence_tokenizer'),
            ('WORD_TOKENIZER', 'word_tokenizer'),
            ('POS_TAGGER', 'pos_tagger'),
            ('NER_TAGGER', 'ner_tagger'),
            ('LEXICON', 'lexicon'),
        )
        for config_key, attr_name in configurable:
            if config_key in c.keys():
                # SECURITY: eval() executes whatever expression the configuration
                # holds. Only load configuration files from trusted sources.
                setattr(self, attr_name, eval(c[config_key])())
        if 'PARSERS' in c.keys():
            # NOTE(review): this raises the warning class as an exception rather
            # than calling warnings.warn(); kept as-is because callers may rely on it.
            raise(DeprecationWarning('Manually setting parsers deprecated, any settings from config files for this will be ignored.'))

    @memoized_property
    def sentences(self):
        """A list of :class:`Sentence` s that make up this text passage."""
        return self.sentence_tokenizer.get_sentences(self)

    def _sentences_from_spans(self, spans):
        """Build one :class:`Sentence` per ``(start, end)`` span of :attr:`text`.

        Each sentence inherits this element's tokenizer, lexicon, abbreviation
        detector, taggers, document and models.

        :param spans: Iterable of indexable ``(start, end)`` character offsets into :attr:`text`.
        :return: The sentences, in the order of ``spans``.
        :rtype: list[Sentence]
        """
        return [
            Sentence(
                text=self.text[span[0]:span[1]],
                start=span[0],
                end=span[1],
                word_tokenizer=self.word_tokenizer,
                lexicon=self.lexicon,
                abbreviation_detector=self.abbreviation_detector,
                pos_tagger=self.pos_tagger,
                ner_tagger=self.ner_tagger,
                document=self.document,
                models=self.models
            )
            for span in spans
        ]

    @property
    def raw_sentences(self):
        """A list of :class:`str` for the sentences that make up this text passage."""
        return [sentence.text for sentence in self.sentences]

    @property
    def tokens(self):
        """A list with, for each sentence in this text passage, that sentence's list of :class:`Token` s."""
        return [sent.tokens for sent in self.sentences]

    @property
    def raw_tokens(self):
        """A list of :class:`str` representations for the tokens of each sentence in this text passage."""
        return [sent.raw_tokens for sent in self.sentences]

    @property
    def pos_tagged_tokens(self):
        """A list of (token, :class:`str` tag) tuples for each sentence in this text passage."""
        return [sent.pos_tagged_tokens for sent in self.sentences]

    @property
    def pos_tags(self):
        """A list of :class:`str` part of speech tags for each sentence in this text passage."""
        return [sent.pos_tags for sent in self.sentences]

    @property
    def unprocessed_ner_tagged_tokens(self):
        """
        A list of (token, :class:`str` named entity recognition tag) tuples
        for each sentence in this text passage.

        No corrections from abbreviation detection are performed.
        """
        return [sent.unprocessed_ner_tagged_tokens for sent in self.sentences]

    @property
    def unprocessed_ner_tags(self):
        """
        A list of :class:`str` unprocessed named entity tags for each sentence in this text passage.

        No corrections from abbreviation detection are performed.
        """
        return [sent.unprocessed_ner_tags for sent in self.sentences]

    @property
    def ner_tagged_tokens(self):
        """
        A list of (token, :class:`str` named entity recognition tag) tuples
        for each sentence in this text passage.
        """
        return [sent.ner_tagged_tokens for sent in self.sentences]

    @property
    def ner_tags(self):
        """
        A list of named entity tags for each sentence in this text passage.

        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` used for this object.
        """
        return [sent.ner_tags for sent in self.sentences]

    @property
    def cems(self):
        """
        A flat list of all Chemical Entity Mentions in this text as :class:`~chemdataextractor.doc.text.Span`
        """
        return [cem for sent in self.sentences for cem in sent.cems]

    @property
    def definitions(self):
        """
        A flat list of the specifier definitions found in every sentence of this text passage.
        """
        return [definition for sent in self.sentences for definition in sent.definitions]

    @property
    def chemical_definitions(self):
        """
        A flat list of the chemical label definitions found in every sentence of this text passage.
        """
        return [definition for sent in self.sentences for definition in sent.chemical_definitions]

    @property
    def tagged_tokens(self):
        """
        A list of (token, tag) tuples for each sentence in this text passage.
        """
        return [sent.tagged_tokens for sent in self.sentences]

    @property
    def tags(self):
        """A list with, for each sentence in this text passage, that sentence's list of tags."""
        return [sent.tags for sent in self.sentences]

    @property
    def abbreviation_definitions(self):
        """
        A flat list of all abbreviation definitions in this text passage. Each abbreviation is in the form
        (abbreviation tokens, long form tokens, :class:`str` ner_tag)
        """
        return [ab for sent in self.sentences for ab in sent.abbreviation_definitions]

    @property
    def records(self):
        """All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
        return ModelList(*[r for sent in self.sentences for r in sent.records])

    def __add__(self, other):
        """Concatenate two elements of exactly the same type.

        The merged element takes its tokenizers, lexicon, abbreviation detector
        and taggers from ``self``; ``NotImplemented`` is returned for any other type.
        """
        if type(self) == type(other):
            return self.__class__(
                text=self.text + other.text,
                id=self.id or other.id,
                references=self.references + other.references,
                sentence_tokenizer=self.sentence_tokenizer,
                word_tokenizer=self.word_tokenizer,
                lexicon=self.lexicon,
                abbreviation_detector=self.abbreviation_detector,
                pos_tagger=self.pos_tagger,
                ner_tagger=self.ner_tagger,
            )
        return NotImplemented
class Title(Text):
    """Text element that starts with no models and renders as an ``<h1>`` in HTML."""

    def __init__(self, text, **kwargs):
        """Initialise like :class:`Text`, then reset :attr:`models` to an empty list."""
        super(Title, self).__init__(text, **kwargs)
        self.models = []

    def _repr_html_(self):
        # Text is inserted verbatim between the opening and closing tags.
        return ''.join(('<h1 class="cde-title">', self.text, '</h1>'))
class Heading(Text):
    """Text element that starts with no models and renders as an ``<h2>`` in HTML."""

    def __init__(self, text, **kwargs):
        """Initialise like :class:`Text`, then reset :attr:`models` to an empty list."""
        super(Heading, self).__init__(text, **kwargs)
        self.models = []
        # default_parsers = [CompoundHeadingParser(), ChemicalLabelParser()]

    def _repr_html_(self):
        # Text is inserted verbatim between the opening and closing tags.
        return ''.join(('<h2 class="cde-title">', self.text, '</h2>'))
class Paragraph(Text):
    """Text element that starts with no models and renders as a ``<p>`` in HTML."""

    def __init__(self, text, **kwargs):
        """Initialise like :class:`Text`, then reset :attr:`models` to an empty list."""
        super(Paragraph, self).__init__(text, **kwargs)
        # default_parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(),
        #                    TgParser(), ContextParser()]
        self.models = []

    def _repr_html_(self):
        # Text is inserted verbatim between the opening and closing tags.
        return ''.join(('<p class="cde-paragraph">', self.text, '</p>'))
class Citation(Text):
    """Text element for citations: no NER tagging and no abbreviation detection."""

    #: No tagging is done for citations
    ner_tagger = NoneTagger()
    #: Abbreviation detection is switched off for citations
    abbreviation_detector = None

    # TODO: Citation parser
    # TODO: Store number/label

    def _repr_html_(self):
        # Text is inserted verbatim between the opening and closing tags.
        return ''.join(('<p class="cde-citation">', self.text, '</p>'))
class Caption(Text):
    """Text element that starts with no models and renders as a ``<caption>`` in HTML."""

    def __init__(self, text, **kwargs):
        """Initialise like :class:`Text`, then reset :attr:`models` to an empty list."""
        super(Caption, self).__init__(text, **kwargs)
        self.models = []
        # default_parsers = [CompoundParser(), ChemicalLabelParser(), CaptionContextParser()]

    def _repr_html_(self):
        # Text is inserted verbatim between the opening and closing tags.
        return ''.join(('<caption class="cde-caption">', self.text, '</caption>'))

    @property
    def definitions(self):
        """A flat list of the specifier definitions of every sentence in this caption."""
        collected = []
        for sent in self.sentences:
            collected.extend(sent.definitions)
        return collected
class Sentence(BaseText):
    """A single sentence within a text passage."""

    word_tokenizer = ChemWordTokenizer()
    lexicon = ChemLexicon()
    abbreviation_detector = ChemAbbreviationDetector()
    pos_tagger = ChemCrfPosTagger()  # ChemPerceptronTagger()
    ner_tagger = CemTagger()

    def __init__(self, text, start=0, end=None, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element.
        :param int start: (Optional) The starting index of the sentence within the containing element. Default 0.
        :param int end: (Optional) The end index of the sentence within the containing element.
            Default None, in which case ``len(text)`` is used.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.ChemWordTokenizer`.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words. Default :class:`~chemdataextractor.nlp.lexicon.ChemLexicon`
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
            Default :class:`~chemdataextractor.nlp.abbrev.ChemAbbreviationDetector`.
        :param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
            Default :class:`~chemdataextractor.nlp.pos.ChemCrfPosTagger`.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
            Default :class:`~chemdataextractor.nlp.cem.CemTagger`
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a :class:`~chemdataextractor.doc.document.Document`,
            this is set automatically to be the same as that of the containing element, unless manually set otherwise.
        """
        # Set before the base initializer runs so that anything it does with
        # ``models`` (e.g. a ``models`` keyword argument) is applied afterwards.
        self.models = []
        super(Sentence, self).__init__(text, word_tokenizer=word_tokenizer, lexicon=lexicon, abbreviation_detector=abbreviation_detector, pos_tagger=pos_tagger, ner_tagger=ner_tagger, **kwargs)
        #: The start index of this sentence within the text passage.
        self.start = start
        #: The end index of this sentence within the text passage.
        self.end = end if end is not None else len(text)

    def __repr__(self):
        return '%s(%r, %r, %r)' % (self.__class__.__name__, self._text, self.start, self.end)

    @memoized_property
    def tokens(self):
        """A list of :class:`Token` s for this sentence, as produced by :attr:`word_tokenizer`."""
        return self.word_tokenizer.get_word_tokens(self)

    def _tokens_for_spans(self, spans):
        """Build one :class:`Token` per ``(start, end)`` span of this sentence's text.

        Span offsets are relative to the sentence; the resulting token offsets
        are shifted by :attr:`start` so they are relative to the containing text.

        :param spans: Iterable of indexable ``(start, end)`` character offsets into :attr:`text`.
        :rtype: list[Token]
        """
        return [
            Token(
                text=self.text[span[0]:span[1]],
                start=span[0] + self.start,
                end=span[1] + self.start,
                lexicon=self.lexicon
            )
            for span in spans
        ]

    @property
    def raw_tokens(self):
        """A list of :class:`str` representations for the tokens in the object."""
        return [token.text for token in self.tokens]

    @memoized_property
    def pos_tagged_tokens(self):
        """The result of running :attr:`pos_tagger` over :attr:`raw_tokens`: a list of (token, :class:`str` tag) tuples."""
        # log.debug('Getting pos tags')
        return self.pos_tagger.tag(self.raw_tokens)

    @property
    def pos_tags(self):
        """A list of :class:`str` part of speech tags for the tokens in this sentence."""
        return [tag for token, tag in self.pos_tagged_tokens]

    @memoized_property
    def unprocessed_ner_tagged_tokens(self):
        """
        The result of running :attr:`ner_tagger` over :attr:`pos_tagged_tokens`:
        a list of (token, named entity recognition tag) tuples.

        No corrections from abbreviation detection are performed.
        """
        # log.debug('Getting unprocessed_ner_tags')
        return self.ner_tagger.tag(self.pos_tagged_tokens)

    @memoized_property
    def unprocessed_ner_tags(self):
        """
        A list of :class:`str` unprocessed named entity tags for the tokens in this sentence.

        No corrections from abbreviation detection are performed.
        """
        return [tag for token, tag in self.unprocessed_ner_tagged_tokens]

    @memoized_property
    def abbreviation_definitions(self):
        """
        A list of all abbreviation definitions in this sentence. Each abbreviation is in the form
        (list of abbreviation tokens, list of long form tokens, :class:`str` ner_tag or None).

        The tag is only set when every token of the long form carries a tag and
        all of those tags share one entity type. The list is empty when
        :attr:`abbreviation_detector` is falsy.
        """
        abbreviations = []
        if self.abbreviation_detector:
            # log.debug('Detecting abbreviations')
            ners = self.unprocessed_ner_tags
            raw_tokens = self.raw_tokens
            for abbr_span, long_span in self.abbreviation_detector.detect_spans(raw_tokens):
                abbr = raw_tokens[abbr_span[0]:abbr_span[1]]
                long_form = raw_tokens[long_span[0]:long_span[1]]
                # Check if long is entirely tagged as one named entity type
                long_tags = ners[long_span[0]:long_span[1]]
                # tag[2:] strips the two-character 'B-' / 'I-' prefix
                unique_tags = {tag[2:] for tag in long_tags if tag is not None}
                tag = long_tags[0][2:] if None not in long_tags and len(unique_tags) == 1 else None
                abbreviations.append((abbr, long_form, tag))
        return abbreviations

    @memoized_property
    def ner_tagged_tokens(self):
        """
        A list of (:class:`str` token text, named entity recognition tag) tuples
        from the sentence.
        """
        return list(zip(self.raw_tokens, self.ner_tags))

    @memoized_property
    def ner_tags(self):
        """
        A list of named entity tags corresponding to each of the tokens in the object.

        Starts from :attr:`unprocessed_ner_tags` and re-tags every occurrence of a
        known abbreviation so it matches the entity type of its long form. The
        abbreviation definitions come from the containing document when there is
        one, otherwise from this sentence alone.

        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` used for this object.
        """
        # log.debug('Getting ner_tags')
        # Work on a copy: unprocessed_ner_tags is memoized, so editing it in place
        # would silently turn the cached "unprocessed" tags into processed ones.
        ner_tags = list(self.unprocessed_ner_tags)
        abbrev_defs = self.document.abbreviation_definitions if self.document else self.abbreviation_definitions
        # raw_tokens rebuilds its list on every access, so fetch it once.
        raw_tokens = self.raw_tokens
        # Ensure abbreviation entity matches long entity
        # TODO: This is potentially a performance bottleneck?
        for i in range(0, len(ner_tags)):
            for abbr, long_form, ner_tag in abbrev_defs:
                if abbr == raw_tokens[i:i+len(abbr)]:
                    old_ner_tags = ner_tags[i:i+len(abbr)]
                    ner_tags[i] = 'B-%s' % ner_tag if ner_tag is not None else None
                    ner_tags[i+1:i+len(abbr)] = ['I-%s' % ner_tag if ner_tag is not None else None] * (len(abbr) - 1)
                    # Remove ner tags from brackets surrounding abbreviation
                    # NOTE(review): ``i > 1`` skips an opening bracket at index 0, and
                    # ``i + 1`` is only the closing-bracket position for one-token
                    # abbreviations - confirm whether either is intentional.
                    if i > 1 and raw_tokens[i-1] == '(':
                        ner_tags[i-1] = None
                    if i < len(raw_tokens) - 1 and raw_tokens[i+1] == ')':
                        ner_tags[i+1] = None
                    if not old_ner_tags == ner_tags[i:i+len(abbr)]:
                        log.debug('Correcting abbreviation tag: %s (%s): %s -> %s', ' '.join(abbr), ' '.join(long_form), old_ner_tags, ner_tags[i:i+len(abbr)])
        # TODO: Ensure abbreviations in brackets at the end of an entity match are separated and the brackets untagged
        # Hydrogen Peroxide (H2O2)
        # Tungsten Carbide (WC)
        # TODO: Filter off alphanumerics from end (1h) (3) (I)
        # May need more intelligent
        return ner_tags

    @memoized_property
    def cems(self):
        """
        A list of all Chemical Entity Mentions in this text as :class:`~chemdataextractor.doc.text.Span`

        Each match of the ``chemical_name`` parser is trimmed of ignored
        prefixes/suffixes and of one pair of enclosing brackets, optionally split
        into components, and finally reduced to sub-matches where a ``SPECIALS``
        pattern applies. Span offsets are relative to the containing text.
        """
        # log.debug('Getting cems')
        spans = []
        # print(self.text.encode('utf8'))
        for result in chemical_name.scan(self.tagged_tokens):
            # parser scan yields (result, startindex, endindex) - we just use the indexes here
            tokens = self.tokens[result[1]:result[2]]
            start = tokens[0].start
            end = tokens[-1].end
            # Adjust boundaries to exclude disallowed prefixes/suffixes
            currenttext = self.text[start-self.start:end-self.start].lower()
            for prefix in IGNORE_PREFIX:
                if currenttext.startswith(prefix):
                    # print('%s removing %s' % (currenttext, prefix))
                    start += len(prefix)
                    break
            for suffix in IGNORE_SUFFIX:
                if currenttext.endswith(suffix):
                    # print('%s removing %s' % (currenttext, suffix))
                    end -= len(suffix)
                    break
            # Adjust boundaries to exclude matching brackets at start and end
            currenttext = self.text[start-self.start:end-self.start]
            for bpair in [('(', ')'), ('[', ']')]:
                if len(currenttext) > 2 and currenttext[0] == bpair[0] and currenttext[-1] == bpair[1]:
                    level = 1
                    for k, char in enumerate(currenttext[1:]):
                        if char == bpair[0]:
                            level += 1
                        elif char == bpair[1]:
                            level -= 1
                        # Only strip when the opening bracket is closed by the very last character
                        if level == 0 and k == len(currenttext) - 2:
                            start += 1
                            end -= 1
                            break

            # If entity has been reduced to nothing by adjusting boundaries, skip it
            if start >= end:
                continue

            currenttext = self.text[start-self.start:end-self.start]

            # Do splits
            split_spans = []
            # The third alternative is three middle dots (U+00B7); it had been
            # corrupted by a bad re-encoding and could never match.
            comps = list(regex_span_tokenize(currenttext, r'(-|\+|\)?-to-\(?|···|/|\s)'))
            if len(comps) > 1:
                for split in SPLITS:
                    if all(re.search(split, currenttext[comp[0]:comp[1]]) for comp in comps):
                        # print('%s splitting %s' % (currenttext, [currenttext[comp[0]:comp[1]] for comp in comps]))
                        for comp in comps:
                            span = Span(text=currenttext[comp[0]:comp[1]], start=start+comp[0], end=start+comp[1])
                            # print('SPLIT: %s - %s' % (currenttext, repr(span)))
                            split_spans.append(span)
                        break
                else:
                    # No split pattern matched every component: keep the mention whole
                    split_spans.append(Span(text=currenttext, start=start, end=end))
            else:
                split_spans.append(Span(text=currenttext, start=start, end=end))

            # Do specials
            for split_span in split_spans:
                for special in SPECIALS:
                    m = re.search(special, split_span.text)
                    if m:
                        # print('%s special %s' % (split_span.text, m.groups()))
                        for i in range(1, len(m.groups()) + 1):
                            span = Span(text=m.group(i), start=split_span.start+m.start(i), end=split_span.start+m.end(i))
                            # print('SUBMATCH: %s - %s' % (currenttext, repr(span)))
                            spans.append(span)
                        break
                else:
                    # No special pattern matched: keep the span unchanged
                    spans.append(split_span)
        return spans

    @memoized_property
    def definitions(self):
        """
        Return specifier definitions from this sentence

        A definition consists of:

        a) A definition -- The quantity being defined e.g. "Curie Temperature"
        b) A specifier -- The symbol used to define the quantity e.g. "Tc"
        c) Tokens -- The tagged tokens covered by the definition
        d) Start -- The token index of the starting point of the definition
        e) End -- The token index of the end point of the definition

        :return: list -- The specifier definitions
        """
        defs = []
        # Control characters are stripped before the tokens reach the parser
        tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
        for result in specifier_definition.scan(tagged_tokens):
            definition = result[0]
            start = result[1]
            end = result[2]
            defs.append({
                'definition': first(definition.xpath('./phrase/text()')),
                'specifier': first(definition.xpath('./specifier/text()')),
                'tokens': tagged_tokens[start:end],
                'start': start,
                'end': end})
        return defs

    @memoized_property
    def chemical_definitions(self):
        """Return a list of chemical entity mentions and their associated label

        Each entry is a dict with the keys 'name', 'label', 'start' and 'end';
        matches lacking either a name or a label are skipped.
        """
        cem_defs = []
        # Control characters are stripped before the tokens reach the parser
        tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
        for result in cem_phrase.scan(tagged_tokens):
            tree = result[0]
            start = result[1]
            end = result[2]
            name = first(tree.xpath('./compound/names/text()'))
            label = first(tree.xpath('./compound/labels/text()'))
            if name and label:
                cem_defs.append({
                    'name': name,
                    'label': label,
                    'start': start,
                    'end': end
                })
        return cem_defs

        # for record in self.records:
        #     if isinstance(record, Compound) and record.labels:
        #         cem_def = {
        #             'label': record.labels[0]
        #         }
        #         cem_defs.append(cem_def)
        # return cem_defs

    @memoized_property
    def tags(self):
        """A list of tags, one per token: the named entity tag where there is one, otherwise the part of speech tag."""
        # pos_tags builds a fresh list on every access, so it is safe to edit here
        tags = self.pos_tags
        for i, tag in enumerate(self.ner_tags):
            if tag is not None:
                tags[i] = tag
        return tags

    @property
    def tagged_tokens(self):
        """
        A list of (:class:`str` token text, tag) tuples, pairing :attr:`raw_tokens` with :attr:`tags`.
        """
        return list(zip(self.raw_tokens, self.tags))

    @property
    def quantity_re(self):
        """The result of ``construct_quantity_re`` applied to this sentence's streamlined models."""
        return construct_quantity_re(*self._streamlined_models)

    @property
    def records(self):
        """All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
        records = ModelList()
        seen_labels = set()
        # Ensure no control characters are sent to a parser (need to be XML compatible)
        tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
        for model in self._streamlined_models:
            for parser in model.parsers:
                if hasattr(parser, 'parse_sentence'):
                    for record in parser.parse_sentence(tagged_tokens):
                        p = record.serialize()
                        if record.is_empty:  # TODO: Potential performance issues?
                            continue
                        # Skip duplicate records
                        if record in records:
                            continue
                        # Skip just labels that have already been seen (bit of a hack)
                        if (isinstance(record, Compound) and 'Compound' in p.keys() and all(k in {'labels', 'roles'} for k in p['Compound'].keys()) and
                                set(record.labels).issubset(seen_labels)):
                            continue
                        if isinstance(record, Compound):
                            seen_labels.update(record.labels)
                            # This could be super slow if we find lots of things
                            found = False
                            for seen_record in records:
                                # Fold this compound into every already-seen compound
                                # that shares a name or a label with it
                                if (isinstance(seen_record, Compound)
                                        and (not set(record.names).isdisjoint(seen_record.names)
                                             or not set(record.labels).isdisjoint(seen_record.labels))):
                                    seen_record.names = sorted(list(set(seen_record.names).union(record.names)))
                                    seen_record.labels = sorted(list(set(seen_record.labels).union(record.labels)))
                                    seen_record.roles = sorted(list(set(seen_record.roles).union(record.roles)))
                                    found = True
                            if found:
                                continue
                        elif hasattr(record, 'compound') and record.compound is not None:
                            seen_labels.update(record.compound.labels)
                        records.append(record)
        # Give every record the chance to merge in every other record
        count = len(records)
        for i in range(count):
            for j in range(count):
                if i != j:
                    records[j].merge_all(records[i])
        return records

    def __add__(self, other):
        """Concatenate two elements of exactly the same type.

        The merged sentence keeps ``self.start``, recomputes its end from the
        combined text, and takes its tokenizer, lexicon, abbreviation detector
        and taggers from ``self``; ``NotImplemented`` is returned for any other type.
        """
        if type(self) == type(other):
            return self.__class__(
                text=self.text + other.text,
                start=self.start,
                end=None,
                id=self.id or other.id,
                references=self.references + other.references,
                word_tokenizer=self.word_tokenizer,
                lexicon=self.lexicon,
                abbreviation_detector=self.abbreviation_detector,
                pos_tagger=self.pos_tagger,
                ner_tagger=self.ner_tagger,
            )
        return NotImplemented
class Cell(Sentence):
    """Data cell for tables. One row of the category table"""
    # It appears that using different tokenizers/taggers is making the cem recognition worse.
    # This is also consistent with the use of the regular expressions etc we have defined so far.
    # word_tokenizer = FineWordTokenizer()
    # pos_tagger = NoneTagger()
    # ner_tagger = NoneTagger()

    def __init__(self, *args, **kwargs):
        """Initialise like :class:`Sentence`; data and categories start out as ``None``."""
        super(Cell, self).__init__(*args, **kwargs)
        self.data = self.row_categories = self.col_categories = None

    @classmethod
    def from_tdecell(cls, tde_cell, **kwargs):
        """Create a cell from a ``(data, row_categories, col_categories)`` triple.

        The cell text is the data followed by the space-joined row categories and
        the space-joined column categories, separated by a filler word.

        :param tde_cell: Indexable holding the data string at 0 and iterables of
            category strings at 1 and 2.
        :param kwargs: Passed through to the class constructor.
        :return: The new cell, with :attr:`data`, :attr:`row_categories` and
            :attr:`col_categories` taken from ``tde_cell``.
        """
        # Have the spacing between the cells contain characters that will never be found
        # so that the system doesn't become confused because it found some number in the heading
        # that it confuses as a power for a unit.
        data, row_categories, col_categories = tde_cell[0], tde_cell[1], tde_cell[2]
        pieces = [data, ' '.join(row_categories), ' '.join(col_categories)]
        new_cell = cls(' sdfkljlk '.join(pieces), **kwargs)
        new_cell.data = data
        new_cell.row_categories = row_categories
        new_cell.col_categories = col_categories
        # print(cell._streamlined_models, construct_quantity_re(*cell._streamlined_models))
        return new_cell

    @memoized_property
    def abbreviation_definitions(self):
        """Empty list. Abbreviation detection is disabled within table cells."""
        return []

    @property
    def records(self):
        """Empty list. Individual cells don't provide records, this is handled by the parent Table."""
        return []
@python_2_unicode_compatible
class Span(object):
    """A text span within a sentence."""

    def __init__(self, text, start, end):
        """
        :param str text: The text contained by this span.
        :param int start: The start offset of this span in the original text.
        :param int end: The end offset of this span in the original text.
        """
        #: The :class:`str` text content of this span.
        self.text = text
        #: The :class:`int` start offset of this span in the original text.
        self.start = start
        #: The :class:`int` end offset of this span in the original text.
        self.end = end

    def __repr__(self):
        fields = (self.__class__.__name__, self.text, self.start, self.end)
        return '%s(%r, %r, %r)' % fields

    def __str__(self):
        return self.text

    def __eq__(self, other):
        """Span objects are equal if the source text is equal, and the start and end indices are equal."""
        if isinstance(other, self.__class__):
            return self.text == other.text and self.start == other.start and self.end == other.end
        # Anything that is not an instance of this span's class never compares equal
        return False

    def __ne__(self, other):
        is_equal = self == other
        return not is_equal

    def __hash__(self):
        # Hash over the same three fields that __eq__ compares
        identity = (self.text, self.start, self.end)
        return hash(identity)

    @property
    def length(self):
        """The :class:`int` offset length of this span in the original text."""
        return self.end - self.start
class Token(Span):
    """A single token within a sentence. Corresponds to a word, character, punctuation etc."""

    def __init__(self, text, start, end, lexicon):
        """
        :param str text: The text contained by this token.
        :param int start: The start offset of this token in the original text.
        :param int end: The end offset of this token in the original text.
        :param Lexicon lexicon: The lexicon which contains this token.
        """
        super(Token, self).__init__(text, start, end)
        #: The lexicon for this token.
        self.lexicon = lexicon
        # Creating a token registers its text with the lexicon
        lexicon.add(text)

    @property
    def lex(self):
        """The corresponding :class:`chemdataextractor.nlp.lexicon.Lexeme` entry in the Lexicon for this token."""
        entry = self.lexicon[self.text]
        return entry