Source code for chemdataextractor.doc.text

# -*- coding: utf-8 -*-
"""
Text-based document elements.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import abstractproperty
import collections
import logging
import re
import warnings
from pprint import pprint

try:
    from collections.abc import Sequence
except ImportError:  # Python 2
    from collections import Sequence

import six

from ..model.base import ModelList
from ..nlp.lexicon import ChemLexicon, Lexicon
from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS, CiDictCemTagger, CsDictCemTagger, CrfCemTagger
from ..nlp.abbrev import ChemAbbreviationDetector
from ..nlp.tag import NoneTagger
from ..nlp.pos import ChemCrfPosTagger, CrfPosTagger, ApPosTagger, ChemApPosTagger
from ..nlp.tokenize import ChemSentenceTokenizer, ChemWordTokenizer, regex_span_tokenize, SentenceTokenizer, WordTokenizer, FineWordTokenizer
from ..text import CONTROL_RE
from ..utils import memoized_property, python_2_unicode_compatible, first
from .element import BaseElement
from ..parse.definitions import specifier_definition
from ..parse.cem import chemical_name, cem_phrase
from ..parse.quantity import construct_quantity_re
from ..model.model import Compound, NmrSpectrum, IrSpectrum, UvvisSpectrum, MeltingPoint, GlassTransition

log = logging.getLogger(__name__)


@python_2_unicode_compatible
class BaseText(BaseElement):
    """Abstract base class for a text Document Element."""

    def __init__(self, text, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words.
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
        :param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a
            :class:`~chemdataextractor.doc.document.Document`, this is set automatically to be the same as that
            of the containing element, unless manually set otherwise.
        :raises TypeError: If ``text`` is not a unicode string.
        """
        if not isinstance(text, six.text_type):
            raise TypeError('Text must be a unicode string')
        super(BaseText, self).__init__(**kwargs)
        self._text = text
        # An explicitly supplied component wins; otherwise the value already
        # reachable through the attribute is re-bound onto the instance.
        supplied_components = (
            ('word_tokenizer', word_tokenizer),
            ('lexicon', lexicon),
            ('abbreviation_detector', abbreviation_detector),
            ('pos_tagger', pos_tagger),
            ('ner_tagger', ner_tagger),
        )
        for attr_name, supplied in supplied_components:
            setattr(self, attr_name, getattr(self, attr_name) if supplied is None else supplied)

    def __repr__(self):
        details = (self.__class__.__name__, self.id, self.references, self._text)
        return '%s(id=%r, references=%r, text=%r)' % details

    def __str__(self):
        return self._text

    @property
    def text(self):
        """The raw text :class:`str` for this passage of text."""
        return self._text

    @abstractproperty
    def word_tokenizer(self):
        """The :class:`~chemdataextractor.nlp.tokenize.WordTokenizer` used by this element."""
        return None

    @abstractproperty
    def lexicon(self):
        """The :class:`~chemdataextractor.nlp.lexicon.Lexicon` used by this element."""
        return None

    @abstractproperty
    def pos_tagger(self):
        """The part of speech tagger used by this element. A subclass of :class:`~chemdataextractor.nlp.tag.BaseTagger`"""
        return None

    @abstractproperty
    def ner_tagger(self):
        """The named entity recognition tagger used by this element. A subclass of :class:`~chemdataextractor.nlp.tag.BaseTagger`"""
        return None

    @abstractproperty
    def tokens(self):
        """A list of :class:`Token` s for this object."""
        return None

    @abstractproperty
    def tags(self):
        """
        A list of tags corresponding to each of the tokens in the object.
        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` and :attr:`pos_tagger` used for this class.
        """
        return None

    @abstractproperty
    def definitions(self):
        """A list of all specifier definitions."""
        return None

    @abstractproperty
    def chemical_definitions(self):
        """A list of all chemical label definitions."""
        return None

    def serialize(self):
        """
        Convert self to a dictionary. The key 'type' will contain
        the name of the class being serialized, and the key 'content' will contain
        a serialized representation of :attr:`text`, which is a :class:`str`
        """
        return {'type': self.__class__.__name__, 'content': self.text}

    def _repr_html_(self):
        # Rich-display hook: the plain text is used as the HTML representation.
        return self.text
class Text(Sequence, BaseText):
    """A passage of text, comprising one or more sentences.

    Behaves as a read-only sequence of :class:`Sentence` objects: indexing and
    ``len()`` are delegated to :attr:`sentences`.
    """

    sentence_tokenizer = ChemSentenceTokenizer()
    word_tokenizer = ChemWordTokenizer()
    lexicon = ChemLexicon()
    abbreviation_detector = ChemAbbreviationDetector()
    pos_tagger = ChemCrfPosTagger()  # ChemPerceptronTagger()
    ner_tagger = CemTagger()

    def __init__(self, text, sentence_tokenizer=None, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, parsers=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`~chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`~chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element.
        :param SentenceTokenizer sentence_tokenizer: (Optional) Sentence tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.ChemSentenceTokenizer`.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.ChemWordTokenizer`.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words. Default :class:`~chemdataextractor.nlp.lexicon.ChemLexicon`
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
            Default :class:`~chemdataextractor.nlp.abbrev.ChemAbbreviationDetector`.
        :param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
            Default :class:`~chemdataextractor.nlp.pos.ChemCrfPosTagger`.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
            Default :class:`~chemdataextractor.nlp.cem.CemTagger`
        :param parsers: Accepted but not used; ``parsers=None`` is always forwarded to the parent initialiser.
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a
            :class:`~chemdataextractor.doc.document.Document`, this is set automatically to be the same as that
            of the containing element, unless manually set otherwise.
        """
        super(Text, self).__init__(text, word_tokenizer=word_tokenizer, lexicon=lexicon, abbreviation_detector=abbreviation_detector, pos_tagger=pos_tagger, ner_tagger=ner_tagger, parsers=None, **kwargs)
        self.sentence_tokenizer = sentence_tokenizer if sentence_tokenizer is not None else self.sentence_tokenizer

    def __getitem__(self, index):
        return self.sentences[index]

    def __len__(self):
        return len(self.sentences)

    def set_config(self):
        """ Load settings from configuration file

        .. note:: Called when Document instance is created

        Does nothing when the element has no document. A ``PARSERS`` entry in
        the configuration is ignored and reported with a
        :class:`DeprecationWarning`.
        """
        if self.document is None:
            return
        c = self.document.config
        # NOTE(review): eval() executes whatever expression the config holds, so this is
        # only safe for trusted configuration files. Left in place because the accepted
        # value format is not visible here; consider an explicit name -> class registry.
        configurable = (
            ('SENTENCE_TOKENIZER', 'sentence_tokenizer'),
            ('WORD_TOKENIZER', 'word_tokenizer'),
            ('POS_TAGGER', 'pos_tagger'),
            ('NER_TAGGER', 'ner_tagger'),
            ('LEXICON', 'lexicon'),
        )
        for key, attr_name in configurable:
            if key in c.keys():
                setattr(self, attr_name, eval(c[key])())
        if 'PARSERS' in c.keys():
            # The message promises the setting is ignored, so warn instead of raising.
            warnings.warn(
                'Manually setting parsers deprecated, any settings from config files for this will be ignored.',
                DeprecationWarning
            )

    @memoized_property
    def sentences(self):
        """A list of :class:`Sentence` s that make up this text passage."""
        return self.sentence_tokenizer.get_sentences(self)

    def _sentences_from_spans(self, spans):
        """Build one :class:`Sentence` per ``(start, end)`` span, sharing this element's components, document and models."""
        sents = []
        for span in spans:
            sent = Sentence(
                text=self.text[span[0]:span[1]],
                start=span[0],
                end=span[1],
                word_tokenizer=self.word_tokenizer,
                lexicon=self.lexicon,
                abbreviation_detector=self.abbreviation_detector,
                pos_tagger=self.pos_tagger,
                ner_tagger=self.ner_tagger,
                document=self.document,
                models=self.models
            )
            sents.append(sent)
        return sents

    @property
    def raw_sentences(self):
        """A list of :class:`str` for the sentences that make up this text passage."""
        return [sentence.text for sentence in self.sentences]

    @property
    def tokens(self):
        """A list of :class:`Token` lists, one list per sentence in this text passage."""
        return [sent.tokens for sent in self.sentences]

    @property
    def raw_tokens(self):
        """A list of :class:`str` representations for the tokens of each sentence in this text passage."""
        return [sent.raw_tokens for sent in self.sentences]

    @property
    def pos_tagged_tokens(self):
        """A list of (:class:`Token` token, :class:`str` tag) tuples for each sentence in this text passage."""
        return [sent.pos_tagged_tokens for sent in self.sentences]

    @property
    def pos_tags(self):
        """A list of :class:`str` part of speech tags for each sentence in this text passage."""
        return [sent.pos_tags for sent in self.sentences]

    @property
    def unprocessed_ner_tagged_tokens(self):
        """
        A list of (:class:`Token` token, :class:`str` named entity recognition tag)
        from the text.

        No corrections from abbreviation detection are performed.
        """
        return [sent.unprocessed_ner_tagged_tokens for sent in self.sentences]

    @property
    def unprocessed_ner_tags(self):
        """
        A list of :class:`str` unprocessed named entity tags for the tokens of each sentence.

        No corrections from abbreviation detection are performed.
        """
        return [sent.unprocessed_ner_tags for sent in self.sentences]

    @property
    def ner_tagged_tokens(self):
        """
        A list of (:class:`Token` token, :class:`str` named entity recognition tag)
        from the text.
        """
        return [sent.ner_tagged_tokens for sent in self.sentences]

    @property
    def ner_tags(self):
        """
        A list of named entity tags corresponding to each of the tokens in the object.
        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` used for this object.
        """
        return [sent.ner_tags for sent in self.sentences]

    @property
    def cems(self):
        """
        A list of all Chemical Entity Mentions in this text as :class:`chemdataextractor.doc.text.Span`
        """
        return [cem for sent in self.sentences for cem in sent.cems]

    @property
    def definitions(self):
        """
        Return a flat list of the specifier definitions found in every sentence of this text passage
        """
        return [definition for sent in self.sentences for definition in sent.definitions]

    @property
    def chemical_definitions(self):
        """
        Return a flat list of the chemical definitions found in every sentence of this text passage
        """
        return [definition for sent in self.sentences for definition in sent.chemical_definitions]

    @property
    def tagged_tokens(self):
        """
        A list of (:class:`Token` token, :class:`str` tag) lists, one per sentence in this text passage.
        """
        return [sent.tagged_tokens for sent in self.sentences]

    @property
    def tags(self):
        """A list of tag lists, one per sentence in this text passage."""
        return [sent.tags for sent in self.sentences]

    @property
    def abbreviation_definitions(self):
        """
        A list of all abbreviation definitions in this text passage. Each abbreviation is in the form
        (:class:`str` abbreviation, :class:`str` long form of abbreviation, :class:`str` ner_tag)
        """
        return [ab for sent in self.sentences for ab in sent.abbreviation_definitions]

    @property
    def records(self):
        """All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
        return ModelList(*[r for sent in self.sentences for r in sent.records])

    def __add__(self, other):
        # Only elements of exactly the same class are merged; anything else defers to Python.
        if type(self) == type(other):
            merged = self.__class__(
                text=self.text + other.text,
                id=self.id or other.id,
                references=self.references + other.references,
                sentence_tokenizer=self.sentence_tokenizer,
                word_tokenizer=self.word_tokenizer,
                lexicon=self.lexicon,
                abbreviation_detector=self.abbreviation_detector,
                pos_tagger=self.pos_tagger,
                ner_tagger=self.ner_tagger,
            )
            return merged
        return NotImplemented
class Title(Text):
    """Text element rendered as a top-level ``<h1>`` title."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Title, self).__init__(text, **kwargs)
        # Assigned after the parent initialiser has run, so the element starts with an empty model list.
        self.models = []

    def _repr_html_(self):
        return ''.join(('<h1 class="cde-title">', self.text, '</h1>'))
class Heading(Text):
    """Text element rendered as an ``<h2>`` heading."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Heading, self).__init__(text, **kwargs)
        # Assigned after the parent initialiser has run, so the element starts with an empty model list.
        self.models = []

    def _repr_html_(self):
        return ''.join(('<h2 class="cde-title">', self.text, '</h2>'))
class Paragraph(Text):
    """Text element rendered as a ``<p>`` paragraph."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Paragraph, self).__init__(text, **kwargs)
        # Assigned after the parent initialiser has run, so the element starts with an empty model list.
        self.models = []

    def _repr_html_(self):
        return ''.join(('<p class="cde-paragraph">', self.text, '</p>'))
class Footnote(Text):
    """Text element rendered as a footnote paragraph."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Footnote, self).__init__(text, **kwargs)
        # Assigned after the parent initialiser has run, so the element starts with an empty model list.
        self.models = []

    def _repr_html_(self):
        return ''.join(('<p class="cde-footnote">', self.text, '</p>'))
class Citation(Text):
    """Text element for a citation; entity tagging and abbreviation detection are switched off."""

    #: No abbreviation detector is used for citations
    abbreviation_detector = None
    #: No tagging is done for citations
    ner_tagger = NoneTagger()

    # TODO: Citation parser
    # TODO: Store number/label

    def _repr_html_(self):
        return ''.join(('<p class="cde-citation">', self.text, '</p>'))
class Caption(Text):
    """Text element rendered as a ``<caption>``."""

    def __init__(self, text, **kwargs):
        """
        :param str text: The text contained in this element.
        :param kwargs: Forwarded unchanged to :class:`Text`.
        """
        super(Caption, self).__init__(text, **kwargs)
        # Assigned after the parent initialiser has run, so the element starts with an empty model list.
        self.models = []

    def _repr_html_(self):
        return ''.join(('<caption class="cde-caption">', self.text, '</caption>'))

    @property
    def definitions(self):
        """A flat list of the specifier definitions of every sentence in this caption."""
        collected = []
        for sent in self.sentences:
            for definition in sent.definitions:
                collected.append(definition)
        return collected
class Sentence(BaseText):
    """A single sentence within a text passage."""

    word_tokenizer = ChemWordTokenizer()
    lexicon = ChemLexicon()
    abbreviation_detector = ChemAbbreviationDetector()
    pos_tagger = ChemCrfPosTagger()  # ChemPerceptronTagger()
    ner_tagger = CemTagger()

    def __init__(self, text, start=0, end=None, word_tokenizer=None, lexicon=None, abbreviation_detector=None, pos_tagger=None, ner_tagger=None, **kwargs):
        """
        .. note::

            If intended as part of a :class:`chemdataextractor.doc.document.Document`,
            an element should either be initialized with a reference to its containing document,
            or its :attr:`document` attribute should be set as soon as possible.
            If the element is being passed in to a :class:`chemdataextractor.doc.document.Document`
            to initialise it, the :attr:`document` attribute is automatically set
            during the initialisation of the document, so the user does not need to worry about this.

        :param str text: The text contained in this element.
        :param int start: (Optional) The starting index of the sentence within the containing element. Default 0.
        :param int end: (Optional) The end index of the sentence within the containing element.
            Default None, in which case ``len(text)`` is used.
        :param WordTokenizer word_tokenizer: (Optional) Word tokenizer for this element.
            Default :class:`~chemdataextractor.nlp.tokenize.ChemWordTokenizer`.
        :param Lexicon lexicon: (Optional) Lexicon for this element. The lexicon stores all the occurrences of unique words and can provide
            Brown clusters for the words. Default :class:`~chemdataextractor.nlp.lexicon.ChemLexicon`
        :param AbbreviationDetector abbreviation_detector: (Optional) The abbreviation detector for this element.
            Default :class:`~chemdataextractor.nlp.abbrev.ChemAbbreviationDetector`.
        :param BaseTagger pos_tagger: (Optional) The part of speech tagger for this element.
            Default :class:`~chemdataextractor.nlp.pos.ChemCrfPosTagger`.
        :param BaseTagger ner_tagger: (Optional) The named entity recognition tagger for this element.
            Default :class:`~chemdataextractor.nlp.cem.CemTagger`
        :param Document document: (Optional) The document containing this element.
        :param str label: (Optional) The label for the captioned element, e.g. Table 1 would have a label of 1.
        :param Any id: (Optional) Some identifier for this element. Must be equatable.
        :param list[chemdataextractor.models.BaseModel] models: (Optional) A list of models for this element to parse.
            If the element is part of another element (e.g. a :class:`~chemdataextractor.doc.text.Sentence`
            inside a :class:`~chemdataextractor.doc.text.Paragraph`), or is part of a
            :class:`~chemdataextractor.doc.document.Document`, this is set automatically to be the same as that
            of the containing element, unless manually set otherwise.
        """
        # Set the default before the parent initialiser runs.
        self.models = []
        super(Sentence, self).__init__(text, word_tokenizer=word_tokenizer, lexicon=lexicon, abbreviation_detector=abbreviation_detector, pos_tagger=pos_tagger, ner_tagger=ner_tagger, **kwargs)
        #: The start index of this sentence within the text passage.
        self.start = start
        #: The end index of this sentence within the text passage.
        self.end = end if end is not None else len(text)

    def __repr__(self):
        return '%s(%r, %r, %r)' % (self.__class__.__name__, self._text, self.start, self.end)

    @memoized_property
    def tokens(self):
        """A list of :class:`Token` s for this sentence, produced by :attr:`word_tokenizer`."""
        return self.word_tokenizer.get_word_tokens(self)

    def _tokens_for_spans(self, spans):
        """Build one :class:`Token` per sentence-relative ``(start, end)`` span; token offsets are shifted by :attr:`start`."""
        toks = [Token(
            text=self.text[span[0]:span[1]],
            start=span[0] + self.start,
            end=span[1] + self.start,
            lexicon=self.lexicon
        ) for span in spans]
        return toks

    @property
    def raw_tokens(self):
        """A list of :class:`str` representations for the tokens in the object."""
        return [token.text for token in self.tokens]

    @memoized_property
    def pos_tagged_tokens(self):
        """The result of running :attr:`pos_tagger` over :attr:`raw_tokens`: a list of (token, :class:`str` tag) tuples."""
        # log.debug('Getting pos tags')
        return self.pos_tagger.tag(self.raw_tokens)

    @property
    def pos_tags(self):
        """A list of :class:`str` part of speech tags for the tokens in this sentence."""
        return [tag for token, tag in self.pos_tagged_tokens]

    @memoized_property
    def unprocessed_ner_tagged_tokens(self):
        """
        The result of running :attr:`ner_tagger` over :attr:`pos_tagged_tokens`: a list of
        (token, :class:`str` named entity recognition tag) tuples.

        No corrections from abbreviation detection are performed.
        """
        # log.debug('Getting unprocessed_ner_tags')
        return self.ner_tagger.tag(self.pos_tagged_tokens)

    @memoized_property
    def unprocessed_ner_tags(self):
        """
        A list of :class:`str` unprocessed named entity tags for the tokens in this sentence.

        No corrections from abbreviation detection are performed.
        """
        return [tag for token, tag in self.unprocessed_ner_tagged_tokens]

    @memoized_property
    def abbreviation_definitions(self):
        """
        A list of all abbreviation definitions in this sentence. Each abbreviation is in the form
        (list of abbreviation tokens, list of long form tokens, :class:`str` ner_tag or None)
        """
        abbreviations = []
        if self.abbreviation_detector:
            # log.debug('Detecting abbreviations')
            ners = self.unprocessed_ner_tags
            raw_tokens = self.raw_tokens
            for abbr_span, long_span in self.abbreviation_detector.detect_spans(raw_tokens):
                abbr = raw_tokens[abbr_span[0]:abbr_span[1]]
                long_form = raw_tokens[long_span[0]:long_span[1]]
                # Check if long is entirely tagged as one named entity type
                long_tags = ners[long_span[0]:long_span[1]]
                # tag[2:] drops the two-character 'B-' / 'I-' prefix
                unique_tags = set([tag[2:] for tag in long_tags if tag is not None])
                tag = long_tags[0][2:] if None not in long_tags and len(unique_tags) == 1 else None
                abbreviations.append((abbr, long_form, tag))
        return abbreviations

    @memoized_property
    def ner_tagged_tokens(self):
        """
        A list of (:class:`str` token text, :class:`str` named entity recognition tag)
        from the sentence.
        """
        return list(zip(self.raw_tokens, self.ner_tags))

    @memoized_property
    def ner_tags(self):
        """
        A list of named entity tags corresponding to each of the tokens in the object.
        For information on what each of the tags can be, check the documentation on
        the specific :attr:`ner_tagger` used for this object.

        Starts from :attr:`unprocessed_ner_tags` and re-tags every occurrence of a known
        abbreviation with the entity tag of its long form. Abbreviation definitions are taken
        from the containing document when there is one, otherwise from this sentence.
        """
        # log.debug('Getting ner_tags')
        # Work on a copy: unprocessed_ner_tags is memoized and must keep its uncorrected values.
        ner_tags = list(self.unprocessed_ner_tags)
        abbrev_defs = self.document.abbreviation_definitions if self.document else self.abbreviation_definitions
        raw_tokens = self.raw_tokens
        # Ensure abbreviation entity matches long entity
        # TODO: This is potentially a performance bottleneck?
        for i in range(0, len(ner_tags)):
            for abbr, long_form, ner_tag in abbrev_defs:
                if abbr == raw_tokens[i:i+len(abbr)]:
                    old_ner_tags = ner_tags[i:i+len(abbr)]
                    ner_tags[i] = 'B-%s' % ner_tag if ner_tag is not None else None
                    ner_tags[i+1:i+len(abbr)] = ['I-%s' % ner_tag if ner_tag is not None else None] * (len(abbr) - 1)
                    # Remove ner tags from brackets surrounding abbreviation
                    # NOTE(review): `i > 1` skips an opening bracket at token index 0 - possibly meant `i > 0`; confirm.
                    if i > 1 and raw_tokens[i-1] == '(':
                        ner_tags[i-1] = None
                    # NOTE(review): for multi-token abbreviations the closing bracket would be at i+len(abbr), not i+1; confirm.
                    if i < len(raw_tokens) - 1 and raw_tokens[i+1] == ')':
                        ner_tags[i+1] = None
                    if not old_ner_tags == ner_tags[i:i+len(abbr)]:
                        log.debug('Correcting abbreviation tag: %s (%s): %s -> %s', ' '.join(abbr), ' '.join(long_form), old_ner_tags, ner_tags[i:i+len(abbr)])
        # TODO: Ensure abbreviations in brackets at the end of an entity match are separated and the brackets untagged
        # Hydrogen Peroxide (H2O2)
        # Tungsten Carbide (WC)
        # TODO: Filter off alphanumerics from end (1h) (3) (I)
        # May need more intelligent
        return ner_tags

    @memoized_property
    def cems(self):
        """
        A list of all Chemical Entity Mentions in this text as :class:`~chemdataextractor.doc.text.Span`

        Each match of ``chemical_name`` over :attr:`tagged_tokens` is trimmed (ignored prefixes/suffixes,
        one pair of enclosing brackets), optionally split into components (``SPLITS``) and reduced to
        sub-matches (``SPECIALS``).
        """
        # log.debug('Getting cems')
        spans = []
        for result in chemical_name.scan(self.tagged_tokens):
            # parser scan yields (result, startindex, endindex) - we just use the indexes here
            tokens = self.tokens[result[1]:result[2]]
            start = tokens[0].start
            end = tokens[-1].end
            # Adjust boundaries to exclude disallowed prefixes/suffixes
            currenttext = self.text[start-self.start:end-self.start].lower()
            for prefix in IGNORE_PREFIX:
                if currenttext.startswith(prefix):
                    start += len(prefix)
                    break
            for suffix in IGNORE_SUFFIX:
                if currenttext.endswith(suffix):
                    end -= len(suffix)
                    break
            # Adjust boundaries to exclude matching brackets at start and end
            currenttext = self.text[start-self.start:end-self.start]
            for bpair in [('(', ')'), ('[', ']')]:
                if len(currenttext) > 2 and currenttext[0] == bpair[0] and currenttext[-1] == bpair[1]:
                    level = 1
                    for k, char in enumerate(currenttext[1:]):
                        if char == bpair[0]:
                            level += 1
                        elif char == bpair[1]:
                            level -= 1
                        if level == 0:
                            # The opening bracket closes here. Strip the pair only if this is the
                            # final character; otherwise (e.g. "(a)(b)") first and last do not match.
                            if k == len(currenttext) - 2:
                                start += 1
                                end -= 1
                            break

            # If entity has been reduced to nothing by adjusting boundaries, skip it
            if start >= end:
                continue

            currenttext = self.text[start-self.start:end-self.start]

            # Do splits
            split_spans = []
            # Separators: '-', '+', '-to-' (with optional adjacent brackets), three middle dots, '/', whitespace
            comps = list(regex_span_tokenize(currenttext, r'(-|\+|\)?-to-\(?|···|/|\s)'))
            if len(comps) > 1:
                for split in SPLITS:
                    if all(re.search(split, currenttext[comp[0]:comp[1]]) for comp in comps):
                        for comp in comps:
                            span = Span(text=currenttext[comp[0]:comp[1]], start=start+comp[0], end=start+comp[1])
                            split_spans.append(span)
                        break
                else:
                    # No split pattern matched every component: keep the mention whole
                    split_spans.append(Span(text=currenttext, start=start, end=end))
            else:
                split_spans.append(Span(text=currenttext, start=start, end=end))

            # Do specials
            for split_span in split_spans:
                for special in SPECIALS:
                    m = re.search(special, split_span.text)
                    if m:
                        # Each capture group of the first matching pattern becomes its own span
                        for i in range(1, len(m.groups()) + 1):
                            span = Span(text=m.group(i), start=split_span.start+m.start(i), end=split_span.start+m.end(i))
                            spans.append(span)
                        break
                else:
                    spans.append(split_span)
        return spans

    @memoized_property
    def definitions(self):
        """
        Return specifier definitions from this sentence

        A definition consists of:
        a) A definition -- The quantity being defined e.g. "Curie Temperature"
        b) A specifier -- The symbol used to define the quantity e.g. "Tc"
        c) Tokens -- The tagged tokens covered by the definition
        d) Start -- The index of the starting point of the definition
        e) End -- The index of the end point of the definition

        :return: list -- The specifier definitions
        """
        defs = []
        tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
        for result in specifier_definition.scan(tagged_tokens):
            definition = result[0]
            start = result[1]
            end = result[2]
            new_def = {
                'definition': first(definition.xpath('./phrase/text()')),
                'specifier': first(definition.xpath('./specifier/text()')),
                'tokens': tagged_tokens[start:end],
                'start': start,
                'end': end}
            defs.append(new_def)
        return defs

    @memoized_property
    def chemical_definitions(self):
        """
        Return a list of chemical entity mentions and their associated label.

        Each entry is a dict with the keys 'name', 'label', 'start' and 'end'; matches
        lacking either a name or a label are skipped.
        """
        cem_defs = []
        tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
        for result in cem_phrase.scan(tagged_tokens):
            tree = result[0]
            start = result[1]
            end = result[2]
            name = first(tree.xpath('./compound/names/text()'))
            label = first(tree.xpath('./compound/labels/text()'))
            if name and label:
                cem_def = {
                    'name': name,
                    'label': label,
                    'start': start,
                    'end': end
                }
                cem_defs.append(cem_def)
        return cem_defs

    @memoized_property
    def tags(self):
        """A list of tags, one per token: the named entity tag where there is one, otherwise the part of speech tag."""
        tags = self.pos_tags
        for i, tag in enumerate(self.ner_tags):
            if tag is not None:
                tags[i] = tag
        return tags

    @property
    def tagged_tokens(self):
        """
        A list of (:class:`str` token text, :class:`str` tag) tuples pairing :attr:`raw_tokens` with :attr:`tags`.
        """
        return list(zip(self.raw_tokens, self.tags))

    @property
    def quantity_re(self):
        """The result of ``construct_quantity_re`` applied to this element's ``_streamlined_models``."""
        return construct_quantity_re(*self._streamlined_models)

    @property
    def records(self):
        """All records found in the object, as a list of :class:`~chemdataextractor.model.base.BaseModel`."""
        records = ModelList()
        seen_labels = set()
        # Ensure no control characters are sent to a parser (need to be XML compatible)
        tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
        for model in self._streamlined_models:
            for parser in model.parsers:
                if hasattr(parser, 'parse_sentence'):
                    for record in parser.parse_sentence(tagged_tokens):
                        p = record.serialize()
                        if record.is_empty:  # TODO: Potential performance issues?
                            continue
                        # Skip duplicate records
                        if record in records:
                            continue
                        # Skip just labels that have already been seen (bit of a hack)
                        if (isinstance(record, Compound) and 'Compound' in p.keys() and
                                all(k in {'labels', 'roles'} for k in p['Compound'].keys()) and
                                set(record.labels).issubset(seen_labels)):
                            continue
                        if isinstance(record, Compound):
                            seen_labels.update(record.labels)
                            # Fold this compound into every already-collected compound sharing a name or label.
                            # This could be super slow if we find lots of things
                            found = False
                            for seen_record in records:
                                if (isinstance(seen_record, Compound)
                                        and (not set(record.names).isdisjoint(seen_record.names)
                                             or not set(record.labels).isdisjoint(seen_record.labels))):
                                    seen_record.names = sorted(list(set(seen_record.names).union(record.names)))
                                    seen_record.labels = sorted(list(set(seen_record.labels).union(record.labels)))
                                    seen_record.roles = sorted(list(set(seen_record.roles).union(record.roles)))
                                    found = True
                            if found:
                                continue
                        elif hasattr(record, 'compound') and record.compound is not None:
                            seen_labels.update(record.compound.labels)
                        records.append(record)
        # Offer every record to every other record for merging
        length = len(records)
        for i in range(length):
            for j in range(length):
                if i != j:
                    records[j].merge_all(records[i])
        return records

    def __add__(self, other):
        # Only sentences of exactly the same class are merged; anything else defers to Python.
        if type(self) == type(other):
            merged = self.__class__(
                text=self.text + other.text,
                start=self.start,
                end=None,
                id=self.id or other.id,
                references=self.references + other.references,
                word_tokenizer=self.word_tokenizer,
                lexicon=self.lexicon,
                abbreviation_detector=self.abbreviation_detector,
                pos_tagger=self.pos_tagger,
                ner_tagger=self.ner_tagger,
            )
            return merged
        return NotImplemented
class Cell(Sentence):
    """Data cell for tables. One row of the category table"""

    # It appears that using different tokenizers/taggers is making the cem recognition worse.
    # This is also consistent with the use of the regular expressions etc we have defined so far.
    # word_tokenizer = FineWordTokenizer()
    # pos_tagger = NoneTagger()
    # ner_tagger = NoneTagger()

    def __init__(self, *args, **kwargs):
        """Initialise as a :class:`Sentence`; the table-specific attributes start as ``None``."""
        super(Cell, self).__init__(*args, **kwargs)
        self.data = self.row_categories = self.col_categories = None

    @classmethod
    def from_tdecell(cls, tde_cell, **kwargs):
        """
        Build a cell from an indexable ``tde_cell`` whose items 0, 1 and 2 are the data string,
        the row categories and the column categories.

        :param tde_cell: The source cell.
        :param kwargs: Forwarded to the class constructor.
        :return: The new cell, with ``data``, ``row_categories`` and ``col_categories`` set.
        """
        data = tde_cell[0]
        row_categories = tde_cell[1]
        col_categories = tde_cell[2]
        # Have the spacing between the cells contain characters that will never be found
        # so that the system doesn't become confused because it found some number in the heading
        # that it confuses as a power for a unit.
        pieces = [data, ' '.join(row_categories), ' '.join(col_categories)]
        cell = cls(' sdfkljlk '.join(pieces), **kwargs)
        cell.data = data
        cell.row_categories = row_categories
        cell.col_categories = col_categories
        return cell

    @memoized_property
    def abbreviation_definitions(self):
        """Empty list. Abbreviation detection is disabled within table cells."""
        return list()

    @property
    def records(self):
        """Empty list. Individual cells don't provide records, this is handled by the parent Table."""
        return list()
@python_2_unicode_compatible
class Span(object):
    """A text span within a sentence."""

    def __init__(self, text, start, end):
        """
        :param str text: The text contained by this span.
        :param int start: The start offset of this token in the original text.
        :param int end: The end offset of this token in the original text.
        """
        #: The :class:`str` text content of this span.
        self.text = text
        #: The :class:`int` start offset of this token in the original text.
        self.start = start
        #: The :class:`int` end offset of this token in the original text.
        self.end = end

    def __repr__(self):
        fields = (self.__class__.__name__, self.text, self.start, self.end)
        return '%s(%r, %r, %r)' % fields

    def __str__(self):
        return self.text

    def __eq__(self, other):
        """Span objects are equal if the source text is equal, and the start and end indices are equal."""
        if isinstance(other, self.__class__):
            same_text = self.text == other.text
            return same_text and self.start == other.start and self.end == other.end
        return False

    def __ne__(self, other):
        return not (self == other)

    def __hash__(self):
        key = (self.text, self.start, self.end)
        return hash(key)

    @property
    def length(self):
        """The :class:`int` offset length of this span in the original text."""
        return self.end - self.start
class Token(Span):
    """A single token within a sentence. Corresponds to a word, character, punctuation etc."""

    def __init__(self, text, start, end, lexicon):
        """
        :param str text: The text contained by this token.
        :param int start: The start offset of this token in the original text.
        :param int end: The end offset of this token in the original text.
        :param Lexicon lexicon: The lexicon which contains this token.
        """
        super(Token, self).__init__(text, start, end)
        #: The lexicon for this token.
        self.lexicon = lexicon
        # Register the token text with the lexicon as soon as the token is created.
        self.lexicon.add(text)

    @property
    def lex(self):
        """The corresponding :class:`chemdataextractor.nlp.lexicon.Lexeme` entry in the Lexicon for this token."""
        lexeme = self.lexicon[self.text]
        return lexeme