# -*- coding: utf-8 -*-
"""
Base classes for parsing sentences and tables.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import abstractproperty, abstractmethod
from .quantity import extract_error, extract_units, extract_value
import logging
log = logging.getLogger(__name__)
class BaseParser(object):
    """
    Common base for all parsers.

    Subclasses provide :attr:`root` (the parse element that is scanned over
    the tokens) and :meth:`interpret` (which turns each scan result into
    models). The class attributes below are optional configuration hooks,
    all of which default to ``None`` (disabled).

    .. note::
        ``root`` and ``interpret`` are only *marked* abstract; this class does
        not use :class:`abc.ABCMeta`, so instantiation is not blocked when a
        subclass fails to override them.
    """

    #: Not read in this module; presumably the model class this parser
    #: produces records for -- TODO confirm against the model definitions.
    model = None

    #: Optional :class:`~chemdataextractor.parse.elements.BaseParserElement` instance.
    #: All sentences are run through this before the full root phrase is applied to the
    #: sentence. If nothing is found for this phrase, the sentence will not go through
    #: the full root phrase. This is done for performance reasons, and if not set,
    #: ChemDataExtractor will perform as it did in previous versions. If this phrase is
    #: set to an appropriate value, it can help ChemDataExtractor perform at up to 2x
    #: its previous speed.
    #:
    #: To ensure that this works as intended, the
    #: :class:`~chemdataextractor.parse.elements.BaseParserElement`
    #: should be a simple parse rule (substantially simpler than the
    #: :class:`~chemdataextractor.parse.base.BaseParser.root`)
    #: that takes little time to process.
    trigger_phrase = None

    #: Optional parse element scanned over section headings; a match marks the
    #: section as one to skip (see ``BaseSentenceParser.should_read_section``).
    skip_section_phrase = None

    #: Optional parse element scanned over section headings; a match forces the
    #: section to be read, taking precedence over ``skip_section_phrase``
    #: (see ``BaseSentenceParser.should_read_section``).
    allow_section_phrase = None

    @abstractproperty
    def root(self):
        """
        The full parse element for this parser.

        Subclasses are expected to override this; the default returns ``None``.
        """
        pass

    @abstractmethod
    def interpret(self, result, start, end):
        """
        Convert a single scan result into models.

        Called with the unpacked items yielded by ``root.scan(...)``. Callers
        iterate over the return value, so overrides must return an iterable
        (typically a generator) of models.

        :param result: The parse result produced by :attr:`root`.
        :param start: Start position reported by the scan for this result.
        :param end: End position reported by the scan for this result.
        """
        pass
class BaseSentenceParser(BaseParser):
    """
    Base class for parsing sentences. To implement a parser for a new property,
    implement the interpret function.
    """

    #: Not read by this class itself; presumably consulted by the code that
    #: drives sentence parsing -- TODO confirm against the caller.
    parse_full_sentence = False

    def should_read_section(self, heading):
        """
        Decide whether the section introduced by ``heading`` should be parsed.

        Each sentence of the heading is checked in order:

        * if :attr:`allow_section_phrase` is set and matches, the section is
          read (this wins immediately, even if an earlier sentence matched the
          skip phrase);
        * otherwise, if :attr:`skip_section_phrase` is set and matches, the
          section is marked as skipped, but later sentences are still checked
          so that an allow match can override it.

        With neither phrase set, every section is read.

        :param heading: Object exposing ``sentences``, each of which exposes
            ``tokens``.
        :returns: ``True`` if the section should be read, ``False`` otherwise.
        :rtype: bool
        """
        allow_phrase = self.allow_section_phrase
        skip_phrase = self.skip_section_phrase
        should_read = True
        for sentence in heading.sentences:
            # Only the existence of a match matters, so stop at the first hit
            # instead of collecting every scan result.
            if allow_phrase is not None and any(
                    True for _ in allow_phrase.scan(sentence.tokens)):
                return True
            if skip_phrase is not None and any(
                    True for _ in skip_phrase.scan(sentence.tokens)):
                should_read = False
        return should_read

    def parse_sentence(self, sentence):
        """
        Parse a sentence. This function is primarily called by the
        :attr:`~chemdataextractor.doc.text.Sentence.records` property of
        :class:`~chemdataextractor.doc.text.Sentence`.

        If :attr:`trigger_phrase` is set and finds nothing in the sentence, the
        (more expensive) root phrase is not run and nothing is yielded.

        :param sentence: The sentence to parse; its ``tokens`` attribute is
            what gets scanned by the trigger and root phrases.
        :returns: All the models found in the sentence.
        :rtype: Iterator[:class:`chemdataextractor.model.base.BaseModel`]
        """
        trigger = self.trigger_phrase
        # Cheap pre-filter: one trigger match is enough, so short-circuit on
        # the first scan result rather than materialising all of them.
        if trigger is not None and not any(
                True for _ in trigger.scan(sentence.tokens)):
            return
        for result in self.root.scan(sentence.tokens):
            # Explicit loop rather than ``yield from``: the module keeps
            # Python 2 compatible syntax (see the ``__future__`` imports).
            for model in self.interpret(*result):
                yield model
class BaseTableParser(BaseParser):
    """
    Base class for parsing new-style tables. To implement a parser for a new property,
    implement the interpret function.
    """

    def parse_cell(self, cell):
        """
        Parse a cell. This function is primarily called by the
        :attr:`~chemdataextractor.doc.table.Table.records` property of
        :class:`~chemdataextractor.doc.table.Table`.

        If :attr:`trigger_phrase` is set and finds nothing in the cell, or if
        :attr:`root` is ``None``, nothing is yielded. ``AttributeError`` and
        ``TypeError`` raised while interpreting a result are logged and that
        result is abandoned; parsing continues with the next result.

        :param cell: The cell to parse; its ``tokens`` attribute is what gets
            scanned. According to the original documentation these tokens are
            in the same form as
            :attr:`chemdataextractor.doc.text.Sentence.tagged_tokens`, after the
            category table has been flattened into a sentence.
        :returns: All the models found in the table.
        :rtype: Iterator[:class:`chemdataextractor.model.base.BaseModel`]
        """
        trigger = self.trigger_phrase
        # Cheap pre-filter: one trigger match is enough, so short-circuit on
        # the first scan result rather than materialising all of them.
        if trigger is not None and not any(
                True for _ in trigger.scan(cell.tokens)):
            return
        # Read the property once; it is only evaluated when the trigger passed.
        root = self.root
        if root is None:
            return
        for result in root.scan(cell.tokens):
            try:
                for model in self.interpret(*result):
                    yield model
            except (AttributeError, TypeError) as e:
                # Best-effort parsing: one bad result must not abort the whole
                # cell. Report through the module logger instead of stdout.
                log.warning(
                    "%s could not interpret a table parse result: %s",
                    type(self).__name__, e)