Source code for chemdataextractor.parse.base

# -*- coding: utf-8 -*-
"""
Base classes for parsing sentences and tables.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import abstractproperty, abstractmethod
from .quantity import extract_error, extract_units, extract_value
import logging

log = logging.getLogger(__name__)


class BaseParser(object):
    """
    Common base class for the sentence and table parsers.

    Subclasses are expected to provide :attr:`root` and :meth:`interpret`.
    The remaining methods are thin conveniences around the helpers imported
    from the ``quantity`` module.

    NOTE(review): the class is declared without an ``ABCMeta`` metaclass, so the
    abstract markers on :attr:`root` and :meth:`interpret` do not appear to be
    enforced at instantiation time.
    """

    #: Model associated with this parser. :meth:`extract_units` reads
    #: ``self.model.dimensions``, so it must be set before that method is used.
    model = None

    #: Optional :class:`~chemdataextractor.parse.elements.BaseParserElement` instance.
    #: All sentences are run through this before the full root phrase is applied to
    #: the sentence. If nothing is found for this phrase, the sentence will not go
    #: through the full root phrase. This is done for performance reasons, and if
    #: not set, ChemDataExtractor will perform as it did in previous versions. If
    #: this phrase is set to an appropriate value, it can help ChemDataExtractor
    #: perform at up to 2x its previous speed.
    #:
    #: To ensure that this works as intended, the
    #: :class:`~chemdataextractor.parse.elements.BaseParserElement` should be a
    #: simple parse rule (substantially simpler than the
    #: :class:`~chemdataextractor.parse.base.BaseParser.root`) that takes little
    #: time to process.
    trigger_phrase = None

    #: Optional parse element scanned against section-heading sentences; a match
    #: marks the section as one to skip.
    skip_section_phrase = None

    #: Optional parse element scanned against section-heading sentences; a match
    #: marks the section as one to read.
    allow_section_phrase = None

    @abstractproperty
    def root(self):
        """
        Root parse element of this parser.

        The concrete parsers in this module call ``self.root.scan(tokens)``,
        so an override should return an object offering ``scan``.
        """
        pass

    @abstractmethod
    def interpret(self, result, start, end):
        """
        Turn one scan result into models.

        The concrete parsers in this module call ``self.interpret(*result)`` for
        each item produced by ``root.scan`` and iterate over what it returns, so
        an override should return an iterable of models (or be a generator).

        :param result: The parse result produced by the root phrase.
        :param start: Start position reported by the scan (presumably a token index).
        :param end: End position reported by the scan (presumably a token index).
        """
        pass

    def extract_error(self, string):
        """
        Extract the error from a string.

        Usage::

            bp = BaseParser()
            test_string = '150±5'
            end_value = bp.extract_error(test_string)
            print(end_value) # 5

        :param str string: A representation of the value and error as a string
        :returns: The error expressed as a float.
        :rtype: float
        """
        # Inside a method body the bare name resolves to the module-level helper,
        # not to this method.
        error = extract_error(string)
        return error

    def extract_value(self, string):
        """
        Take a string and return a list of floats representing it.

        Usage::

            bp = BaseParser()
            test_string = '150 to 160'
            end_value = bp.extract_value(test_string)
            print(end_value) # [150., 160.]

        :param str string: A representation of the values as a string
        :returns: The value expressed as a list of floats of length 1 if the value
            had no range, and as a list of floats of length 2 if it was a range.
        :rtype: list(float)
        """
        values = extract_value(string)
        return values

    def extract_units(self, string, strict=False):
        """
        Take a string and return a Unit.

        Raises TypeError if strict and the dimensions do not match the expected
        dimensions or the string has extraneous characters, e.g. if a string Fe was
        given, and we were looking for a temperature, strict=False would return
        Fahrenheit, strict=True would raise a TypeError.

        Usage::

            bp = QuantityParser()
            bp.model = QuantityModel()
            bp.model.dimensions = Temperature() * Length()**0.5 * Time()**(1.5)
            test_string = 'Kh2/(km/s)-1/2'
            end_units = bp.extract_units(test_string, strict=True)
            print(end_units) # Units of: (10^1.5) * Hour^(2.0) Meter^(0.5) Second^(-0.5) Kelvin^(1.0)

        :param str string: A representation of the units as a string
        :param bool strict: Whether to raise a TypeError if the dimensions of the
            parsed units do not have the expected dimensions.
        :returns: The string expressed as a Unit
        :rtype: chemdataextractor.quantities.Unit
        """
        # The expected dimensions are taken from the parser's model.
        expected_dimensions = self.model.dimensions
        return extract_units(string, expected_dimensions, strict)
class BaseSentenceParser(BaseParser):
    """
    Base class for parsing sentences.

    To implement a parser for a new property, implement the ``interpret`` method.
    """

    parse_full_sentence = False

    def should_read_section(self, heading):
        """
        Decide from a section heading whether the section should be read.

        Each sentence of the heading is scanned in order. A match for
        ``allow_section_phrase`` settles the answer as ``True`` immediately. A
        match for ``skip_section_phrase`` turns the answer to ``False``, but later
        sentences are still examined and may turn it back via the allow phrase.
        With neither phrase set, or no match at all, the answer is ``True``.

        :param heading: Object exposing ``sentences``; each sentence's ``tokens``
            are passed to the phrases' ``scan`` method.
        :returns: Whether the section should be read.
        :rtype: bool
        """
        verdict = True
        for sentence in heading.sentences:
            allow_phrase = self.allow_section_phrase
            if allow_phrase is not None and list(allow_phrase.scan(sentence.tokens)):
                # An allow match wins outright; nothing later can change it.
                return True

            skip_phrase = self.skip_section_phrase
            if skip_phrase is not None and list(skip_phrase.scan(sentence.tokens)):
                verdict = False
        return verdict

    def parse_sentence(self, sentence):
        """
        Parse a sentence.

        This function is primarily called by the
        :attr:`~chemdataextractor.doc.text.Sentence.records` property of
        :class:`~chemdataextractor.doc.text.Sentence`.

        If ``trigger_phrase`` is set and finds nothing in the sentence, the root
        phrase is not run and no models are produced.

        :param sentence: The sentence to parse. Its ``tokens`` attribute is what
            is handed to the trigger phrase and to the root phrase.
        :returns: All the models found in the sentence.
        :rtype: Iterator[:class:`chemdataextractor.model.base.BaseModel`]
        """
        trigger = self.trigger_phrase
        if trigger is not None and not list(trigger.scan(sentence.tokens)):
            # Cheap pre-filter found nothing: skip the expensive root phrase.
            return

        for match in self.root.scan(sentence.tokens):
            for record in self.interpret(*match):
                yield record
class BaseTableParser(BaseParser):
    """
    Base class for parsing new-style tables.

    To implement a parser for a new property, implement the ``interpret`` method.
    """

    def parse_cell(self, cell):
        """
        Parse a cell.

        This function is primarily called by the
        :attr:`~chemdataextractor.doc.table.Table.records` property of
        :class:`~chemdataextractor.doc.table.Table`.

        If ``trigger_phrase`` is set and finds nothing in the cell, or if
        ``root`` is ``None``, no models are produced. An ``AttributeError`` or
        ``TypeError`` raised while interpreting one scan result is logged and
        that result is skipped; the remaining results are still processed.

        :param cell: The cell to parse. Its ``tokens`` attribute is what is handed
            to the trigger phrase and to the root phrase.
        :returns: All the models found in the cell.
        :rtype: Iterator[:class:`chemdataextractor.model.base.BaseModel`]
        """
        trigger = self.trigger_phrase
        if trigger is not None and not list(trigger.scan(cell.tokens)):
            # Cheap pre-filter found nothing: skip the expensive root phrase.
            return

        # Read the property once; it is needed for both the check and the scan.
        root = self.root
        if root is None:
            return

        for result in root.scan(cell.tokens):
            try:
                for model in self.interpret(*result):
                    yield model
            except (AttributeError, TypeError) as e:
                # Best-effort parsing: one bad result must not abort the cell,
                # but report it through the module logger rather than stdout.
                log.warning(
                    "%s.parse_cell skipped a result after %s: %s",
                    type(self).__name__,
                    type(e).__name__,
                    e,
                )