Source code for chemdataextractor.parse.auto

# -*- coding: utf-8 -*-
"""
Parser for automatic parsing, without user-written parsing rules.
Mainly used for tables.

Models must be constructed in a certain way for them to work optimally with autoparsers. Namely, they should have:

- A specifier field with an associated parse expression (Optional, only required if autoparsers are desired). These parse expressions will be updated automatically using forward-looking Interdependency Resolution if the updatable flag is set to True.
- These specifiers should also have required set to True so that spurious matches are not found.
- If applicable, a compound entity, named compound.

Any parse_expressions set in the model should have an added action to ensure that the results are a single word. An example would be to call add_action(join) on each parse expression.

.. codeauthor:: Taketomo Isazawa <[email protected]>

.. codeauthor:: Juraj Mavračić <[email protected]>
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import six
import copy

from .cem import cem, chemical_label, lenient_chemical_label
from .actions import merge, join
from .elements import W, I, R, T, Optional, Any, OneOrMore, Not, ZeroOrMore, Group, SkipTo, Or
from ..utils import first
from .quantity import magnitudes_dict, value_element, extract_units, value_element_plain, lbrct, rbrct
from .base import BaseSentenceParser, BaseParser, BaseTableParser

import xml.etree.ElementTree as etree

log = logging.getLogger(__name__)


def construct_unit_element(dimensions):
    """
    Build a parser element that recognises unit tokens for the given dimensions.

    The generated regular expressions accept an optional magnitude prefix (taken
    from ``magnitudes_dict``) followed by one or more of: a bracket, a hyphen,
    a unit pattern from ``dimensions.units_dict`` or a slash. A second variant
    additionally accepts signed numbers so that powers can be matched.

    :param Dimension dimensions: The dimensions whose units should be detected.
    :returns: An element matching one or more unit tokens, merged into a single result.
        None is returned if ``dimensions`` is falsy or has an empty ``units_dict``.
    :rtype: BaseParserElement or None
    """
    if not dimensions or not dimensions.units_dict:
        return None

    # Every alternative carries a trailing '|', mirroring how the pattern is assembled below.
    magnitude_alternatives = ''.join(
        '(' + magnitude.pattern + ')|' for magnitude in magnitudes_dict.keys())
    unit_alternatives = ''.join(
        '(' + unit.pattern + ')|' for unit in dimensions.units_dict)

    # Optional magnitude prefix; the slice removes the trailing '|' of the last magnitude.
    core = ('^((' + magnitude_alternatives)[:-1]
    core += ')?('
    # Tokens that are just brackets or a hyphen
    core += r'((\(|\[))|((\)|\]))|\-|'
    # All the units, followed by the division sign
    core += unit_alternatives + r'(\/)'

    # Variant that also accepts powers (signed, optionally decimal numbers)
    with_powers = core + r'|([\+\-–−]?\d+(\.\d+)?)' + '))+$'
    # Variant that must start with a unit-like part and may continue with units or powers;
    # the slice drops the leading '^' and the trailing '+$' of the powers variant.
    units_only_start = core + '))+' + with_powers[1:-2] + '*' + '$'

    leading = R(pattern=units_only_start)
    following = ZeroOrMore(R(pattern=units_only_start) | R(pattern=with_powers))
    return (leading + following).add_action(merge)


def construct_category_element(category_dict):
    """
    Construct an element for detecting categories.

    The resulting element matches a token only if the *whole* token matches one
    of the category patterns, and tags the match as ``raw_value``.

    :param category_dict: Iterable of objects exposing a ``pattern`` attribute
        (presumably a dict keyed by compiled regular expressions; verify against callers).
    :returns: An element matching any one of the categories, or None if ``category_dict`` is empty or None.
    :rtype: BaseParserElement or None
    """
    if not category_dict:
        return None
    alternatives = '|'.join('(' + element.pattern + ')' for element in category_dict)
    # The alternatives must be grouped before anchoring: in '^(a)|(b)$' the '^' applies only
    # to the first alternative and the '$' only to the last one, so partial tokens would match.
    category_regex = '^(' + alternatives + ')$'
    return (R(pattern=category_regex))('raw_value').add_action(merge)


def match_dimensions_of(model):
    """
    Create a predicate that tells whether parsed units fit the dimensions of ``model``.

    :param QuantityModel model: The model whose ``dimensions`` the parsed text is checked against.
    :returns: A function returning True if the text of the first parsed element can be
        extracted as units of the model's dimensions in strict mode, and False if that
        extraction raises a TypeError.
    :rtype: function(tuple(list(Element), int) -> bool)
    """
    def check_match(result):
        try:
            # Only the success or failure of the strict extraction matters, not its value.
            extract_units(result[0].text, model.dimensions, strict=True)
        except TypeError as mismatch:
            log.debug(mismatch)
            return False
        else:
            return True

    return check_match


def create_entities_list(entities):
    """
    Combine a list of parser entities into a single alternative using ``|``.

    For example, with 4 entities in the list, the output is equivalent to::

        (((entities[0] | entities[1]) | entities[2]) | entities[3])

    :param entities: Non-empty sequence of BaseParserElement type objects
    :return: BaseParserElement type object; for a single entity, that entity itself
    """
    head, tail = entities[0], entities[1:]
    combined = head
    for alternative in tail:
        combined = combined | alternative
    return combined


class BaseAutoParser(BaseParser):
    """
    Base class for parsers that turn a parse result into a record of ``model``
    by reading the result's child elements named after the model's fields.
    """
    # Model class instantiated by interpret(); None until a model is assigned.
    model = None
    _specifier = None
    _root_phrase = None

    def __init__(self):
        super(BaseAutoParser, self).__init__()
        # Starts as None; AutoSentenceParser.trigger_phrase later stores a field name or False here.
        self._trigger_property = None

    def interpret(self, result, start, end):
        """
        Build a record of ``self.model`` from a parse result and yield it if it is complete.

        :param result: Parse result that is queried with ``xpath``; nothing is yielded if it is None.
        :param start: Not used by this implementation.
        :param end: Not used by this implementation.
        :returns: Generator yielding at most one model instance, and only if its
            ``noncontextual_required_fulfilled`` attribute is truthy.
        """
        if result is None:
            return

        property_entities = {}
        if hasattr(self.model, 'dimensions'):
            # Truthy dimensions: value with units. Falsy dimensions: dimensionless value.
            has_units = bool(self.model.dimensions)

            raw_value = first(result.xpath('./raw_value/text()'))
            if has_units:
                raw_units = first(result.xpath('./raw_units/text()'))
            else:
                log.debug(raw_value)

            # 'NoValue' is a placeholder for which no numerical value is extracted
            value = self.extract_value(raw_value) if raw_value != 'NoValue' else None
            error = self.extract_error(raw_value)

            if has_units:
                try:
                    units = self.extract_units(raw_units, strict=True)
                except TypeError as unit_error:
                    # Units that cannot be extracted are recorded as None
                    log.debug(unit_error)
                    units = None
                property_entities.update({"raw_value": raw_value,
                                          "raw_units": raw_units,
                                          "value": value,
                                          "error": error,
                                          "units": units})
            else:
                property_entities.update({"raw_value": raw_value,
                                          "value": value,
                                          "error": error})

        # The value-related fields were handled above; everything else is looked up by field name.
        handled_fields = ['raw_value', 'raw_units', 'value', 'units', 'error']
        for field_name, field in six.iteritems(self.model.fields):
            if field_name in handled_fields:
                continue
            try:
                data = self._get_data(field_name, field, result)
                if data is not None:
                    property_entities.update(data)
            except TypeError as missing_field:
                # a required field could not be found, so it is left out of the record
                log.debug(self.model)
                log.debug(missing_field)

        model_instance = self.model(**property_entities)
        if not model_instance.noncontextual_required_fulfilled:
            return
        # records the parser that was used to generate this record, can be used for evaluation
        model_instance.record_method = self.__class__.__name__
        yield model_instance

    def _get_data(self, field_name, field, result):
        """
        Extract the data for a single field from a parse result.

        :param str field_name: Name of the field, also used as the name of the child element to look up.
        :param field: The field; may wrap a model (``model_class``) or another field (``field``).
        :param result: Parse result that is queried with ``xpath``.
        :returns: A dictionary with ``field_name`` as its only key, or None if nothing was found
            for a model field or a list field.
        :raises TypeError: If no element is found for a field that is required and not contextual.
        """
        if hasattr(field, 'model_class'):
            # Field holding a nested model: build that model from the child element
            nested_result = first(result.xpath('./' + field_name))
            if nested_result is None:
                if field.required and not field.contextual:
                    raise TypeError('Could not find element for ' + str(field_name))
                return None
            nested_data = {}
            for subfield_name, subfield in six.iteritems(field.model_class.fields):
                subfield_data = self._get_data(subfield_name, subfield, nested_result)
                if subfield_data is not None:
                    nested_data.update(subfield_data)
            nested_object = field.model_class(**nested_data)
            log.debug('Created for' + field_name)
            log.debug(nested_object)
            return {field_name: nested_object}

        if hasattr(field, 'field'):
            # Field wrapping another field (list type): only the first match is used,
            # and the required/contextual flags are read from the wrapped field.
            inner_field = field.field
            inner_data = self._get_data(field_name, inner_field, result)
            if inner_data is None:
                if inner_field.required and not inner_field.contextual:
                    raise TypeError('Could not find element for ' + str(field_name))
                return None
            if inner_data[field_name] is None:
                return None
            return {field_name: [inner_data[field_name]]}

        # Plain field: take the text of the first matching child element
        text = first(result.xpath('./' + field_name + '/text()'))
        if text is None and field.required and not field.contextual:
            raise TypeError('Could not find element for ' + str(field_name))
        return {field_name: text}


class AutoSentenceParser(BaseAutoParser, BaseSentenceParser):
    """
    Sentence parser whose grammar is assembled automatically from the parse
    expressions of the fields of ``model``.
    """

    def __init__(self, lenient=False, chem_name=(cem | chemical_label | lenient_chemical_label)):
        """
        :param bool lenient: If True, values without units are also accepted for models with dimensions.
        :param chem_name: Parse element used to find chemical names; it is added to the grammar last.
        """
        super(AutoSentenceParser, self).__init__()
        self.lenient = lenient
        self.chem_name = chem_name

    @property
    def trigger_phrase(self):
        """
        Parse expression of the first model field that is required and not contextual.

        The name of that field is remembered in ``_trigger_property`` so that the search
        is only done once. If there is no such field, ``_trigger_property`` is set to
        False and None is returned, now and on every later access.
        """
        if self._trigger_property is False:
            return None
        if self._trigger_property is None:
            for field_name, field in six.iteritems(self.model.fields):
                if field.required and not field.contextual:
                    self._trigger_property = field_name
                    break
            else:
                # No suitable field exists; remember that so the search is not repeated
                self._trigger_property = False
                return None
        return self.model.fields[self._trigger_property].parse_expression

    @property
    def root(self):
        """
        Root phrase matching the entities of the model in any order, tagged ``root_phrase``.
        """
        model = self.model
        chem_name = self.chem_name
        # is always found, our models currently rely on the compound
        labels = model.compound.model_class.labels.parse_expression('labels')
        entities = [labels]

        if hasattr(model, 'dimensions'):
            if model.dimensions:
                # Quantity model: the value has to come with units, unless the parser is lenient
                unit_element = Group(
                    construct_unit_element(model.dimensions).with_condition(match_dimensions_of(model))('raw_units'))
                specifier = model.specifier.parse_expression('specifier')
                value_phrase = value_element(unit_element)
                if self.lenient:
                    value_phrase = (value_phrase | value_element_plain())
            else:
                # Dimensionless model: a plain value without units
                specifier = model.specifier.parse_expression('specifier')
                value_phrase = value_element_plain()
            entities.append(specifier)
            entities.append(value_phrase)
        elif hasattr(model, 'specifier'):
            # An element that has no value but some custom string; all entities are
            # custom except for the specifier
            entities.append(model.specifier.parse_expression('specifier'))

        # The remaining, user-defined, entities are tagged with the name of their field
        already_handled = ['raw_value', 'raw_units', 'value', 'units', 'error', 'specifier']
        for field in model.fields:
            if field in already_handled:
                continue
            # NOTE(review): this lookup form is kept exactly as written; it appears to read the
            # field object from the model class itself -- confirm before replacing it with getattr.
            parse_expression = model.__getattribute__(model, field).parse_expression
            if parse_expression is not None:
                entities.append(parse_expression(field))

        # the chem_name has to be parsed last in order to avoid a conflict with other elements of the model
        entities.append(chem_name)

        # Any of the entities, optionally skipping ahead to the next one, one or more times
        any_entity = create_entities_list(entities)
        return OneOrMore(any_entity + Optional(SkipTo(any_entity)))('root_phrase')


class AutoTableParser(BaseAutoParser, BaseTableParser):
    """
    Table parser whose grammar is assembled automatically from the parse
    expressions of the fields of ``model``.
    """

    def __init__(self, chem_name=(cem | chemical_label | lenient_chemical_label)):
        """
        :param chem_name: Parse element used to find chemical names; it is added to the grammar last.
        """
        super(AutoTableParser, self).__init__()
        self.chem_name = chem_name

    @property
    def root(self):
        """
        Root phrase matching the entities of the model in any order, tagged ``root_phrase``.

        Compared to a plain value, the word ``NoValue`` is also accepted as ``raw_value``,
        and for models with dimensions the units may follow either the specifier or the value.
        """
        model = self.model
        chem_name = self.chem_name
        # is always found, our models currently rely on the compound
        labels = model.compound.model_class.labels.parse_expression('labels')
        entities = [labels]
        no_value_element = W('NoValue')('raw_value')

        if hasattr(model, 'dimensions'):
            if model.dimensions:
                # Quantity model: units are optional after the specifier and after the value
                unit_element = Group(
                    construct_unit_element(model.dimensions).with_condition(match_dimensions_of(model))('raw_units'))
                specifier = (model.specifier.parse_expression('specifier')
                             + Optional(W('/'))
                             + Optional(unit_element))
                value_phrase = ((value_element_plain() | no_value_element) + Optional(unit_element))
            else:
                # Dimensionless model: a plain value or the NoValue placeholder
                specifier = model.specifier.parse_expression('specifier')
                value_phrase = value_element_plain() | no_value_element
            entities.append(specifier)
            entities.append(value_phrase)
        elif hasattr(model, 'specifier'):
            # An element that has no value but some custom string; all entities are
            # custom except for the specifier
            entities.append(model.specifier.parse_expression('specifier'))

        # The remaining, user-defined, entities are tagged with the name of their field
        already_handled = ['raw_value', 'raw_units', 'value', 'units', 'error', 'specifier']
        for field in model.fields:
            if field in already_handled:
                continue
            # NOTE(review): this lookup form is kept exactly as written; it appears to read the
            # field object from the model class itself -- confirm before replacing it with getattr.
            parse_expression = model.__getattribute__(model, field).parse_expression
            if parse_expression is not None:
                entities.append(parse_expression(field))

        # the chem_name has to be parsed last in order to avoid a conflict with other elements of the model
        entities.append(chem_name)

        # Any of the entities, optionally skipping ahead to the next one, one or more times
        any_entity = create_entities_list(entities)
        return OneOrMore(any_entity + Optional(SkipTo(any_entity)))('root_phrase')