# Source code for chemdataextractor.parse.auto

# -*- coding: utf-8 -*-
"""
Parser for automatic parsing, without user-written parsing rules.
Mainly used for tables.

Models must be constructed in a certain way for them to work optimally with autoparsers. Namely, they should have:

- A specifier field with an associated parse expression (Optional, only required if autoparsers are desired). These parse expressions will be updated automatically using forward-looking Interdependency Resolution if the updatable flag is set to True.
- These specifiers should also have required set to True so that spurious matches are not found.
- If applicable, a compound entity, named compound.

Any parse_expressions set in the model should have an added action to ensure that the results are a single word. An example would be to call add_action(join) on each parse expression.

.. codeauthor:: Taketomo Isazawa <[email protected]>

.. codeauthor:: Juraj Mavračić <[email protected]>
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import six
import copy

from .cem import cem, chemical_label, lenient_chemical_label
from .actions import merge, join
from .elements import W, I, R, T, Optional, Any, OneOrMore, Not, ZeroOrMore, Group, SkipTo, Or, NoMatch
from ..utils import first
from .quantity import magnitudes_dict, value_element, extract_units, lbrct, rbrct
from .base import BaseSentenceParser, BaseParser, BaseTableParser

from lxml.builder import E
import xml.etree.ElementTree as etree

log = logging.getLogger(__name__)


def construct_unit_element(dimensions):
    """
    Construct an element for detecting units for the dimensions given.
    Any magnitude modifiers (e.g. kilo) will be automatically handled.

    Two regular expressions are built from the patterns of ``magnitudes_dict`` and
    ``dimensions.units_dict``:

    - a "unit" pattern: a token that starts with one or more chunks of
      (optional magnitude + unit, bracket, dash or slash), optionally followed by chunks
      that may also be numbers (powers);
    - a "unit or power" pattern: a token made only of such chunks, where a chunk may
      also be a (signed, possibly decimal) number.

    The returned element requires the first token to match the "unit" pattern and lets
    any number of following tokens match either pattern. Unmatched brackets are removed
    from the result by :func:`_clean_units_results`.

    :param Dimension dimensions: The dimensions that the element produced will look for.
    :returns: An Element to look for units of given dimensions. If ``dimensions`` is falsy or
        has an empty ``units_dict``, returns None.
    :rtype: BaseParserElement or None
    """
    if not dimensions or not dimensions.units_dict:
        return None

    # Alternation of all the magnitude prefixes, each in its own group
    magnitude_part = '|'.join('(' + magnitude.pattern + ')' for magnitude in magnitudes_dict)
    # Alternation of all the units; every entry keeps a trailing '|' because the
    # slash alternative is appended right after it
    unit_part = ''.join('(' + unit.pattern + ')|' for unit in dimensions.units_dict)

    # Common prefix of both patterns: optional magnitude, then a token that is just
    # a bracket, a dash, one of the units, or a slash
    core = '^((' + magnitude_part + ')?(' + r'((\(|\[))|((\)|\]))|\-|' + unit_part + r'(\/)'

    # Case when we have powers, or one or more units
    units_regex2 = core + r'|([\+\-–−]?\d+(\.\d+)?)' + '))+$'
    # The unit pattern is the unit-only group repeated, followed by the
    # "unit or power" group (without its '^' anchor and trailing '+$') repeated zero or more times
    units_regex = core + '))+' + units_regex2[1:-2] + '*' + '$'

    return (R(pattern=units_regex) + ZeroOrMore(R(pattern=units_regex) | R(pattern=units_regex2))).add_action(_clean_units_results)


def _clean_units_results(tokens, start, result):
    """
    Action to remove unmatched brackets
    """
    # Configure the following for all bracket types that should be matched
    brackets = {"{": 0, "(": 0, "[": 0}
    bracket_matches = {"}": "{", ")": "(", "]": "["}

    texts = []
    if len(result) > 0:
        # Get text for each element
        for e in result:
            for child in e.iter():
                if child.text is not None:
                    texts.append(child.text)

        # Check whether brackets are matched; each opening bracket increments the count
        # stored in brackets by one, and each closing one decrements the count.
        for text in texts:
            if text in brackets:
                brackets[text] += 1
            elif text in bracket_matches:
                brackets[bracket_matches[text]] -= 1

        # For each type of bracket, we clean the unmatched brackets
        cleaned_texts = copy.copy(texts)
        for bracket_type, value in brackets.items():

            # Strip opening brackets
            if value > 0:
                count = 0
                new_cleaned_texts = []
                for el in cleaned_texts:
                    if el == bracket_type and count < value:
                        count += 1
                    else:
                        new_cleaned_texts.append(el)
                cleaned_texts = new_cleaned_texts

            # Strip closing brackets
            elif value < 0:
                count = 0
                reversed_texts = reversed(cleaned_texts)
                new_cleaned_texts = []
                bracket = None

                for closing_bracket, opening_bracket in bracket_matches.items():
                    if opening_bracket == bracket_type:
                        bracket = closing_bracket
                        break

                for el in reversed_texts:
                    if el == bracket and count < abs(value):
                        count += 1
                    else:
                        new_cleaned_texts.append(el)
                cleaned_texts = list(reversed(new_cleaned_texts))

        new_text = ''.join(cleaned_texts)
        if new_text[-1] in ["-", "–", "−"]:
            new_text = new_text[:-1]

        return [E(result[0].tag, new_text)]


def construct_category_element(category_dict):
    """
    Construct an element for detecting categories.

    The patterns of all keys in ``category_dict`` are combined into one alternation that is
    anchored as a whole, i.e. ``^(?:(a)|(b))$``. Without the enclosing group, ``^`` would bind
    only to the first alternative and ``$`` only to the last one. The enclosing group is
    non-capturing so the numbering of the per-category groups is unaffected.

    :param category_dict: A mapping (or other iterable) whose items expose a ``pattern``
        attribute, one per category to look for.
    :returns: An Element tagged ``raw_value`` that matches any of the categories, or None if
        ``category_dict`` is empty or None.
    :rtype: BaseParserElement or None
    """
    if not category_dict:
        return None
    # Handle all the categories, each in its own capturing group
    alternatives = '|'.join('(' + element.pattern + ')' for element in category_dict)
    category_regex = '^(?:' + alternatives + ')$'
    return (R(pattern=category_regex))('raw_value').add_action(merge)


def match_dimensions_of(model):
    """
    Produces a function that checks whether the given results of parsing match the
    dimensions of the model provided.

    The returned function passes the text of the first parsed element (``result[0].text``)
    and ``model.dimensions`` to ``extract_units`` with ``strict=True``. A ``TypeError`` raised
    by that call is logged at debug level and reported as a mismatch; any other exception
    propagates.

    :param QuantityModel model: The model with which to check dimensions.
    :returns: A function which will return True if the results of parsing match the model's dimensions, False if not.
    :rtype: function(list(Element)) -> bool
    """
    def check_match(result):
        try:
            extract_units(result[0].text, model.dimensions, strict=True)
        except TypeError as e:
            log.debug(e)
            return False
        return True
    return check_match


def create_entities_list(entities):
    """
    For a list of Base parser entities, creates an entity of structure. For example, with 4 entities in the list, the output is::

        (entities[0] | entities[1] | entities[2] | entities[3])

    The entities are combined left to right with the ``|`` operator; a single entity is
    returned unchanged.

    :param entities: Non-empty sequence of BaseParserElement type objects
    :return: BaseParserElement type object
    :raises IndexError: If ``entities`` is empty.
    """
    if not entities:
        # Same exception type as indexing an empty list, but with an explicit reason
        raise IndexError('create_entities_list requires at least one entity')
    combined = entities[0]
    for entity in entities[1:]:
        combined = combined | entity
    return combined


class BaseAutoParser(BaseParser):
    """
    Base class for parsers that build model instances from parse results by looking up
    elements named after the fields of ``model``.
    """
    # The model class whose fields drive the interpretation of parse results
    model = None
    _specifier = None
    _root_phrase = None

    def __init__(self):
        super(BaseAutoParser, self).__init__()
        # Name of the field used as trigger phrase; None until determined,
        # False once it is known that no suitable field exists (see subclasses)
        self._trigger_property = None

    def interpret(self, results, start, end):
        """
        Turn parse results into instances of ``self.model``.

        For every result, the data for each model field is looked up by field name and
        collected into keyword arguments for the model. A model instance is yielded only
        if any data was found and its ``noncontextual_required_fulfilled`` is truthy.

        :param results: A single result element or a list of them; None yields nothing.
        :param start: Start index of the match (unused).
        :param end: End index of the match (unused).
        :returns: A generator of model instances, each with ``record_method`` set to the
            name of the parser class that produced it.
        """
        if results is None:
            return

        if not isinstance(results, list):
            results = [results]

        # Fields that are filled explicitly below (or not filled by this parser at all)
        explicit_fields = ['raw_value', 'raw_units', 'value', 'units', 'error']

        for result in results:
            property_entities = {}

            if hasattr(self.model, 'dimensions'):
                raw_value = first(self._get_data_for_field(result, "raw_value", True))
                if self.model.dimensions:
                    # the specific entities of a QuantityModel are retrieved explicitly and packed into a dictionary
                    raw_units = first(self._get_data_for_field(result, "raw_units", True))
                    property_entities.update({"raw_value": raw_value,
                                              "raw_units": raw_units})
                else:
                    # the specific entities of a DimensionlessModel are retrieved explicitly and packed into a dictionary
                    # A missing raw_value is not rejected here; the check of
                    # noncontextual_required_fulfilled below decides whether the record is kept.
                    log.debug(raw_value)
                    property_entities.update({"raw_value": raw_value})

            for field_name, field in six.iteritems(self.model.fields):
                if field_name in explicit_fields:
                    continue
                try:
                    data = self._get_data(field_name, field, result)
                except TypeError as e:
                    # a required field was not found; the field is left out of the record
                    log.debug(self.model)
                    log.debug(e)
                else:
                    if data is not None:
                        property_entities.update(data)

            model_instance = None
            if property_entities:
                model_instance = self.model(**property_entities)

            if model_instance and model_instance.noncontextual_required_fulfilled:
                # records the parser that was used to generate this record, can be used for evaluation
                model_instance.record_method = self.__class__.__name__
                yield model_instance

    def _get_data(self, field_name, field, result, for_list=False):
        """
        Extract the data for one field from a result element.

        Three kinds of field are distinguished:

        - fields with a ``model_class`` attribute: a nested model is built recursively
          from the matching element(s);
        - fields with a ``field`` attribute (list-like wrappers): the wrapped field is
          extracted with ``for_list=True`` so that all matches are returned;
        - any other field: the text of the matching element(s) is returned.

        :param str field_name: Name of the field, also used as the element name to look up.
        :param field: The field definition from the model.
        :param result: The element to search in.
        :param bool for_list: If True, return all matches instead of only the first.
        :returns: ``{field_name: data}`` or None if nothing was found.
        :raises TypeError: If nothing was found for a field that is required, not
            contextual and has ``requiredness == 1.0``.
        """
        if hasattr(field, 'model_class'):
            if for_list:
                field_results = self._get_data_for_field(result, field_name)
            else:
                field_results = [first(self._get_data_for_field(result, field_name))]
            field_objects = []
            for field_result in field_results:
                if field_result is None:
                    if field.required and not field.contextual and field.requiredness == 1.0:
                        raise TypeError('Could not find element for ' + str(field_name))
                    continue
                field_data = {}
                for subfield_name, subfield in six.iteritems(field.model_class.fields):
                    data = self._get_data(subfield_name, subfield, field_result, for_list=False)
                    if data:
                        field_data.update(data)
                field_object = None
                if field_data:
                    field_object = field.model_class(**field_data)
                if field_object is not None:
                    field_objects.append(field_object)
                log.debug('Created for' + field_name)
                log.debug(field_object)
            if not for_list and field_objects:
                field_objects = field_objects[0]
            if not field_objects:
                return None
            return {field_name: field_objects}
        elif hasattr(field, 'field'):
            # Case that we have listtype: extract the wrapped field, collecting all matches.
            # A missing required element is already reported by the recursive call
            # raising TypeError, so only the "nothing found" case is handled here.
            field_data = self._get_data(field_name, field.field, result, for_list=True)
            if not field_data or not field_data[field_name]:
                return None
            return {field_name: field_data[field_name]}
        else:
            if for_list:
                field_result = self._get_data_for_field(result, field_name, True)
            else:
                field_result = first(self._get_data_for_field(result, field_name, True))
            if field_result is None or field_result == []:
                if field.required and not field.contextual and field.requiredness == 1.0:
                    raise TypeError('Could not find element for ' + str(field_name))
                return None
            return {field_name: field_result}

    def _get_data_for_field(self, result, field_name, get_text=False):
        """
        Look up the elements (or their text) named ``field_name`` via XPath.

        Direct children of ``result`` are preferred; if there are none, the lookup falls
        back to the ``//`` form of the query.

        :param result: The element to query.
        :param str field_name: The element name to look for.
        :param bool get_text: If True, return the text nodes of the elements instead.
        :returns: The XPath query result.
        """
        if get_text:
            field_name = field_name + "/text()"
        strict_result = result.xpath("./" + field_name)
        if strict_result is not None and len(strict_result):
            return strict_result
        # NOTE(review): "//" is an absolute path, so this presumably searches the whole
        # tree that `result` belongs to, not only its descendants (".//") - confirm this is intended.
        return result.xpath("//" + field_name)


class AutoSentenceParser(BaseAutoParser, BaseSentenceParser):
    """Automatic parser for sentences, with a grammar derived from the fields of ``model``."""

    def __init__(self, lenient=False, chem_name=(cem | chemical_label), activate_to_range=False):
        """
        :param bool lenient: If True, values without units are also accepted for models with dimensions.
        :param chem_name: Parse expression used to find chemical names/labels.
        :param bool activate_to_range: Passed on to ``value_element`` as ``activate_to_range``.
        """
        super(AutoSentenceParser, self).__init__()
        self.lenient = lenient
        self.chem_name = chem_name
        self.activate_to_range = activate_to_range

    @property
    def trigger_phrase(self):
        """
        The parse expression of the first model field that is required (with
        ``requiredness == 1.0``) and not contextual, or None if there is no such field.

        The name of the chosen field is remembered in ``_trigger_property`` so that the
        search over the fields happens only once; ``False`` records that no field qualified.
        """
        if self._trigger_property is False:
            return None
        if self._trigger_property is None:
            for field_name, field in six.iteritems(self.model.fields):
                if field.required and field.requiredness == 1.0 and not field.contextual:
                    self._trigger_property = field_name
                    break
            else:
                self._trigger_property = False
                return None
        return self.model.fields[self._trigger_property].parse_expression

    @property
    def root(self):
        """
        The root parse expression: one or more of the model's entities in any order,
        each optionally followed by skipped text up to the next entity.
        """
        # is always found, our models currently rely on the compound
        chem_name = self.chem_name
        try:
            compound_model = self.model.compound.model_class
            labels = Group(compound_model.labels.parse_expression('labels'))('compound')
        except AttributeError:
            labels = NoMatch()
        entities = [labels]

        if hasattr(self.model, 'dimensions'):
            specifier = self.model.specifier.parse_expression('specifier')
            if self.model.dimensions:
                # the mandatory elements of Quantity model are grouped into a entities list
                unit_element = Group(
                    construct_unit_element(self.model.dimensions).with_condition(match_dimensions_of(self.model))('raw_units'))
                value_phrase = value_element(unit_element, activate_to_range=self.activate_to_range)
                if self.lenient:
                    # in lenient mode a value without units is accepted as a fallback
                    value_phrase = (value_phrase | value_element(activate_to_range=self.activate_to_range))
            else:
                # the mandatory elements of Dimensionless model are grouped into a entities list
                value_phrase = value_element()
            entities.append(specifier)
            entities.append(value_phrase)

        elif hasattr(self.model, 'specifier') and self.model.specifier:
            # now we are parsing an element that has no value but some custom string
            # therefore, there will be no matching interpret function, all entities are custom except for the specifier
            specifier = self.model.specifier.parse_expression('specifier')
            entities.append(specifier)

        # the optional, user-defined, entities of the model are added, they are tagged with the name of the field
        # The field objects are taken from the model's `fields` mapping, the same way
        # trigger_phrase and interpret access them.
        handled_fields = ['raw_value', 'raw_units', 'value', 'units', 'error', 'specifier']
        for field_name, field in six.iteritems(self.model.fields):
            if field_name in handled_fields:
                continue
            if field.parse_expression is not None:
                entities.append(field.parse_expression(field_name))

        # the chem_name has to be parsed last in order to avoid a conflict with other elements of the model
        entities.append(chem_name)

        # logic for finding all the elements in any order
        combined_entities = create_entities_list(entities)
        root_phrase = OneOrMore(combined_entities + Optional(SkipTo(combined_entities)))('root_phrase')
        return root_phrase


class AutoTableParser(BaseAutoParser, BaseTableParser):
    """ Additions for automated parsing of tables"""

    def __init__(self, chem_name=(cem | chemical_label | lenient_chemical_label)):
        """
        :param chem_name: Parse expression used to find chemical names/labels.
        """
        super(AutoTableParser, self).__init__()
        self.chem_name = chem_name

    @property
    def root(self):
        """
        The root parse expression: one or more of the model's entities in any order,
        each optionally followed by skipped text up to the next entity.

        In addition to regular values, the literal word ``NoValue`` is accepted as a raw value.
        """
        # is always found, our models currently rely on the compound
        chem_name = self.chem_name
        try:
            compound_model = self.model.compound.model_class
            labels = Group(compound_model.labels.parse_expression('labels'))('compound')
        except AttributeError:
            labels = NoMatch()
        entities = [labels]
        no_value_element = W('NoValue')('raw_value')

        if hasattr(self.model, 'dimensions'):
            if self.model.dimensions:
                # the mandatory elements of Quantity model are grouped into a entities list
                unit_element = Group(
                    construct_unit_element(self.model.dimensions).with_condition(match_dimensions_of(self.model))('raw_units'))
                # units may follow the specifier (optionally after a slash) or the value
                specifier = self.model.specifier.parse_expression('specifier') + Optional(W('/')) + Optional(
                    unit_element)
                value_phrase = ((value_element() | no_value_element) + Optional(unit_element))
            else:
                # the mandatory elements of Dimensionless model are grouped into a entities list
                specifier = self.model.specifier.parse_expression('specifier')
                value_phrase = value_element() | no_value_element
            entities.append(specifier)
            entities.append(value_phrase)

        elif hasattr(self.model, 'specifier') and self.model.specifier:
            # now we are parsing an element that has no value but some custom string
            # therefore, there will be no matching interpret function, all entities are custom except for the specifier
            specifier = self.model.specifier.parse_expression('specifier')
            entities.append(specifier)

        # the optional, user-defined, entities of the model are added, they are tagged with the name of the field
        # The field objects are taken from the model's `fields` mapping, the same way
        # interpret accesses them.
        handled_fields = ['raw_value', 'raw_units', 'value', 'units', 'error', 'specifier']
        for field_name, field in six.iteritems(self.model.fields):
            if field_name in handled_fields:
                continue
            if field.parse_expression is not None:
                entities.append(field.parse_expression(field_name))

        # the chem_name has to be parsed last in order to avoid a conflict with other elements of the model
        entities.append(chem_name)

        # logic for finding all the elements in any order
        combined_entities = create_entities_list(entities)
        root_phrase = OneOrMore(combined_entities + Optional(SkipTo(combined_entities)))('root_phrase')
        return root_phrase