Source code for chemdataextractor.parse.auto

# -*- coding: utf-8 -*-
"""
Parser for automatic parsing, without user-written parsing rules.
Mainly used for tables.

Models must be constructed in a certain way for them to work optimally with autoparsers. Namely, they should have:

- A specifier field with an associated parse expression (Optional, only required if autoparsers are desired). These parse expressions will be updated automatically using forward-looking Interdependency Resolution if the updatable flag is set to True.
- These specifiers should also have required set to True so that spurious matches are not found.
- If applicable, a compound entity, named compound.

Any parse_expressions set in the model should have an added action to ensure that the results are a single word. An example would be to call add_action(join) on each parse expression.

.. codeauthor:: Taketomo Isazawa <[email protected]>

.. codeauthor:: Juraj Mavračić <[email protected]>
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import six
import copy

from .cem import cem, chemical_label, lenient_chemical_label
from .actions import merge, join
from .elements import W, I, R, T, Optional, Any, OneOrMore, Not, ZeroOrMore, Group, SkipTo, Or, NoMatch
from ..utils import first
from .quantity import magnitudes_dict, value_element, extract_units, lbrct, rbrct
from .base import BaseSentenceParser, BaseParser, BaseTableParser

from lxml.builder import E
import xml.etree.ElementTree as etree

log = logging.getLogger(__name__)


[docs]def construct_unit_element(dimensions): """ Construct an element for detecting units for the dimensions given. Any magnitude modifiers (e.g. kilo) will be automatically handled. :param Dimension dimensions: The dimensions that the element produced will look for. :returns: An Element to look for units of given dimensions. If None or Dimensionless are passed in, returns None. :rtype: BaseParserElement or None """ if not dimensions or not dimensions.units_dict: return None # Handle all the magnitudes units_regex = '^((' for element in magnitudes_dict.keys(): units_regex += '(' + element.pattern + ')|' units_regex = units_regex[:-1] units_regex += ')?' units_regex += '(' # Case where we have a token that's just brackets units_regex += r'((\(|\[))|((\)|\]))|\-|' # Handle all the units for element in dimensions.units_dict: units_regex += '(' + element.pattern + ')|' units_regex += r'(\/)' # Case when we have powers, or one or more units units_regex2 = units_regex + r'|([\+\-–−]?\d+(\.\d+)?)' units_regex2 += '))+$' units_regex += '))+' units_regex += (units_regex2[1:-2] + '*') units_regex += '$' return (R(pattern=units_regex) + ZeroOrMore(R(pattern=units_regex) | R(pattern=units_regex2))).add_action(_clean_units_results)
def _clean_units_results(tokens, start, result): """ Action to remove unmatched brackets """ # Configure the following for all bracket types that should be matched brackets = {"{": 0, "(": 0, "[": 0} bracket_matches = {"}": "{", ")": "(", "]": "["} texts = [] if len(result) > 0: # Get text for each element for e in result: for child in e.iter(): if child.text is not None: texts.append(child.text) # Check whether brackets are matched; each opening bracket increments the count # stored in brackets by one, and each closing one decrements the count. for text in texts: if text in brackets: brackets[text] += 1 elif text in bracket_matches: brackets[bracket_matches[text]] -= 1 # For each type of bracket, we clean the unmatched brackets cleaned_texts = copy.copy(texts) for bracket_type, value in brackets.items(): # Strip opening brackets if value > 0: count = 0 new_cleaned_texts = [] for el in cleaned_texts: if el == bracket_type and count < value: count += 1 else: new_cleaned_texts.append(el) cleaned_texts = new_cleaned_texts # Strip closing brackets elif value < 0: count = 0 reversed_texts = reversed(cleaned_texts) new_cleaned_texts = [] bracket = None for closing_bracket, opening_bracket in bracket_matches.items(): if opening_bracket == bracket_type: bracket = closing_bracket break for el in reversed_texts: if el == bracket and count < abs(value): count += 1 else: new_cleaned_texts.append(el) cleaned_texts = list(reversed(new_cleaned_texts)) new_text = ''.join(cleaned_texts) if new_text[-1] in ["-", "–", "−"]: new_text = new_text[:-1] return [E(result[0].tag, new_text)]
[docs]def construct_category_element(category_dict): """ Construct an element for detecting categories. :param Category category: The Category to look for. :rtype: BaseParserElement or None """ category_regex = '^' if not category_dict: return None # Handle all the units for element in category_dict: category_regex += '(' + element.pattern + ')|' category_regex = category_regex[:-1] category_regex += '$' return (R(pattern=category_regex))('raw_value').add_action(merge)
[docs]def match_dimensions_of(model): """ Produces a function that checks whether the given results of parsing match the dimensions of the model provided. :param QuantityModel model: The model with which to check dimensions. :returns: A function which will return True if the results of parsing match the model's dimensions, False if not. :rtype: function(tuple(list(Element), int) -> bool) """ def check_match(result): try: extract_units(result[0].text, model.dimensions, strict=True) return True except TypeError as e: log.debug(e) return False return check_match
[docs]def create_entities_list(entities): """ For a list of Base parser entities, creates an entity of structure. For example, with 4 entities in the list, the output is:: (entities[0] | entities[1] | entities[2] | entities[3]) :param entities: BaseParserElement type objects :return: BaseParserElement type object """ result = entities[0] for entity in entities[1:]: result = (result | entity) return result
[docs]class BaseAutoParser(BaseParser): model = None _specifier = None _root_phrase = None
[docs] def __init__(self): super(BaseAutoParser, self).__init__() self._trigger_property = None
[docs] def interpret(self, results, start, end): if results is None: return if not isinstance(results, list): results = [results] for result in results: property_entities = {} if hasattr(self.model, 'dimensions') and not self.model.dimensions: # the specific entities of a DimensionlessModel are retrieved explicitly and packed into a dictionary raw_value = first(self._get_data_for_field(result, "raw_value", True)) log.debug(raw_value) if not raw_value and self.model.fields['raw_value'].required and not self.model.fields['raw_value'].contextual: requirements = False property_entities.update({"raw_value": raw_value}) elif hasattr(self.model, 'dimensions') and self.model.dimensions: # the specific entities of a QuantityModel are retrieved explicitly and packed into a dictionary # print(etree.tostring(result)) raw_value = first(self._get_data_for_field(result, "raw_value", True)) raw_units = first(self._get_data_for_field(result, "raw_units", True)) property_entities.update({"raw_value": raw_value, "raw_units": raw_units}) for field_name, field in six.iteritems(self.model.fields): if field_name not in ['raw_value', 'raw_units', 'value', 'units', 'error']: try: data = self._get_data(field_name, field, result) if data is not None: property_entities.update(data) # if field is required, but empty, the requirements have not been met except TypeError as e: log.debug(self.model) log.debug(e) model_instance = None if property_entities.keys(): model_instance = self.model(**property_entities) if model_instance and model_instance.noncontextual_required_fulfilled: # records the parser that was used to generate this record, can be used for evaluation model_instance.record_method = self.__class__.__name__ yield model_instance
def _get_data(self, field_name, field, result, for_list=False): if hasattr(field, 'model_class'): if for_list: field_results = self._get_data_for_field(result, field_name) else: field_results = [first(self._get_data_for_field(result, field_name))] field_objects = [] for field_result in field_results: if field_result is None and field.required and not field.contextual and field.requiredness == 1.0: raise TypeError('Could not find element for ' + str(field_name)) elif field_result is None: continue field_data = {} for subfield_name, subfield in six.iteritems(field.model_class.fields): data = self._get_data(subfield_name, subfield, field_result, for_list=False) if data: field_data.update(data) field_object = None if field_data.keys(): field_object = field.model_class(**field_data) if field_object is not None: field_objects.append(field_object) log.debug('Created for' + field_name) log.debug(field_object) if not for_list and field_objects: field_objects = field_objects[0] if not field_objects: return None return {field_name: field_objects} elif hasattr(field, 'field'): # Case that we have listtype field = field.field field_data = self._get_data(field_name, field, result, for_list=True) if not field_data or not field_data[field_name]: return None elif not field_data and field.required and not field.contextual: raise TypeError('Could not find element for ' + str(field_name)) return {field_name: field_data[field_name]} else: if for_list: field_result = self._get_data_for_field(result, field_name, True) else: field_result = first(self._get_data_for_field(result, field_name, True)) if field_result is None or field_result == []: if field.required and not field.contextual and field.requiredness == 1.0: raise TypeError('Could not find element for ' + str(field_name)) return None return {field_name: field_result} def _get_data_for_field(self, result, field_name, get_text=False): if get_text: field_name = field_name + "/text()" strict_result = result.xpath("./" + field_name) if strict_result is not None and len(strict_result): return strict_result else: return result.xpath("//" + field_name)
[docs]class AutoSentenceParser(BaseAutoParser, BaseSentenceParser):
[docs] def __init__(self, lenient=False, chem_name=(cem | chemical_label), activate_to_range=False): super(AutoSentenceParser, self).__init__() self.lenient = lenient self.chem_name = chem_name self.activate_to_range = activate_to_range
@property def trigger_phrase(self): # Generalised case of trigger_phrase. We go through the fields of the model and # try to find one that is both required and not contextual, and remember the name # of that field so that the trigger_phrase will be that parse expression next time it's called # If none of these are found, trigger_property is set to False, and None is returned. if self._trigger_property is False: return None elif self._trigger_property is not None: return self.model.fields[self._trigger_property].parse_expression else: for field_name, field in six.iteritems(self.model.fields): if field.required and field.requiredness == 1.0 and not field.contextual: self._trigger_property = field_name return self.model.fields[self._trigger_property].parse_expression if self._trigger_property is None: self._trigger_property = False return None @property def root(self): # is always found, our models currently rely on the compound chem_name = self.chem_name try: compound_model = self.model.compound.model_class labels = Group(compound_model.labels.parse_expression('labels'))('compound') except AttributeError: labels = NoMatch() entities = [labels] if hasattr(self.model, 'dimensions') and not self.model.dimensions: # the mandatory elements of Dimensionless model are grouped into a entities list specifier = self.model.specifier.parse_expression('specifier') value_phrase = value_element() entities.append(specifier) entities.append(value_phrase) elif hasattr(self.model, 'dimensions') and self.model.dimensions: # the mandatory elements of Quantity model are grouped into a entities list # print(self.model, self.model.dimensions) unit_element = Group( construct_unit_element(self.model.dimensions).with_condition(match_dimensions_of(self.model))('raw_units')) specifier = self.model.specifier.parse_expression('specifier') if self.lenient: value_phrase = (value_element(unit_element, activate_to_range=self.activate_to_range) | value_element(activate_to_range=self.activate_to_range)) else: value_phrase = value_element(unit_element, activate_to_range=self.activate_to_range) entities.append(specifier) entities.append(value_phrase) elif hasattr(self.model, 'specifier') and self.model.specifier: # now we are parsing an element that has no value but some custom string # therefore, there will be no matching interpret function, all entities are custom except for the specifier specifier = self.model.specifier.parse_expression('specifier') entities.append(specifier) # the optional, user-defined, entities of the model are added, they are tagged with the name of the field for field in self.model.fields: if field not in ['raw_value', 'raw_units', 'value', 'units', 'error', 'specifier']: if self.model.__getattribute__(self.model, field).parse_expression is not None: entities.append(self.model.__getattribute__(self.model, field).parse_expression(field)) # the chem_name has to be parsed last in order to avoid a conflict with other elements of the model entities.append(chem_name) # logic for finding all the elements in any order combined_entities = create_entities_list(entities) root_phrase = OneOrMore(combined_entities + Optional(SkipTo(combined_entities)))('root_phrase') return root_phrase
[docs]class AutoTableParser(BaseAutoParser, BaseTableParser): """ Additions for automated parsing of tables"""
[docs] def __init__(self, chem_name=(cem | chemical_label | lenient_chemical_label)): super(AutoTableParser, self).__init__() self.chem_name = chem_name
@property def root(self): # is always found, our models currently rely on the compound chem_name = self.chem_name try: compound_model = self.model.compound.model_class labels = Group(compound_model.labels.parse_expression('labels'))('compound') except AttributeError: labels = NoMatch() entities = [labels] no_value_element = W('NoValue')('raw_value') if hasattr(self.model, 'dimensions') and not self.model.dimensions: # the mandatory elements of Dimensionless model are grouped into a entities list specifier = self.model.specifier.parse_expression('specifier') value_phrase = value_element() | no_value_element entities.append(specifier) entities.append(value_phrase) elif hasattr(self.model, 'dimensions') and self.model.dimensions: # the mandatory elements of Quantity model are grouped into a entities list # print(self.model, self.model.dimensions) unit_element = Group( construct_unit_element(self.model.dimensions).with_condition(match_dimensions_of(self.model))('raw_units')) specifier = self.model.specifier.parse_expression('specifier') + Optional(W('/')) + Optional( unit_element) value_phrase = ((value_element() | no_value_element) + Optional(unit_element)) entities.append(specifier) entities.append(value_phrase) elif hasattr(self.model, 'specifier') and self.model.specifier: # now we are parsing an element that has no value but some custom string # therefore, there will be no matching interpret function, all entities are custom except for the specifier specifier = self.model.specifier.parse_expression('specifier') entities.append(specifier) # the optional, user-defined, entities of the model are added, they are tagged with the name of the field for field in self.model.fields: if field not in ['raw_value', 'raw_units', 'value', 'units', 'error', 'specifier']: if self.model.__getattribute__(self.model, field).parse_expression is not None: entities.append(self.model.__getattribute__(self.model, field).parse_expression(field)) # the chem_name has to be parsed last in order to avoid a conflict with other elements of the model entities.append(chem_name) # logic for finding all the elements in any order combined_entities = create_entities_list(entities) root_phrase = OneOrMore(combined_entities + Optional(SkipTo(combined_entities)))('root_phrase') return root_phrase