Source code for chemdataextractor.parse.cem

# -*- coding: utf-8 -*-
"""
Chemical entity mention parser elements.
.. codeauthor:: Matt Swain ([email protected])
.. codeauthor:: Callum Court ([email protected])

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import abstractproperty, abstractmethod
import logging
import re
from lxml import etree

from .actions import join, fix_whitespace, merge
from .common import roman_numeral, cc, nnp, hyph, nns, nn, cd, ls, optdelim, rbrct, lbrct, sym, jj, hyphen, quote, \
    dt, delim
from .base import BaseSentenceParser, BaseTableParser
from .elements import I, R, W, T, ZeroOrMore, Optional, Not, Group, End, Start, OneOrMore, Any, SkipTo, Every
from .cem_factory import _CemFactory

log = logging.getLogger(__name__)


# The below are all just for backwards compatibility - may be worth removing at some point...
# See the cem_factory file for more information on what each of these mean
#
# Every name in this section is a plain module-level alias of the same-named
# attribute on the default factory instance; nothing is computed here.
default_cem_factory = _CemFactory.with_default_configuration()

icm = default_cem_factory.icm
bcm = default_cem_factory.bcm
joining_characters = default_cem_factory.joining_characters
cm = default_cem_factory.cm

alphanumeric = default_cem_factory.alphanumeric
lenient_alphanumeric = default_cem_factory.lenient_alphanumeric

numeric = default_cem_factory.numeric
lenient_numeric = default_cem_factory.lenient_numeric

letter_number = default_cem_factory.letter_number
lenient_letter_number = default_cem_factory.lenient_letter_number

cm_blocklist = default_cem_factory.cm_blocklist

exclude_prefix = default_cem_factory.exclude_prefix

comma = default_cem_factory.comma
colon = default_cem_factory.colon

include_prefix = default_cem_factory.include_prefix

label_type = default_cem_factory.label_type

synthesis_of = default_cem_factory.synthesis_of

to_give = default_cem_factory.to_give

label_blocklist = default_cem_factory.label_blocklist

prefixed_label = default_cem_factory.prefixed_label

strict_chemical_label = default_cem_factory.strict_chemical_label

lenient_chemical_label = default_cem_factory.lenient_chemical_label

very_lenient_chemical_label = default_cem_factory.very_lenient_chemical_label

chemical_label = default_cem_factory.chemical_label

chemical_label_phrase1 = default_cem_factory.chemical_label_phrase1
chemical_label_phrase2 = default_cem_factory.chemical_label_phrase2
chemical_label_phrase3 = default_cem_factory.chemical_label_phrase3

doped_chemical_identifier = default_cem_factory.doped_chemical_identifier
doping_value = default_cem_factory.doping_value
doping_range = default_cem_factory.doping_range


doping_label_1 = default_cem_factory.doping_label_1
doping_label_2 = default_cem_factory.doping_label_2

doped_chemical_label = default_cem_factory.doped_chemical_label
chemical_label_phrase = default_cem_factory.chemical_label_phrase

informal_chemical_symbol = default_cem_factory.informal_chemical_symbol

metals = default_cem_factory.metals
transition_metals = default_cem_factory.transition_metals
lanthanides = default_cem_factory.lanthanides
ion_symbol = default_cem_factory.ion_symbol
other_symbol = default_cem_factory.other_symbol

informal_values = default_cem_factory.informal_values

informal_chemical_label_1 = default_cem_factory.informal_chemical_label_1
informal_chemical_label_2 = default_cem_factory.informal_chemical_label_2

informal_chemical_label = default_cem_factory.informal_chemical_label

element_name = default_cem_factory.element_name

element_symbol = default_cem_factory.element_symbol

registry_number = default_cem_factory.registry_number

amino_acid = default_cem_factory.amino_acid

amino_acid_name = default_cem_factory.amino_acid_name

formula = default_cem_factory.formula

solvent_formula = default_cem_factory.solvent_formula

nmr_solvent = default_cem_factory.nmr_solvent

other_solvent = default_cem_factory.other_solvent

solvent_name_options = default_cem_factory.solvent_name_options
solvent_name = default_cem_factory.solvent_name
chemical_name_blocklist = default_cem_factory.chemical_name_blocklist
proper_chemical_name_options = default_cem_factory.proper_chemical_name_options

mixture_component = default_cem_factory.mixture_component
mixture_phrase = default_cem_factory.mixture_phrase

chemical_name_options = default_cem_factory.chemical_name_options

chemical_name = default_cem_factory.chemical_name

likely_abbreviation = default_cem_factory.likely_abbreviation

lenient_name = default_cem_factory.lenient_name

label_name_cem = default_cem_factory.label_name_cem
labelled_as = default_cem_factory.labelled_as
optquote = default_cem_factory.optquote

name_with_optional_bracketed_label = default_cem_factory.name_with_optional_bracketed_label

label_before_name = default_cem_factory.label_before_name
lenient_name_with_bracketed_label = default_cem_factory.lenient_name_with_bracketed_label

name_with_comma_within = default_cem_factory.name_with_comma_within

name_with_doped_label = default_cem_factory.name_with_doped_label

name_with_informal_label = default_cem_factory.name_with_informal_label

cem = default_cem_factory.cem

cem_phrase = default_cem_factory.cem_phrase

r_equals = default_cem_factory.r_equals
of_table = default_cem_factory.of_table

bracketed_after_name = default_cem_factory.bracketed_after_name
comma_after_name = default_cem_factory.comma_after_name

compound_heading_ending = default_cem_factory.compound_heading_ending

# Section number, to allow at the start of a heading
section_no = default_cem_factory.section_no

compound_heading_style1 = default_cem_factory.compound_heading_style1
compound_heading_style2 = default_cem_factory.compound_heading_style2
compound_heading_style3 = default_cem_factory.compound_heading_style3
compound_heading_style4 = default_cem_factory.compound_heading_style4
compound_heading_style5 = default_cem_factory.compound_heading_style5
compound_heading_style6 = default_cem_factory.compound_heading_style6
# TODO: Capture label type in output

compound_heading_phrase = default_cem_factory.compound_heading_phrase

names_only = default_cem_factory.names_only

labels_only = default_cem_factory.labels_only

roles_only = default_cem_factory.roles_only


def standardize_role(role):
    """Convert role text into standardized form.

    The text is lower-cased. If the lower-cased text contains any of the
    product-indicating keywords as a substring, the canonical role
    ``'product'`` is returned; otherwise the lower-cased text itself is
    returned.

    :param role: Role text to normalise.
    :returns: ``'product'`` or the lower-cased input.
    """
    lowered = role.lower()
    # Substring match, so e.g. 'byproducts' also maps to 'product'.
    for keyword in ('synthesis', 'give', 'yield', 'afford', 'product', 'preparation of'):
        if keyword in lowered:
            return 'product'
    return lowered
# TODO jm2111, Problems here! The parsers don't have a parse method anymore. Ruins parsing of captions.
class CompoundParser(BaseSentenceParser):
    """Chemical name possibly with an associated label."""

    # NOTE(review): neither attribute is read or written by this class's own
    # methods; presumably kept for subclasses / compatibility - confirm.
    _label = None
    _root_phrase = None

    @property
    def root(self):
        """Build the ``cem_phrase`` grammar from the model's current label expression.

        The label-dependent phrases are rebuilt on every access using
        ``self.model.labels.parse_expression``. The informal-label,
        doped-label and comma-within phrases are taken unchanged from the
        module-level definitions.
        """
        label_expr = self.model.labels.parse_expression('labels')

        # A label directly followed by a chemical name.
        labelled_name = (label_expr + optdelim + chemical_name)('compound')

        # A label type, then one or more "label name" pairs joined by a conjunction.
        more_labelled_names = ZeroOrMore(optdelim + cc + optdelim + labelled_name)
        label_first = (
            Optional(synthesis_of | to_give)
            + label_type
            + optdelim
            + labelled_name
            + more_labelled_names
        )

        # A chemical name, optionally followed by a bracketed label.
        bracketed_label = lbrct + Optional(labelled_as + optquote) + label_expr + optquote + rbrct
        name_then_optional_label = (
            Optional(synthesis_of | to_give) + chemical_name + Optional(bracketed_label)
        )('compound')

        # Very lenient name and label match, with format like "name (Compound 3)"
        lenient_name_then_label = (
            Start()
            + Optional(synthesis_of)
            + lenient_name
            + lbrct
            + label_type.hide()
            + label_expr
            + rbrct
        )('compound')

        # Model-specific variants of name_with_doped_label and
        # name_with_informal_label are disabled in this parser; the
        # module-level versions are used in the alternation below.
        alternatives = name_with_informal_label
        for candidate in (
            name_with_doped_label,
            lenient_name_then_label,
            label_first,
            name_with_comma_within,
            name_then_optional_label,
        ):
            # Left-to-right chaining keeps the original order of alternatives.
            alternatives = alternatives | candidate
        return Group(alternatives)('cem_phrase')

    def interpret(self, result, start, end):
        """Yield one model instance per ``compound`` element found in ``result``.

        ``start`` and ``end`` are accepted but not used here. Each yielded
        record has ``record_method`` set to this parser's class name.
        """
        # TODO: Parse label_type into label model object
        for compound_el in result.xpath('./compound'):
            found_names = compound_el.xpath('./names/text()')
            found_labels = compound_el.xpath('./labels/text()')
            found_roles = [
                standardize_role(raw_role)
                for raw_role in compound_el.xpath('./roles/text()')
            ]
            record = self.model(names=found_names, labels=found_labels, roles=found_roles)
            record.record_method = self.__class__.__name__
            yield record
class ChemicalLabelParser(BaseSentenceParser):
    """Chemical label occurrences with no associated name."""

    # Label expression the cached phrase was built from, and the cached phrase.
    _label = None
    _root_phrase = None

    @property
    def root(self):
        """Return the label phrase, rebuilding it when the label expression object changes.

        The cached phrase is reused only when the model's label expression is
        the very same object (identity comparison) as on the previous build.
        """
        current_label = self.model.labels.parse_expression('labels')
        if self._label is not current_label:
            self._root_phrase = (
                chemical_label_phrase | Group(current_label)('chemical_label_phrase')
            )
            self._label = current_label
        return self._root_phrase

    def interpret(self, result, start, end):
        """Yield one model instance per label text found in ``result``.

        Every yielded record receives the same standardized ``roles`` list.
        ``start`` and ``end`` are accepted but not used here.
        """
        shared_roles = [
            standardize_role(raw_role) for raw_role in result.xpath('./roles/text()')
        ]
        for label_text in result.xpath('./labels/text()'):
            yield self.model(labels=[label_text], roles=shared_roles)
class CompoundHeadingParser(BaseSentenceParser):
    """Better matching of abbreviated names in dedicated compound headings."""

    root = compound_heading_phrase
    parse_full_sentence = True

    def interpret(self, result, start, end):
        """Yield model instances for the names and labels found in a heading.

        With at most one label, a single record carrying all names, the
        labels and the roles is yielded. With several labels, one record is
        yielded per label and then one per name, each with the shared roles.
        ``start`` and ``end`` are accepted but not used here.
        """
        heading_roles = [
            standardize_role(raw_role) for raw_role in result.xpath('./roles/text()')
        ]
        heading_labels = result.xpath('./labels/text()')

        if len(heading_labels) <= 1:
            yield self.model(
                names=result.xpath('./names/text()'),
                labels=heading_labels,
                roles=heading_roles
            )
            return

        # Several labels: names and labels are emitted as separate records.
        for label_text in heading_labels:
            yield self.model(labels=[label_text], roles=heading_roles)
        for name_text in result.xpath('./names/text()'):
            yield self.model(names=[name_text], roles=heading_roles)
class CompoundTableParser(BaseTableParser):
    """Table parser that extracts compound names and labels alongside a specifier.

    The grammar is produced by the ``root`` property from the model's
    current fields, so it is rebuilt on every access.
    """

    # Kept as a class attribute for backwards compatibility; the ``root``
    # property below builds its own, model-aware alternation instead.
    entities = (cem | chemical_label | lenient_chemical_label) | ((I('Formula') | I('Compound')).add_action(join))('specifier')

    @property
    def root(self):
        """Build the ``root_phrase`` grammar from the model's fields.

        Alternatives are tried in this order: the model's label expression,
        the specifier keywords, each remaining model field that has a parse
        expression (tagged with the field name), and finally the generic
        chemical name/label expressions.

        Side effects: stores the built phrase on ``self._root_phrase`` and
        copies ``self.model.specifier`` to ``self._specifier``.
        """
        model = self.model

        # is always found, our models currently rely on the compound
        chem_name = (cem | chemical_label | lenient_chemical_label)
        labels = model.labels.parse_expression('labels')
        specifier = (I('Formula') | I('Compound') | I('Alloy')).add_action(join)('specifier')
        entities = [labels, specifier]

        # the optional, user-defined, entities of the model are added, they are tagged with the name of the field
        for field in model.fields:
            if field in ('raw_value', 'raw_units', 'value', 'units', 'error', 'specifier'):
                continue
            # The explicit __getattribute__ call is deliberately preserved from
            # the original lookup; it is performed once and reused.
            field_expression = model.__getattribute__(model, field).parse_expression
            if field_expression is not None:
                entities.append(field_expression(field))

        # the chem_name has to be parsed last in order to avoid a conflict with other elements of the model
        entities.append(chem_name)

        # logic for finding all the elements in any order
        combined_entities = entities[0]
        for entity in entities[1:]:
            combined_entities = (combined_entities | entity)

        root_phrase = OneOrMore(combined_entities + Optional(SkipTo(combined_entities)))('root_phrase')
        self._root_phrase = root_phrase
        self._specifier = model.specifier
        return root_phrase

    def interpret(self, result, start, end):
        """Yield a compound record when ``result`` has a specifier and a name or label.

        Nothing is yielded when no specifier text is present, or when neither
        a name nor a label was found. ``start`` and ``end`` are accepted but
        not used here. The yielded record has ``record_method`` set to this
        parser's class name.
        """
        # TODO: Parse label_type into label model object
        if not result.xpath('./specifier/text()'):
            return
        names = result.xpath('./names/names/text()')
        labels = result.xpath('./labels/text()')
        if not (names or labels):
            return

        c = self.model(
            names=names,
            labels=labels,
            roles=[standardize_role(r) for r in result.xpath('./roles/text()')]
        )
        if c is not None:
            c.record_method = self.__class__.__name__
            yield c