Source code for chemdataextractor.parse.nmr

# -*- coding: utf-8 -*-
"""
NMR text parser.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import copy
import logging
import re


from ..utils import first
from .actions import join, merge, strip_stop, fix_whitespace
from .base import BaseParser
from .common import cc, equals
from .cem import chemical_name, nmr_solvent
from .elements import W, I, T, R, Optional, ZeroOrMore, SkipTo, OneOrMore, Not, Group

log = logging.getLogger(__name__)


# Grammar for the spectrum-level metadata of an NMR phrase. Regex patterns that
# contain backslashes are raw strings so that sequences such as ``\d`` and ``\.``
# reach the regex engine verbatim instead of being treated as (invalid) string
# escapes.

# A bare non-negative integer or decimal token, e.g. "400" or "7.26".
number = R(r'^\d+(\.\d+)?$')

# Nucleus label, isotope-first ("13C") or element-first ("C13"). The 13C{1H}
# notation is accepted both as four tokens and as the two tokens "13C{1H" + "}".
nucleus = (
    (W('13C') + W('{') + W('1H') + W('}')) |
    W('1H') | W('13C') | W('15N') | W('31P') | W('19F') | W('11B') | W('29Si') | W('17O') | W('73Ge') | W('195Pt') |
    W('33S') | W('13C{1H') + W('}') | W('H1') | W('C13') | W('N15') | W('P31') | W('F19') | W('B11') |
    W('Si29') | W('Ge73') | W('Pt195') | W('S33')
)('nucleus').add_action(merge)

# "NMR" with optional dots and an optional trailing "(", case-insensitive.
# NOTE(review): .hide() presumably drops the token from the parse result - see .elements.
nmr_name = R(r'^N\.?M\.?R\.?\(?$', re.I).hide()

# Nucleus fused to the NMR keyword in a single token, e.g. "1HNMR"; group=1 selects
# the captured "1H"/"13C" part (presumably as the element text - confirm in .elements).
nmr_name_with_nucleus = R(r'^(1H|13C)N\.?M\.?R\.?\(?$', re.I, group=1)('nucleus')

# A number followed by "Hz" or "MHz".
frequency = (number('value') + R(r'^M?Hz$')('units'))('frequency')

# Single punctuation token separating items: ; : , . /
delim = R(r'^[;:,\./]$').hide()

# One solvent (or chemical name), optionally a second one joined by "+", "&", "and"
# or a token matched by ``cc``; then optionally everything up to the next
# punctuation/"at" token, and an optional trailing word "solvent".
solvent = ((nmr_solvent | chemical_name) + Optional((R(r'^(\+|&|and)$') | cc) + (nmr_solvent | chemical_name)) + Optional(SkipTo(R(r'^([;:,\.\)]|at)$'))) + Optional(Optional(delim) + I('solvent').hide()))('solvent').add_action(join).add_action(fix_whitespace)

# Temperature value: optional approximation/comparison sign, optional separate minus
# token, then a (possibly signed) number.
temp_value = (Optional(R(r'^[~∼\<\>]$')) + Optional(R(r'^[\-–−]$')) + R(r'^[\+\-–−]?\d+(\.\d+)?$'))('value').add_action(merge)
# "room temp"/"room temperature" or the abbreviation "rt"/"r.t.".
temp_word = (I('room') + R(r'^temp(erature)?$') | R(r'^r\.?t\.?$', re.I))('value').add_action(join)
# "°" followed by a token containing C, F or K, or a bare "K".
temp_units = (W('°') + R('[CFK]') | W('K'))('units').add_action(merge)
# Optional "at", then either value + units or a room-temperature word.
temperature = Optional(I('at').hide()) + Group((temp_value + temp_units) | temp_word)('temperature')


def fix_nmr_peak_whitespace_error(tokens, start, result):
    """Split peaks whose shift text contains several comma-joined values.

    For every element in ``result`` the ``shift`` child is inspected. If its text
    contains a comma (e.g. ``7.26,7.31``), the element is replaced by one deep
    copy per comma-separated part, each copy carrying a single part as its shift
    text and all other children unchanged. Elements without a comma in their
    shift - or without a ``shift`` child or shift text at all - are passed
    through as they are.

    :param tokens: Unused here; part of the parse-action signature.
    :param start: Unused here; part of the parse-action signature.
    :param result: Iterable of element-tree elements (need ``find`` and ``text``).
    :returns: A new list of elements; the input elements are not modified.
    """
    new_result = []
    for e in result:
        shift = e.find('shift')
        # A missing shift element or empty shift text has nothing to split;
        # keep the element instead of failing on None.
        text = shift.text if shift is not None else None
        if not text or ',' not in text:
            new_result.append(e)
            continue
        for peak_text in text.split(','):
            # Copy the whole peak so every split shift keeps the same metadata.
            new_e = copy.deepcopy(e)
            new_e.find('shift').text = peak_text
            new_result.append(new_e)
    return new_result
def strip_delta(tokens, start, result):
    """Remove one leading ``δ`` from the text of every element in ``result``.

    Each element in ``result`` is walked with ``iter()`` (the element itself and
    all of its descendants); any text that starts with ``δ`` has that first
    character removed. Elements are modified in place.

    :param tokens: Unused here; part of the parse-action signature.
    :param start: Unused here; part of the parse-action signature.
    :param result: Iterable of element-tree elements.
    :returns: The same ``result`` object that was passed in.
    """
    for e in result:
        for child in e.iter():
            # iter() also yields elements that have no text of their own
            # (text is None), so check for text before calling startswith.
            if child.text and child.text.startswith('δ'):
                child.text = child.text[1:]
    return result
# Grammar for individual peaks and for the complete NMR phrase. Regex patterns
# that contain backslashes are raw strings so that sequences such as ``\d`` and
# ``\.`` reach the regex engine verbatim instead of being treated as (invalid)
# string escapes.

# Chemical shift given as a range, either in one token ("7.20-7.35") or as
# number, optional dash token, number. An optional separate leading dash token is
# accepted in all three shift forms.
shift_range = (Optional(R(r'^[\-–−‒]$')) + (R(r'^δ?[\+\-–−‒]?\d+(\.+\d+)?[\-–−‒]\d+(\.+\d+)?\.?$') | (R(r'^[\+\-–−‒]?\d+(\.+\d+)?$') + Optional(R(r'^[\-–−‒]$')) + R(r'^[\+\-–−‒]?\d+(\.+\d+)?\.?$'))))('shift').add_action(merge)
# Single shift value, optionally prefixed with "δ" and/or followed by a full stop.
shift_value = (Optional(R(r'^[\-–−‒]$')) + R(r'^δ?[\+\-–−‒]?\d+(\.+\d+)?\.?$'))('shift').add_action(merge)
# Two values fused by a comma inside one token ("7.26,7.31"); the action attached
# to ``peak`` below splits these apart again.
shift_error = (Optional(R(r'^[\-–−‒]$')) + R(r'^δ?[\+\-–−‒]?\d+(\.+\d+)?,\d+(\.+\d+)?\.?$'))('shift').add_action(merge)
shift = (shift_range | shift_value | shift_error).add_action(strip_stop).add_action(strip_delta)

# Multiplicity keywords and abbreviations, optionally prefixed with "b"/"br".
split = R('^(br?)?(s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$')
# One or more multiplicity tokens, optionally "<split> of <split>".
multiplicity = (OneOrMore(split) + Optional(W('of') + split))('multiplicity').add_action(join)

# One or more plain numbers separated by , ; or &. ``number`` here is still the
# plain numeric pattern defined earlier in the module; it is rebound further down.
coupling_value = (number + ZeroOrMore(R('^[,;&]$') + number + Not(W('H'))))('value').add_action(join)
# Earlier variant without the ZeroOrMore continuation-token clause, left commented out:
# coupling = ((R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=')).hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R('^[,;&]$').hide() + coupling_value + W('Hz')('units')))('coupling')
coupling = ((R(r'^\d?J([HCNPFD\d,]*|cis|trans)$') + ZeroOrMore(R(r'^J?([HCNPFD\d,]*|cis|trans)$')) + Optional(R(r'^[\-–−‒]$') + R(r'^[HCNPF\d]$')) + Optional('=')).hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R('^[,;&]$').hide() + coupling_value + W('Hz')('units')))('coupling')

# NOTE: rebinds the module-level name ``number``. From here on it matches a count
# of nuclei such as "3H" (one token) or "3" + "H" (two tokens). Definitions above
# this line captured the earlier plain-number pattern and are unaffected.
number = (R(r'^\d+(\.\d+)?[HCNPF]\.?$') | (R(r'^\d+(\.\d+)?$') + R(r'^[HCNPF]\.?$')))('number').add_action(merge)

# Assignment text: atom-label-like tokens, primes, chemical names or a few fixed
# abbreviations, optionally followed by "× <integer>".
assignment_options = (OneOrMore(R(r'([CNHOPS\-–−‒=]+\d*[A-Za-z]?′*)+') | W('′') | chemical_name | R(r'^(C?quat\.?|Ac|Ar|Ph|linker|bridge)$')) + Optional(W('×') + R(r'^\d+$')))('assignment').add_action(join)
# Optional leading one- or two-digit count (tagged 'number'), then one or more
# assignments separated by tokens tagged 'CC'.
assignment = Optional(R(r'^\d{1,2}$')('number') + Optional(W('×')).hide()) + (assignment_options + ZeroOrMore(T('CC').hide() + assignment_options))
note = (W('overlapped') | (W('×') + R(r'^\d+$')))('note').add_action(join)

# Parenthesised metadata following a shift: "(" item [delims item]* [delim] ")".
peak_meta_options = multiplicity | coupling | number | assignment | note
peak_meta = W('(').hide() + peak_meta_options + ZeroOrMore(ZeroOrMore(delim) + peak_meta_options) + Optional(delim) + W(')').hide()

# "δ" or "d", optionally with a nucleus letter, optionally followed by an equals sign.
delta = (R('^[δd][HCNPF]?$') + Optional(equals)).hide()
# "ppm", optionally preceded by "in" and optionally wrapped in round/square brackets.
ppm = Optional(R(r'^[(\[]$')) + Optional(I('in')) + I('ppm') + Optional(R(r'^[)\]]$'))
# Optionally parenthesised run of frequency / solvent / delta / temperature items.
spectrum_meta = Optional(W('(').hide()) + (frequency | solvent | delta | temperature) + ZeroOrMore(Optional(delim) + (frequency | solvent | I('ppm') | delta | temperature)) + Optional(temperature) + Optional(W(')').hide())
prelude_options = spectrum_meta | delta | delim | ppm.hide() | equals.hide()
# Start of an NMR phrase: "<nucleus> NMR ..." / "1HNMR ...", or a "δH"/"δC"-style
# token (tagged as the nucleus) followed by spectrum metadata.
prelude = ((nucleus + Optional(R(r'^[\-–−‒]$')).hide() + nmr_name | nmr_name_with_nucleus) + ZeroOrMore(prelude_options)) | (R('^δ[HC]?$')('nucleus') + spectrum_meta + ZeroOrMore(prelude_options))

# A peak is a shift that is not immediately followed by Hz/MHz (which would make
# the number a frequency), with optional "ppm" and optional parenthesised metadata.
peak = Optional(delta) + (shift + Not(R('^M?Hz$')) + Optional(ppm).hide() + Optional(peak_meta))('peak').add_action(fix_nmr_peak_whitespace_error)
peaks = (peak + ZeroOrMore(ZeroOrMore(delim | W('and')).hide() + peak))('peaks')
nmr = (prelude + peaks)('nmr')
class NmrParser(BaseParser):
    """Parser that turns a match of the ``nmr`` grammar into a spectrum record.

    ``self.model`` is not set in this class; ``interpret`` expects it to expose
    ``fields['compound']`` and ``fields['peaks']`` and to be callable with the
    spectrum keyword arguments used below.
    """

    # Root grammar element this parser matches.
    root = nmr
    # NOTE(review): consumed by the parsing machinery outside this file - confirm
    # its exact meaning against BaseParser.
    parse_full_sentence = True

    def __init__(self):
        # Intentionally empty: BaseParser.__init__ is not called here.
        pass

    def interpret(self, result, start, end):
        """Build a spectrum record, including its peaks, from a parse result.

        :param result: Parse result element for one ``nmr`` match; must support ``xpath``.
        :param start: Unused in this method.
        :param end: Unused in this method.
        :returns: Generator yielding a single ``self.model`` instance whose
            ``compound`` is a newly created, empty compound record.
        """
        c = self.model.fields['compound'].model_class()
        n = self.model(
            nucleus=first(result.xpath('./nucleus/text()')),
            solvent=first(result.xpath('./solvent/text()')),
            frequency=first(result.xpath('./frequency/value/text()')),
            frequency_units=first(result.xpath('./frequency/units/text()')),
            temperature=first(result.xpath('./temperature/value/text()')),
            temperature_units=first(result.xpath('./temperature/units/text()'))
        )
        peak_model = self.model.fields['peaks'].field.model_class
        for peak_result in result.xpath('./peaks/peak'):
            nmr_peak = peak_model(
                shift=first(peak_result.xpath('./shift/text()')),
                multiplicity=first(peak_result.xpath('./multiplicity/text()')),
                coupling=first(peak_result.xpath('./coupling/value/text()')),
                coupling_units=first(peak_result.xpath('./coupling/units/text()')),
                number=first(peak_result.xpath('./number/text()')),
                assignment=first(peak_result.xpath('./assignment/text()'))
            )
            n.peaks.append(nmr_peak)
        n.compound = c
        # Yield the populated spectrum record. Yielding ``c`` instead would hand
        # back the empty compound and discard everything parsed above.
        yield n