Automatic table parsing - complex models

This example demonstrates how complex model structures can automatically be extracted from tables, taking the appropriate required flags into account, for each submodel.

This is the table we want to parse (./data/table_example_3.csv):

Composition TC (K) Enthalpy (kJ) −S diff(J kg−1 K) CCK (J kg−1) Ref.
MnO2 293 5 9.5 410 52–55
MnO2 293 1 3.25 56
La2 (PWD) 337 1 2.70 68 57
La0.67Ba0.33 (TF) 292 5 1.48 161 26
La0.67Ca0.33 (TF) 252 5 2.08 175 26
La0.67Sr0.33MnO3 (TF) 312 1.5 1.54 50.16 58
La0.67Sr0.33MnO3 (TF) 321 1.5 1.47 34.24 58
Ba0.33Mn0.98Ti0.02O3 (PWD) 309 1 0.93 45 39
Ba0.33Mn0.98Ti0.02O3 (PWD) 309 5 3.19 307 39
Ba0.33Mn0.98Ti0.02O3 (TF) 286 5 3.35 220 This work
Ba0.33Mn0.98Ti0.02O3 (TF) 286 1 0.99 49 This work

We are interested in the Curie temperature TC (K), so we create an appropriate model for it. Note that we are also assigning an enthalpy and a reference to the property, so we have to create models for those properties too, and nest them within the CurieTemperature model. The Reference model also contains a fictitious model that we call AbsentTemperature, so named because it will not be found in this table. Note that we are specifying required flags for each submodel. The contextual = True flag we have set for the compound model indicates that the compound could be merged from elsewhere in the document or from the caption of the table.

In [1]:
from chemdataextractor.doc import Caption, Document
from chemdataextractor.doc.table import Table
from chemdataextractor.model import TemperatureModel, StringType, Compound, ModelType, DimensionlessModel
from chemdataextractor.model.units.energy import EnergyModel
from chemdataextractor.parse import W, R
from chemdataextractor.parse.auto import AutoTableParser
In [2]:
class AbsentTemperature(TemperatureModel):
    """Fictitious temperature property that is not expected to be found in the table.

    Its specifier only matches the word ``Nothing``, which does not occur in
    the example table, so this model serves to demonstrate how a required but
    absent submodel affects the records of the models that nest it.
    """
    specifier = StringType(parse_expression=W('Nothing'), required=True)
    # contextual=True: the compound may be merged in from elsewhere in the
    # document or from the table caption.
    compound = ModelType(Compound, required=True, contextual=True)
    # The attribute must be spelled ``parsers`` (plural), like on the other
    # models in this example; a ``parser`` attribute is presumably ignored by
    # the framework, leaving the model without the automatic table parser.
    parsers = [AutoTableParser()]

class Reference(DimensionlessModel):
    """Dimensionless literature-reference value taken from the ``Ref`` column.

    Nests the AbsentTemperature model as a required field, in addition to a
    required, contextual compound.
    """

    specifier = StringType(
        required=True,
        parse_expression=R('Ref'),
    )
    # contextual=True: the compound may be merged in from elsewhere in the
    # document or from the table caption.
    compound = ModelType(Compound, contextual=True, required=True)
    absent_temperature = ModelType(
        AbsentTemperature,
        contextual=True,
        required=True,
    )
    parsers = [AutoTableParser()]

class Enthalpy(EnergyModel):
    """Energy-valued property identified by the ``Enthalpy`` column header."""

    specifier = StringType(
        required=True,
        parse_expression=R('Enthalpy'),
    )
    # contextual=True: the compound may be merged in from elsewhere in the
    # document or from the table caption.
    compound = ModelType(Compound, contextual=True, required=True)
    parsers = [AutoTableParser()]

class CurieTemperature(TemperatureModel):
    """Curie temperature, the top-level property extracted in this example.

    Nests a required Enthalpy and an optional Reference, each of which is a
    model with its own required flags.
    """

    specifier = StringType(
        required=True,
        parse_expression=R(r'^\[?T(C|c)(urie)?[1-2]?\]?$'),
    )
    # contextual=True: the compound may be merged in from elsewhere in the
    # document or from the table caption.
    compound = ModelType(Compound, contextual=True, required=True)
    enthalpy = ModelType(Enthalpy, contextual=True, required=True)
    # Not required, so a record can still be produced without a reference.
    reference = ModelType(Reference, contextual=True, required=False)
    parsers = [AutoTableParser()]

We then process a document containing only the table:

In [3]:
# Build a document whose only element is the example table (empty caption).
doc = Document(
    Table(
        caption=Caption(""),
        table_data="./data/table_example_3.csv",
    )
)
# Only the top-level model is registered; Enthalpy and Reference are reached
# through its nested fields.
doc.models = [CurieTemperature]

# Print every extracted record in its serialized (dict) form.
for record in doc.records:
    print(record.serialize())
{'CurieTemperature': {'raw_value': '293', 'raw_units': '(K)', 'value': [293.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['MnO2']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['MnO2']}}}}}}
{'CurieTemperature': {'raw_value': '293', 'raw_units': '(K)', 'value': [293.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['MnO2']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1', 'raw_units': '(kJ)', 'value': [1.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['MnO2']}}}}}}
{'CurieTemperature': {'raw_value': '337', 'raw_units': '(K)', 'value': [337.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La2']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1', 'raw_units': '(kJ)', 'value': [1.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La2']}}}}}}
{'CurieTemperature': {'raw_value': '292', 'raw_units': '(K)', 'value': [292.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La0.67Ba0.33']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La0.67Ba0.33']}}}}}}
{'CurieTemperature': {'raw_value': '252', 'raw_units': '(K)', 'value': [252.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La0.67Ca0.33']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La0.67Ca0.33']}}}}}}
{'CurieTemperature': {'raw_value': '312', 'raw_units': '(K)', 'value': [312.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1.5', 'raw_units': '(kJ)', 'value': [1.5], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}}}}}
{'CurieTemperature': {'raw_value': '321', 'raw_units': '(K)', 'value': [321.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1.5', 'raw_units': '(kJ)', 'value': [1.5], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}}}}}
{'CurieTemperature': {'raw_value': '309', 'raw_units': '(K)', 'value': [309.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1', 'raw_units': '(kJ)', 'value': [1.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}}}
{'CurieTemperature': {'raw_value': '309', 'raw_units': '(K)', 'value': [309.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}}}
{'CurieTemperature': {'raw_value': '286', 'raw_units': '(K)', 'value': [286.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}}}
{'CurieTemperature': {'raw_value': '286', 'raw_units': '(K)', 'value': [286.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1', 'raw_units': '(kJ)', 'value': [1.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}}}

As we can see, the reference is not found in the records, because it was not required and it contained a model that was required but not present.

If we set the reference field to required = True, we will get no records at all:

In [4]:
# Make the nested Reference mandatory on the class-level field. Reference
# itself still requires AbsentTemperature, so per the accompanying text the
# loop below is expected to print nothing.
CurieTemperature.reference.required = True

# Re-read the records from the same document after changing the flag.
for record in doc.records:
    print(record.serialize())

Alternatively, if we set the absent_temperature to not required we will get everything, including the reference:

In [5]:
# Stop requiring the AbsentTemperature submodel inside Reference, so that a
# Reference can be formed without it.
Reference.absent_temperature.required = False

# Re-read the records; they are now expected to include the reference.
for record in doc.records:
    print(record.serialize())
{'CurieTemperature': {'raw_value': '293', 'raw_units': '(K)', 'value': [293.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['MnO2']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['MnO2']}}}}, 'reference': {'Reference': {'raw_value': '52–55', 'value': [52.0, 55.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['MnO2']}}}}}}
{'CurieTemperature': {'raw_value': '293', 'raw_units': '(K)', 'value': [293.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['MnO2']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1', 'raw_units': '(kJ)', 'value': [1.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['MnO2']}}}}, 'reference': {'Reference': {'raw_value': '56', 'value': [56.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['MnO2']}}}}}}
{'CurieTemperature': {'raw_value': '337', 'raw_units': '(K)', 'value': [337.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La2']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1', 'raw_units': '(kJ)', 'value': [1.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La2']}}}}, 'reference': {'Reference': {'raw_value': '57', 'value': [57.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['La2']}}}}}}
{'CurieTemperature': {'raw_value': '292', 'raw_units': '(K)', 'value': [292.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La0.67Ba0.33']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La0.67Ba0.33']}}}}, 'reference': {'Reference': {'raw_value': '26', 'value': [26.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['La0.67Ba0.33']}}}}}}
{'CurieTemperature': {'raw_value': '252', 'raw_units': '(K)', 'value': [252.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La0.67Ca0.33']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La0.67Ca0.33']}}}}, 'reference': {'Reference': {'raw_value': '26', 'value': [26.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['La0.67Ca0.33']}}}}}}
{'CurieTemperature': {'raw_value': '312', 'raw_units': '(K)', 'value': [312.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1.5', 'raw_units': '(kJ)', 'value': [1.5], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}}}, 'reference': {'Reference': {'raw_value': '58', 'value': [58.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}}}}}
{'CurieTemperature': {'raw_value': '321', 'raw_units': '(K)', 'value': [321.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1.5', 'raw_units': '(kJ)', 'value': [1.5], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}}}, 'reference': {'Reference': {'raw_value': '58', 'value': [58.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['La0.67Sr0.33MnO3']}}}}}}
{'CurieTemperature': {'raw_value': '309', 'raw_units': '(K)', 'value': [309.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1', 'raw_units': '(kJ)', 'value': [1.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}, 'reference': {'Reference': {'raw_value': '39', 'value': [39.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}}}
{'CurieTemperature': {'raw_value': '309', 'raw_units': '(K)', 'value': [309.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}, 'reference': {'Reference': {'raw_value': '39', 'value': [39.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}}}
{'CurieTemperature': {'raw_value': '286', 'raw_units': '(K)', 'value': [286.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '5', 'raw_units': '(kJ)', 'value': [5.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}, 'reference': {'Reference': {'raw_value': '286', 'value': [286.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}}}
{'CurieTemperature': {'raw_value': '286', 'raw_units': '(K)', 'value': [286.0], 'units': 'Kelvin^(1.0)', 'specifier': 'TC', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}, 'enthalpy': {'Enthalpy': {'raw_value': '1', 'raw_units': '(kJ)', 'value': [1.0], 'units': '(10^3.0) * Joule^(1.0)', 'specifier': 'Enthalpy', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}, 'reference': {'Reference': {'raw_value': '286', 'value': [286.0], 'specifier': 'Ref', 'compound': {'Compound': {'names': ['Ba0.33Mn0.98Ti0.02O3']}}}}}}