# Source code for chemdataextractor.nlp.new_cem

# -*- coding: utf-8 -*-
"""
New and improved named entity recognition (NER) for Chemical entity mentions (CEM).
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

import six

from ..data import find_data
from .finetuned_bert_crf_wrapper import _BertCrfTagger
from .tag import EnsembleTagger, NER_TAG_TYPE
from .allennlpwrapper import _AllenNlpTokenTagger, ProcessedTextTagger, AllenNlpWrapperTagger

from allennlp.data.token_indexers import PretrainedBertIndexer


# Token indexers for the fine-tuned BERT -> CRF tagger defined below.
# do_lowercase=False goes together with the *cased* SciBERT vocabulary file.
indexers = {
    "bert": PretrainedBertIndexer(
        pretrained_model=find_data("models/scibert_cased_vocab-1.0.txt"),
        do_lowercase=False,
        use_starting_offsets=True,
        truncate_long_sequences=False,
    ),
}

# Module-level tagger instances placed ahead of the BERT-CRF tagger in
# ``CemTagger.taggers``.
# NOTE(review): presumably these tokenize / pre-process the text for the
# AllenNLP model -- confirm against .allennlpwrapper.
tokentagger = _AllenNlpTokenTagger()
processtagger = ProcessedTextTagger()


class BertFinetunedCRFCemTagger(AllenNlpWrapperTagger):
    """
    A Chemical Entity Mention tagger using a finetuned BERT model with a CRF
    to constrain the outputs.
    """

    # Kind of tag this tagger emits.
    tag_type = NER_TAG_TYPE
    # Reuse the module-level indexers built on the cased SciBERT vocabulary.
    indexers = indexers
    # Data path of the fine-tuned BERT-CRF model.
    model = "models/bert_finetuned_crf_model-1.0a"
    # Point the model's BERT embedder at the locally resolved SciBERT weights.
    overrides = {
        "model.text_field_embedder.token_embedders.bert.pretrained_model": find_data(
            "models/scibert_cased_weights-1.0.tar.gz"
        )
    }

    def process(self, tag):
        """
        Rewrite a tag so that every ``CEM`` substring becomes ``CM``
        (for example ``"B-CEM"`` becomes ``"B-CM"``).

        NOTE(review): presumably invoked by the wrapper base class on each
        predicted label to match the tag names used elsewhere -- confirm.

        :param tag: Tag string to convert.
        :returns: The tag with ``CEM`` replaced by ``CM``.
        """
        return tag.replace("CEM", "CM")
class CemTagger(EnsembleTagger):
    """
    A state of the art Named Entity Recognition tagger for both organic and
    inorganic materials that uses a tagger based on BERT with a Conditional
    Random Field to constrain the outputs.
    More details in the paper
    (https://pubs.acs.org/doi/full/10.1021/acs.jcim.1c01199).
    """

    # Component taggers of the ensemble, in this order: the two shared
    # module-level taggers first, then a fresh BERT-CRF tagger instance.
    taggers = [tokentagger, processtagger, BertFinetunedCRFCemTagger()]