# Source code for chemdataextractor.nlp.cem

# -*- coding: utf-8 -*-
"""
Named entity recognition (NER) for Chemical entity mentions (CEM).

This was the default NER system up to version 2.0, while the new NER is
included in new_cem.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

import six

from ..text import bracket_level
from .lexicon import ChemLexicon
from .tag import EnsembleTagger, CrfTagger, DictionaryTagger, NER_TAG_TYPE, POS_TAG_TYPE


log = logging.getLogger(__name__)


# Review notes (not part of the generated API docs):
# - This is a list, not a set, so element order is preserved. Longer variants are listed ahead of
#   their shorter tails (e.g. '-o-deethylase' before '-deethylase'); presumably the consumer stops
#   at the first matching suffix -- TODO confirm before reordering or de-duplicating.
# - NOTE(review): a few entries are mixed-case ('-Pdots', '-NPs', '-QDs', '-DNA') while others are
#   lowercased ('-trna', '-atpase', '-nmr'). If the text is lowercased before comparison the
#   mixed-case entries can never match -- verify against the caller.
# - '-multiwall' also appears in IGNORE_PREFIX.
#: Token endings to ignore when considering stopwords and deriving spans
IGNORE_SUFFIX = [
    # Many of these are now unnecessary due to tokenization improvements, but not much harm in leaving them here.
    '-', '\'s', '-activated', '-adequate', '-affected', '-anesthetized', '-based', '-binding', '-boosted', '-cane',
    '-conditioned', '-containing', '-covered', '-deficient', '-dependent', '-derived', '-electrolyte', '-enriched',
    '-exposed', '-flanking', '-free', '-fused', '-gated', '-glucuronosyltransferases', '-increasing', '-induced',
    '-inducible', '-l-tyrosine', '-labeled', '-lesioned', '-loaded', '-mediated', '-patterned', '-primed', '-reducing',
    '-regulated', '-releasing', '-resistant', '-response', '-rich', '-s-transferase', '-sensitive', '-soluble',
    '-stimulated', '-stressed', '-supplemented', '-terminal', '-transferase', '-treated', '-type', '-blood',
    '-specific', '-like', '-elicited', '-stripped', '-transfer', '-conjugate', '-coated', '-producing', '-oxidized',
    '-associated', '-related', '-converting', '-ligand', '-on-glass', '-seeking', '-hydrolyzing', '-o-deethylase',
    '-deethylase', '-o-depentylase', '-depentylase', '-n-demethylase', '-demethylase', '-o-methyltransferase',
    '-c-oxidase', '-oxidase', '-n-biosidase', '-biosidase', '-immunoproteins', '-spiked', '-lowering', '-page',
    '-depletion', '-formation', '-dealkylation', '-deethylation', '-alkylation', '-ribosylation', '-production',
    '-demethylation', '-oxidation', '-transition', '-glycosylation', '-zwitterion', '-benzylation', '-reduction',
    '-oxygenation', '-nitrosylation', '-evoked', '-mutated', '-doped', '-aged', '-increased', '-triggered', '-linked',
    '-fixed', '-injected', '-contaminated', '-depleted', '-enhanced', '-stained', '-modified', '-fed', '-demethylated',
    '-catalyzed', '-etched', '-labelled', '-conjugated', '-pretreated', '-ribosylated', '-phosphorylated', '-reduced',
    '-bonded', '-stabilised', '-crosslinked', '-mannosylated', '-capped', '-supported', '-initiated', '-integrated',
    '-accelerated', '-encapsulated', '-untreated', '-expanded', '-coupled', '-terminated', '-assisted',
    '-permeabilized', '-resulted', '-alkylated', '-functionalized', '-contained', '-buffered', '-caused', '-cyclized',
    '-substituted', '-modulated', '-inhibited', '-centered', '-promoted', '-confirmed', '-provoked', '-dominated',
    '-limited', '-challenged', '-tetrabrominated', '-unesterified', '-refreshed', '-bottled', '-protonated',
    '-incubated', '-tagged', '-damaged', '-bridged', '-maintained', '-impregnated', '-metabolizing', '-deprived',
    '-insensitive', '-dendrimer', '-receptor', '-tolerant', '-influx', '-administrated', '-requiring', '-permeable',
    '-transport', '-intoxicated', '-overload', '-derivatives', '-derivative', '-sweetened', '-transporter', '-bound',
    '-extract', '-bonding', '-bond', '-trna', '-redistribution', '-copolymers', '-copolymer', '-appended',
    '-susceptible', '-transfected', '-bearing', '-regenerating', '-induction', '-conducting', '-decorated',
    '-encapsulating', '-consuming', '-bridge', '-dependence', '-Pdots', '-only', '-carrying', '-treating', '-isomerase',
    '-ion', '-ions', '-coordinated', '-saturated', '-sparing', '-enclosed', '-stabilized', '-polymer', '-yeast',
    '-making', '-porous', '-independent', '-metallized', '-attenuated', '-liquid', '-caged', '-deficiency', '-sensing',
    '-recognition', '-responsiveness', '-embedded', '-connectivity', '-abuse', '-chelating', '-decocted', '-forming',
    '-nutrition', '-scavenging', '-preferring', '-mimicking', '-drugs', '-drug', '-lubricants', '-adsorption',
    '-ligated', '-detected', '-responsive', '-reacting', '-defined', '-capturing', '-group', '-abstinent', '-paired',
    '-devalued', '-need', '-cellulose', '-atpase', '-inactivated', '-β-glucosaminidase', '-glucosaminidase', '-dosed',
    '-imprinted', '-precipitated', '-monoadducts', '-vacancies', '-vacancy', '-attributed', '-depolarization',
    '-depolarized', '-liver', '-testes', '-reversible', '-active', '-reactive', '-dextran', '-fixing', '-synthesizing',
    '-inhibitory', '-cleaving', '-positive', '-activity', '-fluorescence', '-regulating', '-NPs', '-scanning',
    '-water', '-nmr', '-limiting', '-refractory', '-knot', '-variable', '-biomolecule', '-backbone', '-exchange',
    '-donating', '-coating', '-hydrogenase', '-hydrogenases', '-intolerant', '-deplete', '-poor', '-loading',
    '-enrichment', '-elevating', '-resitant', '-stabilizing', '-pathway', '-fortified', '-adjusted',
    '-restricted', '-dependant', '-locked', '-normalized', '-aromatic', '-hydroxylation', '-intermediate',
    '-6-phosphatase', '-phosphatase', '-linker', '-proteomic', '-mimetic', '-lipid', '-radical', '-receptors',
    '-substrate', '-conjugates', '-promoting', '-dye', '-functionalyzed', '-catalysed', '-reductase', '-QDs',
    '-complexes', '-placebo', '-transferases', '-alginate', '-competing', '-depleting', '-sensitized',
    '-protein', '-regulatory', '-target', '-toxin', '-yield', '-planted', '-produced', '-derivatized', '-secreting',
    '-modifying', '-DNA', '-bonds', '-assemblages', '-exposure', '-negative', '-sealed', '-atom', '-atoms',
    '-abstraction', '-concentration', '-doping', '-competitive', '-acclimation', '-acclimated', '-interlinked',
    '-suppressed', '-postlabeling', '-labeling', '-diabetic', '-omitted', '-sufficient', '-generating', '-terminus',
    '-adducts', '-compound', '-compounds', '-γ-lyase', '-γ-synthase', '-lyase', '-synthase', '-inhibitor',
    '-protected', '-multiwall', '-stripping', '-plasma', '-evolving'
]

# Review notes (not part of the generated API docs):
# - NOTE(review): 'high-' is listed twice. Harmless if the consumer stops at the first match,
#   but left untouched because the consumer is not visible here -- TODO confirm and de-duplicate.
# - NOTE(review): '-multiwall' starts with a hyphen like the IGNORE_SUFFIX entries (where it also
#   appears); it looks like it was meant to be 'multiwall-' -- verify before changing.
#: Token beginnings to ignore when considering stopwords and deriving spans
IGNORE_PREFIX = [
    'fluorophore-', 'low-', 'high-', 'single-', 'odd-', 'non-', 'high-', 'cross-', 'cellulose-', 'anti-', '-multiwall',
    'globular-', 'plasma-', 'hybrid-', 'protein-', 'explicit-', 'cation-', 'water-', 'through-', 'starch-', 'rigid-',
    'conjugated-', 'photoactivatable-', 'alginate-', 'nano-', 'dye-', 'ligand-', 'enzyme-', 'platelet-', 'photo-',
    'total-', 'drug-', 'nanoparticle-', 'nanomaterial-', 'inter-', 'ion-', 'post-', 'one-'
]

# Besides generic head nouns ('group', 'salt', 'ion', ...), this list also holds conjunctions and
# punctuation tokens ('and', 'or', '(', ';', ',', '.') that may trail a match.
# Plural forms are listed explicitly next to their singular where both are wanted.
#: Final tokens to remove from entity matches
STRIP_END = [
    'groups', 'group', 'colloidal', 'dyes', 'dye', 'products', 'product', 'substances', 'substance', 'solution',
    'derivatives', 'derivative', 'analog', 'salts', 'salt', 'minerals', 'mineral', 'anesthetic', 'tablet', 'tablets',
    'preparation', 'atoms', 'atom', 'monomers', 'monomer', 'nanoparticles', 'nanoparticle', 'radicals', 'radical',
    'dendrimers', 'dendrimer', 'ions', 'ion', 'particles', 'particle', 'anion', 'cation', 'foam', 'cellulose',
    'dextran', '(', 'dust', 'herbicide', 'disease', 'diseases', 'and', 'or', ';', ',', '.'
]

# Mostly descriptive modifiers ('anhydrous', 'amorphous', ...), plus the conjunctions 'and'/'or'
# and a stray closing bracket ')' that may lead a match.
#: First tokens to remove from entity matches
STRIP_START = [
    'anhydrous', 'elemental', 'amorphous', 'conjugated', 'colloidal', 'activated', 'water-soluble', 'total',
    'superparamagnetic', 'molecular', 'high-density', 'synthetic', 'low-density', 'long-chain', 'fused', 'radioactive',
    'reduced', 'anatase', 'dextran', ')', 'trisubstituted', 'deposited', 'herbicide', 'antagonist', 'agonist', 'and',
    'or', 'metallic', 'embryotoxic', 'monoclinic'
]

# Review notes (not part of the generated API docs):
# - This is a set, so ordering and repeated entries are irrelevant.
# - All entries are stored lowercase.
# - NOTE(review): 'नेपाल भाषा' contains a space; if comparison is done per single token, as the
#   description below says, this entry presumably never matches -- verify against the tokenizer.
#: Disallowed tokens in chemical entity mentions (discard if any single token has exact case-insensitive match)
STOP_TOKENS = {
    'gene', 'inhibitor', 'genetical', 'human', 'recombinant', 'recombination', 'adenovirus', 'bovine', 'chicken',
    'sheep', 'pig', 'horse', 'mammalian', 'salmon', 'cytochrome', 'glycoprotein', 'genevrier', 'novartis', 'visfarm',
    'bristol-myers', 'squibb', 'allphar', 'bioniche', 'bipharma', 'chauvin', 'merck', 'procter', 'roche',
    'glaxo', 'glaxosmithkline', 'pfizer', 'ciba-geigy', 'interpharm', 'bayer', 'astrazeneca', 'aventis', 'behringer',
    'ratiopharm', 'pharmacia', 'apotex', 'novopharm', 'alpharma', 'schering', 'genzyme', 'aldrich', 'wiskott',
    'crossref', 'chemistry', '10.1039', '10.1021', '10.1186', 'doi', 'january', 'february', 'march', 'april', 'june',
    'july', 'august', 'september', 'october', 'november', 'december', 'esi', '†', '§', 'london', 'paris', 'tokyo',
    'york', 'angeles', 'francisco', 'berlin', 'bristol', 'southampton', 'edinburgh', 'chicago', 'cambridge', 'oxford',
    'parameters', 'volume', 'dielectric', 'cm–1', 'measurements', 'studies', 'imaging', 'ccdc', 'sigma-aldrich',
    'scientifique', 'china', 'fig.', 'approach', 'colored', 'isbn', 'having', 'background', 'method', 'methods',
    'results', 'discussion', 'introduction', 'conclusion', 'conclusions', 'prior', 'technical', 'nano-beads',
    'nanobeads', 'test', 'production', 'priority', 'claim', 'claims', 'journal', 'journals', 'letters', 'phenomena',
    'article', 'articles', 'ethical', 'guidelines', 'editor', 'editors', 'profile', 'editorial', 'masthead', 'citing',
    'download', 'citation', 'members', 'privacy', 'policy', 'help', 'chemworx', 'biochemistry', 'energy', 'more',
    'syntheticpage', 'contact', 'fluorochem', '.cdx', '.sk2', 'email', 'affiliation', 'affiliations', 'bibtex',
    'medline', 'marinlit', 'chemspider', 'permissions', 'ekins', 'edit', 'links', 'link', 'english', 'italiano',
    'esperanto', 'español', 'wikimedia', 'upload', 'file', 'account', 'personal', 'navigation', 'menu', 'external',
    'references', 'safety', 'pharmacology', 'coffee', 'research', 'bibliography', 'tobacco', 'palestine', 'doctrine',
    'napoleon', 'azərbaycanca', 'euskara', 'latviešu', 'nordfriisk', 'नेपाल भाषा', 'children', 'overdose', 'chocolate',
    'systematic', 'google', 'literature', 'books', 'docking', 'chromatography', 'libraries', 'retention', 'index',
    'danielle', 'claire', 'rachel', 'zhang', 'linkedin', 'magazine', 'america', 'ireland',

}

# This is a set, so ordering is irrelevant. Several entries carry deliberate leading/trailing
# spaces (' with ', ' oil', 'factor ') so that they only match at word boundaries inside a
# longer mention; '\\' is a single literal backslash.
#: Disallowed substrings in chemical entity mentions (only used when filtering to construct the dictionary?)
STOP_SUB = {
    'botulinum', 'plasminogen', 'necrosis', 'exciton', 'glucan', 'fibroblast', 'follicle', 'natriuretic', 'luteinizing',
    'insulin', 'platelet', 'glucagon', 'activating factor', 'necrosis factor', 'growth factor', ' with ', ' brand of ',
    'transcription factor', ' oil', 'oil of ', ', ', '?', '!', '\\', '|', '@', ';', '%', 'stimulating factor',
    'coagulation factor', 'neurofilament', 'freund', 'anticodon', 'neuropeptide', 'intercellular', 'gene-related',
    'selectin', 'reactive', 'interleukin', 'gramicidin', 'melanin', 'corticotropin', 'corticotrophin',
    'adrenocorticotropic', 'hemoglobin', 'concanavalin', 'factor ', 'releasing factor', 'regulator', 'transmembrane',
    'conductance', 'interferon'
}

# Review notes (not part of the generated API docs):
# - This is a set, so the many repeated entries (e.g. 'cocktail', 'maltodextrin', 'transdermal patch')
#   collapse to one and ordering is irrelevant.
# - NOTE(review): almost every entry is lowercase, but 'Nucleophosmin' and 'Ω127' are not. If the
#   "case-insensitive" match is implemented by lowercasing the candidate text, these two spellings can
#   never match ('nucleophosmin' is also present in lowercase; 'ω127' is not) -- verify against the caller.
# - The tail of the set holds common English stopwords, unit abbreviations and section headings.
#: Disallowed chemical entity mentions (discard if exact case-insensitive match)
STOPLIST = {
    'gold', 'lead', 'yellow', 'epidermal growth factor', 'pristine', 'transdermal patch', 'olive oil', 'groundnut oil',
    'telomerase', 'transdermal patch', 'cascade', 'agar', 'distilled water', 'water', 'alpha-actinin-4', 'iberiotoxin',
    'alginate', 'pancreatin', 'starch', 'iκbα', 'insulin', 'cetuximab', 'il-2', 'serum albumin', 'discover', 'glycine',
    'roundup', 'balance', 'glycogen', 'epidermal growth factor (egf)', 'polysaccharide', 'ginseng', 'hemoglobin',
    'hydroxypropylcellulose', 'advantage', 'petroleum ether', 'gypsum', 'light yellow', 'cadmium chloride (cdcl2)',
    'histone', 'absolute ethanol', 'activated charcoal', 'puerarin', 'total bilirubin', 'collagenase', 'capmul',
    'cremophor el', 'ubiquitin', 'glp-1', 'glucagon-like peptide-1', 'vinegar', 'accelerate', 'Nucleophosmin',
    'deionized water', 'betula', 'pectin', 'pectins', 'furosemide', 'bumetanide', 'teac', 'dept', 'plumbago',
    'cytochrome c', 'ndma', 'ultimate', 'triticum', 'ubiquinone', 'artemisinin', 'cytochrome p450', 'parkin', 'proton',
    'elevate', 'lime', 'corn oil', 'hydrogel', 'activin', 'amylin', 'raven', 'nerve agent', 'collagen', 'gradual',
    'probiotic', 'akron', 'spotlight', 'meta-analysis', 'osteopontin', 'integrin', 'glycoproteins', 'classic',
    'silence', 'first sign', 'compendium', 'prothrombin', 'blood coagulation factor x', 'advance', 'insular',
    'tarragon', 'mutagen', 'agarose', 'glycoprotein', 'maintain a', 'inhalable', 'adrenocorticotropic hormone',
    'cyclin d1', 'cyclin d3', 'chitosan', 'cellulose', 'betaine', 'thromboplastin', 'thrombin',
    'factor x', 'plasminogen', 'exciton', 'growth hormone', 'placental growth hormone', 'aprotinin', 'glucans',
    'latex particles', 'piper', 'corticotropin', 'dixon', 'bengal', 'fret-capture', 'intense blue', 'singlet oxygen',
    'oil-in-water', 'water-in-oil', 'protio', 'crotoxin', 'oil', 'cocktail', 'nodular', 'interceptor', 'fibrinogen',
    'dams', 'lotion', 'consist', 'mascot', 'radio', 'prep', 'ac187', 'pima', 'biopterin', 'dalteparin', 'enoxaparin',
    'lmwh', 'angiotensinogen', 'revolution', 'trails', 'am1', 'xanthium', 'ω', 'noxa', 'sepharose', 'melanin', 'ricin',
    'trypsinogen', 'conserve', 'preview', 'barrels', 'hemozoin', 'recruit', 'dragon', 'acacia', 'homogentisate',
    'triangle', 'vortex', 'reconcile', 'aversion', 'ubr2', 'calcitonin', 'samp', 'xanthan', 'ascophyllum', 'vicilin',
    'maltodextrin', 's100', 'maltodextrin', 'spme', 'p300', 'p450', 'hyaluronidase', 'osteocalcin', 'm41.4', 'genesis',
    'tnfα', 'herceptin', 'laba', 'teriparatide', 'rutile', 'harness', 'hyperoxia', 'adalimumab',
    'cholecystokinin', 'counter', 'acth', 'raptor', 'comet', 'regulon', 'erythropoietin', 'dextran', 'metallothionein',
    'perna', 'carotenoids', 'carotenoid', 'propolis', 'amylose', 'amylopectin', 'ovalbumin', 'ovomucoid', 'gelatin',
    'gemini', 'imperator', 'rubber', 'pak1', 'eristostatin', 'heparin', 'dynorphins', 'dynorphin', 'concise',
    'antitussive', 'maneb', 'foxo1', 'octadecaneuropeptide', 'oligonucleotide', 'prolactin', 'cocktail', 'carotene',
    'pgc1α', 'hyaluronan', 'nucleophosmin', 'thyroglobulin', 'carrageenan', 'abbott', 'chymotrypsin', 'excel',
    'polyubiquitin', 'gelatine', 'carboxymethylcellulose', 'urokinase', 'invader', 'belatacept', 'ferritin', 'casein',
    'taxus', 'proopiomelanocortin', 'capture', 'chitosan', 'cellulose', 'betaine', 'thromboplastin', 'thrombin',
    'aprotinin', 'xyloglucan', 'glucans', 'piper', 'corticotropin', 'dixon', 'bengal', 'protio', 'δr(1)', 'crotoxin',
    'amphiregulin', 'fulfill', 'scpa', 'freedom', 'hemopexin', 'transferrin', 'chemokine', 'pegsunercept',
    'iscu', 'auroxanthin', 'vanilla', 'spectrin', 'caprine', 'picrate', 'fucoidan', 'talin', 'polypeptide', 'mag2',
    'mag2\'s', 'ethylcellulose', 'calcined', 'interferon', 'b13', 'mibc', 'inulin', 'trastuzumab', 'aurora', 'trypsin',
    'cubes', 'pampa', 'actomyosin', 'bevacizumab', 'avastin', 'cd3(+)', 'collagens', 'n17', 'agcg', 'cd2+', 'cd4+',
    'cd2', 'horizon', 'denosumab', 'fgf2', 'campaign', 'authority', 'danshen', 'dinucleotide', 'momentum', 'botox',
    'epoetin', 'adipsin', 'glycerin', 'curcuma', 'alum', 'bile', 'fibroin', 'octreotide', 'pursuit', 'elastin',
    'elastomers', 'shellac', 'dash', 'sp1', 'exotoxin', 'icatibant', 'glucomannan', 'vas1', 'bacteriorhodopsin',
    'saline', 'emotion', 'surpass', 'angiotensin', 'hydroxyethylcellulose', 'rifle', 'chamomile', 'keratin',
    'synacthen', 'lignin', 'd250', 'carrageenin', 'lama', 'maba', 'pink', 'lady', 'fenugreek', 'mannan', 'mustard',
    'cellulase', 'cornstarch', 'thioredoxin', 'persian', 'cultivate', 'scot', 'agglutinin', 'ta98', 'rock', 'pc12',
    'metric', 'ferredoxin', 'adrenodoxin', 'glycopeptide', 'benchmark', 'aopp', 'fractal', 'cd3ε', 'perk', 'dihydro',
    'relaxant', 'clin', 'hemocyanin', 'gleevec', 'ltb4', 'pla2', 'arsenal', 'lignocellulose', 'pat4', 'chitin',
    'resovist', 'hypo', 'coral', 'supreme', 'sv2', 'methylcellulose', 'honey', 'squalamine', 'arabinogalactan', 'mega',
    'steel', 'resilin', 'percolate', 'avicel', 'methemoglobin', 'methb', 'lrp1', 'lats', 'opium', 'cd68', 'valiant',
    'hydroxypropylmethylcellulose', 'pseudo', 'carbomer', 'gallery', 'silicone', 'atpγs', 'pc1', 'murabutide', 'dnase',
    '(gaba)ergic', 'hmqc', 'amitraz', 'indigo', 'gtpγs', 'thioredoxins', 'exenatide', 'hairy', 'sunshine', 'star',
    'spme', 'maltodextrin', 's100', 'vicilin', 'ascophyllum', '[h2o2]', 'xanthan', 'samp', 'calcitonin', 'ubr2',
    'aversion', 'reconcile', 'vortex', 'triangle', 'homogentisate', 'acacia', 'dragon', 'recruit', 'hemozoin',
    'barrels', 'preview', 'conserve', 'trypsinogen', 'ricin', 'melanin', 'sepharose', 'noxa', 'ω', 'xanthium', 'trails',
    'revolution', 'angiotensinogen', 'igaba', 'pullulan', 'lmwh', 'enoxaparin', 'fenton', 'meta', 'active carbon',
    'alamethicin', 'bionic', 'dynorphin', 'anterior pituitary hormone', 'gonadotropin releasing hormone',
    'follicle-stimulating hormone', 'adrenocorticotrophic hormone', 'luteinizing hormone', 'luteinising hormone',
    'parathyroid hormone', 'anterior pituitary hormone', 'gonadotropin releasing hormone',
    'adrenocorticotropic hormone', 'thyroid stimulating hormone', 'corticotrophin-releasing hormone',
    'antidiuretic hormone', 'titan', 'anion', 'ion', 'counter-anion', 'counter-ion', 'polypeptide', 'scopolamine',
    'stainless steel', 'danshen', 'cholera toxin', 'thymosin β4', 'sesame oil', 'blood sugar', 'liposomal doxorubicin',
    'fusarium toxin', 'chondroitin sulfate', 'silybum marianum', 'milk thistle', 'amorphous silica',
    'dna double strand', 'cadherin 11', 'conjugated linoleic acid', 'reduced hemoglobin', 'citrus pectin',
    'double stranded dna', 'a chlorophyll', 'flaxseed oil', 'linseed oil', 'peppermint oil', 'clathrin heavy chain',
    'avicel ph101', 'insulin glargine', 'mustard oil', 'chondroitin sulphate', 'provitamin a', 'white light',
    'tea polyphenol', 'dermatan sulfate', 'water vapor', 'clove oil', 'heparan sulfate', 'neurokinin a',
    'polystyrene latex', 'schisandra chinensis', 'gum arabic', 'linseed oils', 'hydroxypropyl methylcellulose',
    'part 2', 'partially hydrolyzed polyacrylamide', 'oil red', 'amorphous carbon', 'microcrystalline cellulose',
    'castor oil', 'peptide e', 'darbepoetin alfa', 'epoetin beta', 'epoetin alfa', 'factor iia', 'natural rubber',
    'xanthan gum', 'aromatic amine', 'uranyl nitrate', 'conjugated estrogens', 'shiga toxin', 'wheat starch',
    'psychogenic', 'glucagon', 'c-peptide', 'zymosan', 'vertex', 'turpentine', 'turmeric', 'c-reactive protein',
    'caramel', 'corn starch', 'contest', 'cont', 'double-stranded dna', 'emerald', 'eminent', 'endurance', 'benet',
    'doyle', 'adept', 'alliance', 'spiegel', 'dial', 'dorm', 'elon', 'vasal', 'orion', 'wander', 'synthol', 'dibs',
    'bide', 'arena', 'musk', 'warf', 'alpen', 'happy', 'baron', 'quench', 'accent', 'zest', 'orphan', 'ural', 'snip',
    'bridal', 'arrow', 'essex', 'dwell', 'diana', 'tops', 'slam', 'soda', 'poly', 'polygon', 'crest', 'minus', 'legend',
    'atrium', 'tara', 'tunic', 'trim', 'citizen', 'brace', 'mighty', 'strike', 'triumph', 'avenge', 'magnum', 'salute',
    'lance', 'cutlass', 'lancer', 'vanquish', 'summit', 'edge', 'ravage', 'tough', 'clout', 'versed', 'banner', 'orbit',
    'radar', 'tilt', 'epic', 'turbo', 'austin', 'raiser', 'racer', 'vengeance', 'commando', 'morph', 'raid', 'joker',
    'cannon', 'command', 'patrol', 'snap', 'halt', 'procure', 'octave', 'tackle', 'stanza', 'verdict', 'empire',
    'peon', 'beret', 'vigil', 'assure', 'sprinkle', 'clipper', 'parlay', 'finale', 'ignite', 'liberty', 'chopper',
    'sceptre', 'pivot', 'assert', 'dagger', 'stalker', 'touchdown', 'brigade', 'spotless', 'facet', 'punch', 'sanction',
    'daren', 'gallant', 'cinch', 'grasp', 'splendor', 'dorado', 'rally', 'storm', 'icon', 'karate', 'sirius', 'alto',
    'centurion', 'prism', 'battalion', 'manage', 'cameo', 'volley', 'elite', 'canopy', 'shogun', 'finish', 'mandate',
    'visor', 'dividend', 'titus', 'limber', 'safari', 'vishnu', 'sultan', 'melody', 'compass', 'flint', 'prosper',
    'jumbo', 'marksman', 'headline', 'olympus', 'granite', 'allay', 'redskin', 'clay', 'butter', 'belt', 'apolar',
    'retard', 'pensive', 'probate', 'tranquil', 'match', 'android', 'paraffin', 'vaseline', 'goon', 'mist', 'peace',
    'scuffle', 'surfer', 'whack', 'rogue', 'senna', 'restful', 'tabloid', 'ecstasy', 'kelp', 'redux', 'leader',
    'pledge', 'squad', 'sentry', 'cola', 'flake', 'flex', 'bazooka', 'bernice', 'blizzard', 'blow', 'bump', 'candy',
    'carrie', 'caviar', 'cecil', 'charlie', 'coca', 'cola', 'coke', 'heaven', 'hell', 'kokan', 'snort', 'toke',
    'toot', 'snow', 'crack', 'emblem', 'fore', 'oliver', 'pride', 'brake', 'sonar', 'talon', 'maya', 'sophia', 'millie',
    'estate', 'blazer', 'latex', 'tomahawk', 'embark', 'apron', 'subdue', 'saffron', 'prowl', 'stomp', 'diane',
    'redeem', 'confront', 'comfort', 'artist', 'mirage', 'brass', 'expand', 'escort', 'curb', 'recoil', 'anna',
    'bishop', 'sword', 'sonata', 'smear', 'staple', 'tell', 'merlin', 'whip', 'amen', 'apex', 'beast', 'yellows',
    'smack', 'crap', 'harry', 'junk', 'rufus', 'stuff', 'rival', 'vermin', 'snip', 'monarch', 'flair', 'paladin',
    'rugby', 'fortress', 'zodiac', 'stipend', 'equity', 'terminator', 'tara', 'jolt', 'sniper', 'rampart', 'amaze',
    'briton', 'tsar', 'regent', 'admire', 'merit', 'calypso', 'chess', 'endeavour', 'calibre', 'aero', 'citation',
    'combat', 'scourge', 'enforcer', 'hydro', 'tindal', 'applaud', 'adage', 'marshal', 'marshall', 'posse', 'stim',
    'relax', 'quark', 'joust', 'crank', 'dolly', 'rotate', 'tattoo', 'crunch', 'glut', 'pirate', 'sherpa', 'pylon',
    'ripcord', 'ammo', 'barricade', 'demon', 'colt', 'drago', 'fury', 'mustang', 'concord', 'ambush', 'corsair',
    'dragnet', 'kestrel', 'outflank', 'picket', 'pounce', 'kudos', 'lasso', 'scout', 'baroque', 'aria', 'borneo',
    'dart', 'dreamer', 'hocus', 'pegasus', 'morpho', 'polo', 'anon', 'henna', 'lawson', 'malo', 'bloom', 'carol',
    'moment', 'tonal', 'duet', 'solo', 'commodore', 'matador', 'grenade', 'saber', 'crackdown', 'suspend', 'herald',
    'tame', 'smash', 'barrage', 'miracle', 'crossbow', 'vacate', '∑pcbs', 'anti-stress', 'aqua',
    'activated carbon', 'bantu', 'barstar', 'barnase', 'barnase-barstar', 'bishop-kirtman', 'bromelain',
    'bromelia', 'calcitonin', 'carob', 'cocoa butter', 'complement proteins', 'metal-oxide', 'complement proteins',
    'waters', 'aldrich', 'wang', 'xylan', 'transfer rna', 'tough', 'torpedo', 'saccharum', 'saccharina', 'ifn-γ',
    'igf-1', 'starches', 'btx-a', 'bont/a', 'exendin-4', 'snap-25', 'liraglutide', 'tsst-1', 'neuropeptide y',
    'hyaluronic acid', 'il-11', 'phycocyanin', 'gm-csf', 'papp-a', 'moesin', 'endothelin-1', 'adma', 'dm-10', 'ghrp-2',
    'dnase-i', 'phaseolin', 'e-selectin', 'iκb-α', 'protein hydrolysate', 'interferon-γ', 'omalizumab',
    'dextran sulfate sodium', 'avicel cl611', 'diurnal', 'phosphor', 'chorionic gonadotropin', 'orphanin fq', 'ccl3',
    'potato starch', 'protide', 'm1-glucuronide', 'bacp-2', 'glucophage', 'fly ash', 'galanin', '1,3-dpma', 'gst-p(+)',
    'angiotensin i', 'ndp-α-msh', '1,5-dpma', 'nociceptin', 'lipoteichoic acid', 'ifn-gamma', 'actinin-4',
    'margarine', 'cyclones', 'motilin', 'metallothioneins', 'h3n2', 'ip-10', 'chondroitin', 'concanavalin a',
    'p-selectin', 'se-selectin', 'natalizumab', 'kollidon', 'mpo-anca', 'gypsum fibrosum', 'α-msh', 'k-12', 'gana',
    'af-2', 'brs-3-ap', 't-pa', 'rgd peptide', 'ccl3(-/-)', 'α-lactalbumin', 'e-ssa', 'ifn-β', 'ziconotide', 'inas',
    'heat pre', 'an-152', 'chorionic gonadotrophin', 'radixin', 'protanal', 'bloc', 'mops', 'guardian', 'orange',
    'pser-stat3', 'molybdate', 'galsulfase', 'reticulin', 'pyrethrum', 'nociceptin', 'growth hormone releasing hormone',
    'nor-1', 'protamine', 'lipid a', 'ribonucleic acid', 'hirudin', 'c-15', 'sephadex lh-20', 'pro-opiomelanocortin',
    'nida', 'tenax', 'cochineal', 'b-dna', 'arac', 'poly(a)-poly(t)', 'tm-74', 'factor v', 'e3330', 'flonase',
    'defibrotide', 'salix', 'interferon-gamma', 'desmethyl-olanzapine', 'pc-12', 'dpma', 'cope-bd', 'textile', 'ddds',
    'tace', 'lignins', 'gramicidin a', 'il10', 'substance p', 'poloxamer', '12mg', 'flue gas', 'isomaltosaccharide',
    'vinca', 'actinin-4', 'il12', 'siamycin', 'naglazyme', 'bnp-32', 'tea catechin', 'gastrin', 'fp-2', 'somatostatine',
    'dot-silica', 'alpha-t', 'fetal hemoglobin', 'dextrin', 'mica', '5\'-amp', 't-47', 'vinculin', 'lsopc', '3ps',
    '3 ps', 'lead ion', 'french green', 'mg-1', 'rosin', 'rice starch', 'nexus', 'factor vii', 'eculizumab', 'methocel',
    'hydrolyzed polyacrylamide', 'glide', 'hyalgan', 'synvisc', 'octanol-air', 'furfural-water', '90th', 'poly(i:c)',
    'gsno', 'mtcc', 't140', 'sephadex g-75', 'valosin-containing protein', 'deoxyribonucleic acid', 'jasmonate',
    'peony', 'menopur', 'daclizumab', 'gluten proteins', 'the-7', 'microdots', 'b(+)', 'β-endorphin', 'amberlite',
    'supra', 'β-nf', 'optimizer', 'orbit', 'spirit', 'rhombic', 'green tea leaves', 'alum', 'a', 'about', 'again',
    'all', 'almost', 'also', 'although', 'always', 'among', 'an', 'and', 'another', 'any', 'are', 'as', 'at', 'be',
    'because', 'been', 'before', 'being', 'between', 'both', 'but', 'by', 'can', 'could', 'did', 'do', 'does', 'done',
    'due', 'during', 'each', 'either', 'enough', 'especially', 'etc', 'for', 'found', 'from', 'further', 'had', 'has',
    'have', 'having', 'here', 'how', 'however', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself', 'just', 'kg',
    'km', 'made', 'mainly', 'make', 'may', 'mg', 'might', 'ml', 'mm', 'most', 'mostly', 'must', 'nearly', 'neither',
    'no', 'nor', 'obtained', 'of', 'often', 'on', 'our', 'overall', 'perhaps', 'pmid', 'quite', 'rather', 'really',
    'regarding', 'seem', 'seen', 'several', 'should', 'show', 'showed', 'shown', 'shows', 'significantly', 'since',
    'so', 'some', 'such', 'than', 'that', 'the', 'their', 'theirs', 'them', 'then', 'there', 'therefore', 'these',
    'they', 'this', 'those', 'through', 'thus', 'to', 'upon', 'use', 'used', 'using', 'various', 'very', 'was', 'we',
    'were', 'what', 'when', 'which', 'while', 'with', 'within', 'without', 'would', 'alcoholic', 'streptavidin',
    'urea nitrogen', 'molten', 'fructose corn syrup', 'human serum albumin', 'Ω127', 'vitamin', 'vitamins',
    'test mixture', 'prothrombinase', 'terpolymer', 'methods', 'section', 'reactivity', 'safety', 'method',
    'experimental', 'discussion', 'synthesis', 'experimental procedures', 'general experimental', 'introduction',
    'results', 'reactions', 'stability', 'multiple', 'crystallography', 'compound', 'syntheses', 'crystal',
    'conclusion', 'references', 'charged', 'nucleotide', 'candidate molecules', 'heteroatoms', 'solvent', 'preparation',
    'formulation', 'omega', 'mol', 'res', 'log in', 'heterocyclic', 'organometallic', 'organometallics', 'nucleobase',
    'organometalloidal', 'acs nano', 'acs omega', 'acs mobile', 'mobile site', 'new titles', 'x close', 'press',

    # Removed: 'gaba', 'gaba(a)', 'alcohol', 'oxide', 'ccl4', 'sugar',
}

# Implementation notes (not part of the generated API docs):
# - Every pattern is a raw string so that regex escapes such as \d, \s and \. reach the regex
#   engine untouched instead of relying on Python's deprecated handling of unknown string escapes.
#   The resulting pattern text is identical to the previous non-raw spelling.
# - The patterns are written against lowercased text (see the description below). The patent
#   pattern therefore accepts the kind code in either case ([abAB]); with only [AB] a lowercased
#   mention such as 'us 2010 0123 a1' could never match.
# - NOTE(review): the first "phone numbers" pattern, ^\+?\d[ \d-]$, only matches a digit followed
#   by exactly one more character; a quantifier after the character class may be missing. Left
#   unchanged because widening it would discard additional mentions -- confirm intent.
#: Regular expressions that define disallowed chemical entity mentions. Note: the entity text is passed as lowercase.
STOP_RES = [
    r'^(http|ftp)://',  # URL
    r'\.(com|uk|eu|org|net)$',  # URL
    r'^\d{4}-\d{3}[\dx]$',  # ISSN
    r'^[\w\-\.\+%]{4,} @ \w[\w\-\.]+\.(com?|edu|gov|ac)(\.[\w\-\.]+)?$',  # email
    r'^[\d,:\- ]*\d{4,}[\d,:\- ]*$',  # numbers
    r'\d{3,} , \d{3,}',  # numbers
    r'(\d\d+\.\d+|\d\.\d\d+)',  # numbers
    r'\d and \d',  # numbers
    r'^(\[\d+\]\s*)+$',  # numbers
    r'^\d+$',  # numbers
    r'= \d',  # numbers
    r'^\+?\d[ \d-]$',  # phone numbers
    r'cm-1',  # units
    r'^(compound|ligand|chemical|dye|derivative|complex|example|intermediate|product|formulae?)s? [a-z\d]{1,3}',  # labels
    r'(b3lyp|31g\(d,p\)|td-dft)',
    r'et al\.?$',
    r'^(ep|wo|us)\s*\d\s*\d\d[\d\s]*([abAB]\d)?($|\s*and)',  # patent numbers (kind code matched in either case)
    r'^(pre|post)-\d\d\d\d',  # common mistake
    r'\d ml$',  # properties
    r'\.(png|gif|jpg|txt|html|docx?|xlsx?)$',  # File extensions
    r'^(tel|fax)\s*:?\s*\+?\s*\d',  # phone numbers
]

# Implementation notes (not part of the generated API docs):
# - Patterns are raw strings so that regex escapes such as \( and \d are passed to the regex engine
#   verbatim rather than relying on Python's deprecated handling of unknown string escapes.
# - 'phosph?ate' accepts the correct spelling 'phosphate' as well as the historical misspelling
#   'phospate' that was the only form matched before.
# - The element-symbol alternation previously listed 'Zn' twice; the redundant alternative is dropped.
# - Every pattern is anchored with ^...$, so each one matches a whole word rather than a substring.
#: Regular expressions defining collections of words that should be split if joined by hyphens or -to-
SPLITS = [
    r'^(actinium|aluminium|aluminum|americium|antimony|argon|arsenic|astatine|barium|berkelium|beryllium|bismuth|bohrium|boron|bromine|cadmium|caesium|calcium|californium|carbon|cerium|cesium|chlorine|chromium|cobalt|copernicium|copper|curium|darmstadtium|dubnium|dysprosium|einsteinium|erbium|europium|fermium|flerovium|fluorine|francium|gadolinium|gallium|germanium|gold|hafnium|hassium|helium|holmium|hydrargyrum|hydrogen|indium|iodine|iridium|iron|kalium|krypton|lanthanum|lawrencium|lead|lithium|livermorium|lutetium|magnesium|manganese|meitnerium|mendelevium|mercury|molybdenum|natrium|neodymium|neon|neptunium|nickel|niobium|nitrogen|nobelium|osmium|oxygen|palladium|phosphorus|platinum|plumbum|plutonium|polonium|potassium|praseodymium|promethium|protactinium|radium|radon|rhenium|rhodium|roentgenium|rubidium|ruthenium|rutherfordium|samarium|scandium|seaborgium|selenium|silicon|silver|sodium|stannum|stibium|strontium|sulfur|tantalum|technetium|tellurium|terbium|thallium|thorium|thulium|tin|titanium|tungsten|ununoctium|ununpentium|ununseptium|ununtrium|uranium|vanadium|wolfram|xenon|ytterbium|yttrium|zinc|zirconium)$',
    r'^(Ag|Al|Ar|Au|Br|Cd|Cl|Co|Cu|Fe|Gd|Ge|Hg|Kr|Li|Mg|Na|Ne|Ni|Pb|Pd|Pt|Ru|Sb|Si|Sn|Ti|Xe|Zn|Zr)$',
    r'^(iodide|triiodide|nitrite|nitrate)$',
    r'^(graphane|graphene|carbon|silica|glucose)$',
    r'^(sugar|phosph?ate)$',
    r'^(azide|alkyne|alkene|alkane)$',
    r'^(arginine|cysteine|glycine|aspartic acid|glutamate|dopamine|serotonin|acetone|methanol|ethanol|EtOH|MeOH|AcOEt|melatonin|leucine|alanine|histidine|isoleucine|lysine|threonine|tryptophan|nicotine|gentamicin|ATP|FITC|biotin|tamoxifen|catechin|asparagine)$',
    r'^(Ala|Arg|Asn|Asp|Cys|Glu|Gln|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val)(?:\(?\d+\)?)?$',
    r'^(\(?1\)?H|\(?1[45]\)?N|\(?1[234]\)?C|\(?19\)?F)$',
    r'^(F|Cl|Zn[OS]|H\(?2\)?O(\(?2\)?)?|Ni\(OH\)\(?2\)?|(NiF|SnO|TiO|NO)\(?2\)?|(Al|Y|Fe)\(?2\)?O\(?3\)?|CaCO\(?3\)?)$',
    r'^(ester|amide)$'
]

# Special case boundary adjustments (only used for cems output)
#
# Each entry is a regular expression; the capture groups mark the part(s) of the matched text
# that the inline comments describe as kept or split out.
# Patterns are raw strings so that regex escapes such as \w, \( and \s are passed to the regex
# engine verbatim rather than relying on Python's deprecated handling of unknown string escapes;
# the pattern text itself is unchanged.
# NOTE(review): in '^(.+)[²³]?⁺$' the greedy (.+) also swallows a preceding ² or ³, so the optional
# character class never consumes it and the captured group keeps the superscript digit. Left
# unchanged -- confirm whether a lazy (.+?) was intended.
SPECIALS = [
    r'(?:^|-)([CONS])-\w+ases?$',
    r'(?:^|-)(S)-(sulfonates?)$',
    r'^(GABA)-(benzodiazepine|A)$',
    r'^(ZnO|Au|Ag)-NPs?$',  # Nanoparticles
    r'^(UDP)-.+ase$',  # UDP-enzyme
    r'^(UDP)-(.+)$',  # UDP-other
    r'^(N|S)-(?:acetyl|nitros|hydroxyl)(?:ation|ated)$',
    r'^-(NO2|CH3|F|Cl|Br|OH)$',  # Remove leading dash
    r'^(.+)[²³]?⁺$',  # Remove trailing superscript plus \u207a
    r'^δ(\([^\(]+\).+)$',  # Remove leading δ \u03b4 but keep opening bracket if the closing bracket is within name
    r'^δ\((.+)\)?$',  # Otherwise remove leading δ \u03b4 and opening bracket
    r'^(Ala|Arg|Asn|Asp|Cys|Glu|Gln|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val)-?\(?\d+\)?$',
    r'^(\w{4,})(?:\)?-to-\(?|\+)(\w{4,})$',
    r'^(.+)(?:-| )(?:linker|activated|gated|mediated|containing|doped|labeled|coated|enriched|catalyzed|modified|···)(?:-| )(.+)$',
    r'^(.+)\s?···\s?(.+)$',
    r'^(.+[A-Z])\+([A-Z].+)$',  # Split on plus surrounded by uppercase alpha
    r'^([^\(\)]+\w)\+(\w[^\(\)]+)$',  # Split on plus surrounded by any letter or number provided no brackets
    r'^(.+\(\d+[A-Za-z]*\)) and (.+)$',  # Split on bracketed alphanumeric label followed by and
    r'^(.+)\.$',  # Trim off final punctuation

    #'^((?:.* )acid)-(.+)$',
    # TODO: Slash-separated names? ', ' separated? ' and ' separated? Probably have a min length limit
]


class _CompatibilityToken(object):
    """
    A wrapper around tokens to ensure backwards compatibility when using RichTokens.

    Older taggers utilise indices within tokens to mean special things. 0 is the text, while
    1 refers to the POS tag *or* the combined NER and POS tag depending on the stage of tagging.

    :class:`~chemdataextractor.doc.text.RichToken` maintain backwards compatibility at a high level
    by using 1 to refer to the combined NER and POS tag and automatically calculate this when
    any parser requires this. This allows any older parsing code which also rely on this indexing
    behaviour to remain unchanged, but causes problems with the older NER taggers as they end up in
    infinite recursion when trying to get index 1, which these taggers expect to contain the POS tag,
    but :class:`~chemdataextractor.doc.text.RichToken` takes to mean the combined POS and NER tag,
    calling the NER tagger's tagging routine again.

    This object exists to get around this. Either way that the user tries to access the results in
    this class, whether it be via calling token["pos_tag"] or token[1], it should return the expected
    result.
    """

    def __init__(self, text, pos_tag, prefetched_tags=None):
        self.pos_tag = pos_tag
        self.text = text
        if prefetched_tags is None:
            prefetched_tags = {}
        self.prefetched_tags = prefetched_tags

    def __getitem__(self, key):
        if key == 0:
            return self.text
        elif key == 1 or key == POS_TAG_TYPE:
            return self.pos_tag
        elif key in self.prefetched_tags:
            return self.prefetched_tags[key]
        raise IndexError("Index out of range")


class CiDictCemTagger(DictionaryTagger):
    """Case-insensitive CEM dictionary tagger."""

    # Tag type this tagger reports its output under.
    tag_type = NER_TAG_TYPE
    # Lexicon instance created once at class-definition time.
    lexicon = ChemLexicon()
    # Pickled dictionary model; presumably loaded by DictionaryTagger - verify against the base class.
    model = 'models/cem_dict-1.0.pickle'
class CsDictCemTagger(DictionaryTagger):
    """Case-sensitive CEM dictionary tagger."""

    # Tag type this tagger reports its output under.
    tag_type = NER_TAG_TYPE
    # Lexicon instance created once at class-definition time.
    lexicon = ChemLexicon()
    # Pickled dictionary model; presumably loaded by DictionaryTagger - verify against the base class.
    model = 'models/cem_dict_cs-1.0.pickle'
    # Overrides the base-class setting so that matching is case-sensitive.
    case_sensitive = True
class CrfCemTagger(CrfTagger):
    """CRF-based chemical entity mention tagger.

    Builds one list of string features per token (from the lexicon entry of the token, its POS
    tag and a window of two tokens either side) and passes them to the underlying CRF tagger.
    """

    tag_type = NER_TAG_TYPE
    model = 'models/cem_crf_chemdner_cemp-1.0.pickle'
    lexicon = ChemLexicon()
    # When true, lexicon cluster prefixes are added as features.
    clusters = True

    params = {
        'c1': 1.0,  # Coefficient for L1 regularization (OWL-QN). Default 0.
        'c2': 0.001,  # Coefficient for L2 regularization. Default 1.
        'max_iterations': 200,  # The maximum number of iterations for L-BFGS optimization. Default INT_MAX.
        'feature.possible_transitions': False,  # Force to generate all possible transition features. Default False.
        'feature.possible_states': False,  # Force to generate all possible state features. Default False.
    }

    def legacy_tag(self, tokens):
        """
        ``legacy_tag`` corresponds to the ``tag`` method in ChemDataExtractor 2.0 and earlier.
        This has been renamed ``legacy_tag`` due to its complexity in that it could be called
        with either a list of strings or a list of (token, PoS tag) pairs.
        This made it incompatible with the new taggers in their current form. ChemDataExtractor 2.1
        will call this method with a list of strings instead of a list of (token, PoS tag) pairs.
        This should only be used for converting previously written taggers with as few code changes
        as possible, as shown in the :ref:`migration guide<migration_guide_2_1>`.

        :param list(obj) tokens: Tokens to tag
        :returns: A list of ``(token, tag)`` pairs, one per input token, where ``token`` is the
            corresponding element of ``tokens`` exactly as it was passed in.
        :rtype: list((obj, obj))
        """
        if len(tokens) > 0 and isinstance(tokens[0], tuple):
            # (text, PoS tag) pairs: wrap them so that ``.text`` and ``[POS_TAG_TYPE]`` both work.
            pseudo_rich_tokens = [_CompatibilityToken(token[0], token[1]) for token in tokens]
        else:
            # Anything else is handed to ``tag`` untouched.
            pseudo_rich_tokens = tokens
        labelled = self.tag(pseudo_rich_tokens)
        # Pair each predicted tag with the caller's original token rather than the wrapper.
        return [(tokens[index], element[1]) for index, element in enumerate(labelled)]

    def tag(self, tokens):
        """Tag ``tokens`` with the CRF model and return a list of ``(token, tag)`` pairs.

        The model is loaded on first use.

        :param list(obj) tokens: Tokens exposing ``.text`` and ``[POS_TAG_TYPE]``.
        :returns: ``list(zip(tokens, tags))``
        """
        if not self._loaded_model:
            self.load(self.model)
        features = [self._get_features(tokens, i) for i in range(len(tokens))]
        tags = self._tagger.tag(features)
        return list(zip(tokens, tags))

    def _cluster_features(self, prefix, word):
        """Return the cluster-prefix features of the lexicon entry ``word``, labelled with ``prefix``.

        The list is empty when cluster features are switched off or the entry has no cluster.
        """
        if not (self.clusters and word.cluster):
            return []
        return ['%s.cluster%d=%s' % (prefix, size, word.cluster[:size]) for size in (4, 6, 10, 20)]

    def _neighbour_features(self, token, prefix, suffix_feature=None):
        """Return the context features contributed by a neighbouring ``token``.

        :param token: The neighbouring token.
        :param str prefix: Feature-name prefix identifying the neighbour (e.g. ``'p1'``).
        :param suffix_feature: Optional ``%s`` template for a 3-character suffix feature; it is
            only emitted when the neighbour is not number-like, punctuation or URL-like.
        """
        text = token.text
        pos_tag = token[POS_TAG_TYPE]
        word = self.lexicon[text]
        features = [
            '%s.lower=%s' % (prefix, word.lower),
            '%s.shape=%s' % (prefix, word.shape),
            '%s.tag=%s' % (prefix, pos_tag),
        ]
        if suffix_feature is not None and not (word.like_number or word.is_punct or word.like_url):
            features.append(suffix_feature % word.lower[-3:])
        features.extend(self._cluster_features(prefix, word))
        return features

    def _get_features(self, tokens, i):
        """Return the list of feature strings describing ``tokens[i]`` in its context.

        Feature names and their order are left exactly as they were, since a previously trained
        model would be keyed on these strings.

        :param list(obj) tokens: The full token sequence.
        :param int i: Index of the token to describe.
        :returns: list of feature strings
        """
        token = tokens[i].text
        tag = tokens[i][POS_TAG_TYPE]
        w = self.lexicon[token]
        features = [
            'w.shape=%s' % w.shape,
            'w.normalized=%s' % w.normalized,
            'w.lower=%s' % w.lower,
            'w.length=%s' % w.length,
            'w.digit_count=%s' % w.digit_count,
            'w.upper_count=%s' % w.upper_count,
            'w.lower_count=%s' % w.lower_count,
            'w.tag=%s' % tag,
        ]
        if w.like_number:
            features.append('w.like_number')
        elif w.is_punct:
            features.append('w.is_punct')
        elif w.like_url:
            features.append('w.like_url')
        else:
            # Affix and casing features are only produced for ordinary word-like tokens.
            features.extend('w.suffix%d=%s' % (n, w.lower[-n:]) for n in range(1, 6))
            features.extend('w.prefix%d=%s' % (n, w.lower[:n]) for n in range(1, 6))
            if w.is_alpha:
                features.append('w.is_alpha')
            elif w.is_hyphenated:
                features.append('w.is_hyphenated')
            if w.is_upper:
                features.append('w.is_upper')
            elif w.is_lower:
                features.append('w.is_lower')
            elif w.is_title:
                features.append('w.is_title')
        features.extend(self._cluster_features('w', w))

        # Add features for previous tokens if present
        if i > 0:
            # The previous-token suffix feature is spelled with a colon ('p1:suffix3'), unlike the
            # next-token one ('n1.suffix3'); the spelling is deliberately left untouched.
            features.extend(self._neighbour_features(tokens[i - 1], 'p1', 'p1:suffix3=%s'))
            if i > 1:
                features.extend(self._neighbour_features(tokens[i - 2], 'p2'))

        # Add features for next tokens if present
        end = len(tokens) - 1
        if i < end:
            features.extend(self._neighbour_features(tokens[i + 1], 'n1', 'n1.suffix3=%s'))
            if i < end - 1:
                features.extend(self._neighbour_features(tokens[i + 2], 'n2'))

        # Position-in-sentence marker; at most one is added, with the earlier checks taking priority.
        if i == 0:
            features.append('-firsttoken-')
        elif i == 1:
            features.append('-secondtoken-')
        elif i == end - 1:
            features.append('-secondlasttoken-')
        elif i == end:
            features.append('-lasttoken-')
        return features
class LegacyCemTagger(EnsembleTagger):
    """Return the combined output of a number of chemical entity taggers."""

    # NOTE(review): this class sets ``label_type`` while the methods below compare against
    # ``self.tag_type``; presumably ``tag_type`` is provided by EnsembleTagger - confirm.
    label_type = NER_TAG_TYPE
    #: The individual chemical entity taggers to use.
    taggers = [CrfCemTagger(), CiDictCemTagger(), CsDictCemTagger()]
    lexicon = ChemLexicon()

    def _in_stoplist(self, entity):
        """Return True if the entity is in the stoplist, False otherwise.

        The first matching entry of ``IGNORE_PREFIX`` and of ``IGNORE_SUFFIX`` is trimmed off
        before the remainder is checked against ``STOPLIST`` and the ``STOP_RES`` patterns.

        :param str entity: Candidate entity text.
        :rtype: bool
        """
        start = 0
        end = len(entity)
        # Adjust boundaries to exclude disallowed prefixes/suffixes
        for prefix in IGNORE_PREFIX:
            if entity.startswith(prefix):
                start += len(prefix)
                break
        for suffix in IGNORE_SUFFIX:
            if entity.endswith(suffix):
                end -= len(suffix)
                break
        # Return True if entity has been reduced to nothing by adjusting boundaries
        if start >= end:
            return True
        # Return True if adjusted entity is in the literal stoplist
        entity = entity[start:end]
        if entity in STOPLIST:
            return True
        for stop_re in STOP_RES:
            if re.search(stop_re, entity):
                log.debug('Killed: %s', entity)
                return True
        # Explicit negative result rather than falling off the end and returning None.
        return False

    def legacy_tag(self, tokens):
        """
        ``legacy_tag`` corresponds to the ``tag`` method in ChemDataExtractor 2.0 and earlier.
        This has been renamed ``legacy_tag`` due to its complexity in that it could be called
        with either a list of strings or a list of (token, PoS tag) pairs.
        This made it incompatible with the new taggers in their current form. ChemDataExtractor 2.1
        will call this method with a list of strings instead of a list of (token, PoS tag) pairs.
        This should only be used for converting previously written taggers with as few code changes
        as possible, as shown in the :ref:`migration guide<migration_guide_2_1>`.

        Each element of ``tokens`` is indexed with ``[0]`` (text) and ``[1]`` (PoS tag) here.

        :param list(obj) tokens: Tokens to tag
        :returns: A list of ``(token, tag)`` pairs, one per input token, where ``token`` is the
            corresponding element of ``tokens`` exactly as it was passed in.
        :rtype: list((obj, obj))
        """
        # Run every sub-tagger whose tag type differs from this tagger's own, keeping only the tags.
        all_prefetched_tags = {}
        for tag_type in self.taggers_dict:
            if tag_type != self.tag_type:
                tagger = self.taggers_dict[tag_type]
                all_prefetched_tags[tag_type] = [el[1] for el in tagger.legacy_tag(tokens)]

        # Wrap each token together with the tags already computed for it, so that ``tag`` can read
        # them by tag type.
        pseudo_rich_tokens = []
        for index, token in enumerate(tokens):
            prefetched_tags = {key: value[index] for key, value in all_prefetched_tags.items()}
            pseudo_rich_tokens.append(_CompatibilityToken(token[0], token[1], prefetched_tags))

        tagged = self.tag(pseudo_rich_tokens)
        # Pair each combined tag with the caller's original token rather than the wrapper.
        return [(tokens[index], element[1]) for index, element in enumerate(tagged)]

    def tag(self, tokens):
        """Run individual chemical entity mention taggers and return union of matches, with some postprocessing.

        :param list(obj) tokens: Tokens exposing ``.text`` and item access by tag type.
        :returns: A list of ``(token, tag)`` pairs where ``tag`` is ``'B-CM'``, ``'I-CM'`` or ``None``.
        """
        # Combine output from individual taggers
        tags = [None] * len(tokens)
        for tag_type in self.taggers_dict:
            if tag_type == self.tag_type:
                continue
            tagger_tags = [token[tag_type] for token in tokens]
            for i, newtag in enumerate(tagger_tags):
                # An I-CM is only accepted when the same tagger put B-CM/I-CM on the previous token.
                if newtag == 'I-CM' and not (i == 0 or tagger_tags[i - 1] not in {'B-CM', 'I-CM'}):
                    tags[i] = 'I-CM'  # Always overwrite I-CM
                elif newtag == 'B-CM' and (tags[i] is None or tags[i] == "O"):
                    tags[i] = 'B-CM'  # Only overwrite B-CM over None

        # Postprocess the combined output
        for i, tag in enumerate(tags):
            lex = self.lexicon[tokens[i].text]
            nexttag = tags[i + 1] if i < len(tags) - 1 else None
            # Trim disallowed first tokens
            if tag == 'B-CM' and lex.lower in STRIP_START:
                tags[i] = None
                if nexttag == 'I-CM':
                    tags[i + 1] = 'B-CM'
            # Trim disallowed final tokens
            if nexttag is None and lex.lower in STRIP_END:
                tags[i] = None

        # Filter certain entities
        for i, tag in enumerate(tags):
            if tag != 'B-CM':
                continue
            # Collect the tokens of the entity starting at i; end_i is one past its last token.
            entity_tokens = [self.lexicon[tokens[i].text].lower]
            end_i = i + 1
            for j, subsequent in enumerate(tags[i + 1:]):
                if subsequent != 'I-CM':
                    break
                end_i += 1
                entity_tokens.append(self.lexicon[tokens[i + j + 1].text].lower)
            # Fix combined '1H NMR' on end
            # TODO: Also 13C, etc.?
            if len(entity_tokens) > 2 and entity_tokens[-1] == 'nmr' and entity_tokens[-2] == '1h':
                tags[end_i - 2] = 'B-CM'
                tags[end_i - 1] = None
                entity_tokens = entity_tokens[:-2]
                # end_i is deliberately left as it was, so the checks below still span both tokens.
            entity = ' '.join(entity_tokens)
            if any(e in STOP_TOKENS for e in entity_tokens) or self._in_stoplist(entity):
                tags[i:end_i] = [None] * (end_i - i)
            else:
                bl = bracket_level(entity)
                # Try and add on brackets in neighbouring tokens if they form part of the name
                # TODO: Check bracket type matches before adding on
                if bl == 1 and len(tokens) > end_i and bracket_level(tokens[end_i].text) == -1:
                    tags[end_i] = 'I-CM'
                elif bl == -1 and i > 0 and bracket_level(tokens[i - 1].text) == 1:
                    tags[i - 1] = 'B-CM'
                    tags[i] = 'I-CM'
                elif bl != 0:
                    # Filter entities that overall don't have balanced brackets
                    tags[i:end_i] = [None] * (end_i - i)
                else:
                    # Remove bracketed alphanumeric from end
                    if len(entity_tokens) >= 4 and entity_tokens[-1] == ')' and entity_tokens[-3] == '(':
                        # NOTE(review): entity_tokens come from ``.lower``, presumably lower-cased,
                        # while the Roman numeral alternatives are upper-case - confirm intended.
                        if re.match(r'^(\d{1,2}[A-Za-z]?|I|II|III|IV|V|VI|VII|VIII|IX)$', entity_tokens[-2]):
                            log.debug('Removing %s from end of CEM', entity_tokens[-2])
                            tags[end_i - 3:end_i] = [None, None, None]
        tokentags = list(six.moves.zip(tokens, tags))
        return tokentags