Source code for chemdataextractor.nlp.tokenize

# -*- coding: utf-8 -*-
"""
Word and sentence tokenizers.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod
from deprecation import deprecated
import logging
import re

import six

from ..text import bracket_level, GREEK
from ..data import load_model, find_data

from lxml import etree

from tokenizers import BertWordPieceTokenizer

log = logging.getLogger(__name__)


class BaseTokenizer(six.with_metaclass(ABCMeta)):
    """Abstract base class from which all Tokenizer classes inherit.

    Subclasses must implement a ``span_tokenize(text)`` method that returns a list of integer
    offset tuples that identify tokens in the text.
    """
    @deprecated(deprecated_in="2.0", details="Deprecated in favour of looking at the tokens from the Sentence object.")
    def tokenize(self, s):
        """Return a list of token strings from the given sentence.

        :param string s: The sentence string to tokenize.
        :rtype: iter(str)
        """
        return [s[start:end] for start, end in self.span_tokenize(s)]
    @abstractmethod
    def span_tokenize(self, s):
        """Return a list of integer offsets that identify tokens in the given sentence.

        :param string s: The sentence string to tokenize.
        :rtype: iter(tuple(int, int))
        """
        return
def regex_span_tokenize(s, regex):
    """Return spans that identify tokens in s, split using the given regex."""
    left = 0
    for m in re.finditer(regex, s, re.U):
        right, next_left = m.span()
        if right != 0:
            yield left, right
        left = next_left
    yield left, len(s)
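
# Illustrative sketch (not part of the library): regex_span_tokenize yields
# (start, end) offsets for the text between regex matches, plus a final span
# running to the end of the string.
#
#     >>> list(regex_span_tokenize('a bc  d', r'\s+'))
#     [(0, 1), (2, 4), (6, 7)]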
class SentenceTokenizer(BaseTokenizer):
    """Sentence tokenizer that uses the Punkt algorithm by Kiss & Strunk (2006)."""

    model = 'models/punkt_english.pickle'  # This is available from NLTK
    def __init__(self, model=None):
        self.model = model if model is not None else self.model
        self._tokenizer = None
        log.debug('%s: Initializing with %s' % (self.__class__.__name__, self.model))
    def get_sentences(self, text):
        spans = self.span_tokenize(text.text)
        return text._sentences_from_spans(spans)
    def span_tokenize(self, s):
        """Return a list of integer offsets that identify sentences in the given text.

        :param string s: The text to tokenize into sentences.
        :rtype: iter(tuple(int, int))
        """
        if self._tokenizer is None:
            self._tokenizer = load_model(self.model)
        # for debug in tokenizer.debug_decisions(s):
        #     log.debug(format_debug_decision(debug))
        return self._tokenizer.span_tokenize(s)
class ChemSentenceTokenizer(SentenceTokenizer):
    """Sentence tokenizer that uses the Punkt algorithm by Kiss & Strunk (2006), trained on chemistry text."""

    model = 'models/punkt_chem-1.0.pickle'
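
# Illustrative sketch (assumes the Punkt model data has been downloaded, e.g.
# with ``cde data download``; exact splits depend on the trained model):
# span_tokenize yields one (start, end) offset pair per sentence, which
# get_sentences turns into Sentence objects. One would expect something like:
#
#     >>> st = ChemSentenceTokenizer()
#     >>> text = 'The solid was filtered. It was then dried in vacuo.'
#     >>> [text[a:b] for a, b in st.span_tokenize(text)]
#     ['The solid was filtered.', 'It was then dried in vacuo.']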
class WordTokenizer(BaseTokenizer):
    """Standard word tokenizer for generic English text."""

    #: Split before and after these sequences, wherever they occur, unless entire token is one of these sequences
    SPLIT = [
        '----',
        '––––',  # \u2013 en dash
        '————',  # \u2014 em dash
        '<--->',
        '---',
        '–––',  # \u2013 en dash
        '———',  # \u2014 em dash
        '<-->',
        '-->',
        '...',
        '--',
        '––',  # \u2013 en dash
        '——',  # \u2014 em dash
        '``',
        "''",
        '->',
        '<',
        '>',
        '–',  # \u2013 en dash
        '—',  # \u2014 em dash
        '―',  # \u2015 horizontal bar
        '~',  # \u007e Tilde
        '⁓',  # \u2053 Swung dash
        '∼',  # \u223c Tilde operator
        '°',  # \u00b0 Degrees
        ';',
        '@',
        '#',
        '$',
        '£',  # \u00a3
        '€',  # \u20ac
        '%',
        '&',
        '?',
        '!',
        '™',  # \u2122
        '®',  # \u00ae
        '…',  # \u2026
        '⋯',  # \u22ef Mid-line ellipsis
        '†',  # \u2020 Dagger
        '‡',  # \u2021 Double dagger
        '§',  # \u00a7 Section sign
        '¶',  # \u00b6 Pilcrow sign
        '≠',  # \u2260
        '≡',  # \u2261
        '≢',  # \u2262
        '≣',  # \u2263
        '≤',  # \u2264
        '≥',  # \u2265
        '≦',  # \u2266
        '≧',  # \u2267
        '≨',  # \u2268
        '≩',  # \u2269
        '≪',  # \u226a
        '≫',  # \u226b
        '≈',  # \u2248
        '=',
        '÷',  # \u00f7
        '×',  # \u00d7
        '→',  # \u2192
        '⇄',  # \u21c4
        '"',  # \u0022 Quote mark
        '“',  # \u201c
        '”',  # \u201d
        '„',  # \u201e
        '‟',  # \u201f
        '‘',  # \u2018 Left single quote
        # '’',  # \u2019 Right single quote - Regularly used as an apostrophe, so don't always split
        '‚',  # \u201a Single low quote
        '‛',  # \u201b Single reversed quote
        '`',  # \u0060
        '´',  # \u00b4
        # Primes
        '′',  # \u2032
        '″',  # \u2033
        '‴',  # \u2034
        '‵',  # \u2035
        '‶',  # \u2036
        '‷',  # \u2037
        '⁗',  # \u2057
        # Brackets
        '(',
        '[',
        '{',
        '}',
        ']',
        ')',
        # Slashes
        '/',  # \u002f Solidus
        '⁄',  # \u2044 Fraction slash
        '∕',  # \u2215 Division slash
        # Hyphens and Minuses
        # '-',  # \u002d Hyphen-minus
        '−',  # \u2212 Minus
        '‒',  # \u2012 Figure dash
        # '‐',  # \u2010 Hyphen
        # '‑',  # \u2011 Non-breaking hyphen
        '+',  # \u002b Plus
        '±',  # \u00b1 Plus/Minus
    ]

    #: Split around these sequences unless they are followed by a digit
    SPLIT_NO_DIGIT = [':', ',']

    #: Split after these sequences if they start a word
    SPLIT_START_WORD = ["''", "``", "'"]

    #: Split before these sequences if they end a word
    SPLIT_END_WORD = ["'s", "'m", "'d", "'ll", "'re", "'ve", "n't", "''", "'",
                      "’s", "’m", "’d", "’ll", "’re", "’ve", "n’t", "’", "’’"]

    #: Don't split full stop off last token if it is one of these sequences
    NO_SPLIT_STOP = ['...', 'al.', 'Co.', 'Ltd.', 'Pvt.', 'A.D.', 'B.C.', 'B.V.', 'S.D.', 'U.K.', 'U.S.', 'r.t.']

    #: Split these contractions at the specified index
    CONTRACTIONS = [('cannot', 3), ("d'ye", 1), ('d’ye', 1), ('gimme', 3), ('gonna', 3), ('gotta', 3), ('lemme', 3),
                    ("mor'n", 3), ('mor’n', 3), ('wanna', 3), ("'tis", 2), ("'twas", 2)]

    #: Don't split these sequences.
    NO_SPLIT = {'mm-hm', 'mm-mm', 'o-kay', 'uh-huh', 'uh-oh', 'wanna-be'}

    #: Don't split around hyphens with these prefixes
    NO_SPLIT_PREFIX = {
        'e', 'a', 'u', 'x', 'agro', 'ante', 'anti', 'arch', 'be', 'bi', 'bio', 'co', 'counter', 'cross', 'cyber',
        'de', 'eco', 'ex', 'extra', 'inter', 'intra', 'macro', 'mega', 'micro', 'mid', 'mini', 'multi', 'neo',
        'non', 'over', 'pan', 'para', 'peri', 'post', 'pre', 'pro', 'pseudo', 'quasi', 're', 'semi', 'sub',
        'super', 'tri', 'ultra', 'un', 'uni', 'vice'
    }

    #: Don't split around hyphens with these suffixes.
    NO_SPLIT_SUFFIX = {'esque', 'ette', 'fest', 'fold', 'gate', 'itis', 'less', 'most', '-o-torium', 'rama', 'wise'}

    #: Don't split around hyphens if only these characters before or after.
    NO_SPLIT_CHARS = '0123456789,\'"“”„‟‘’‚‛`´′″‴‵‶‷⁗'
    def __init__(self, split_last_stop=True):
        #: Whether to split off the final full stop (unless preceded by NO_SPLIT_STOP). Default True.
        self.split_last_stop = split_last_stop
    def _split_span(self, span, index, length=0):
        """Split a span into two or three separate spans at certain indices."""
        offset = span[1] + index if index < 0 else span[0] + index
        # log.debug([(span[0], offset), (offset, offset + length), (offset + length, span[1])])
        return [(span[0], offset), (offset, offset + length), (offset + length, span[1])]

    def _subspan(self, s, span, nextspan, additional_regex):
        """Recursively subdivide spans based on a series of rules."""
        text = s[span[0]:span[1]]
        lowertext = text.lower()

        # Skip if only a single character or a split sequence
        if span[1] - span[0] < 2 or text in self.SPLIT or text in self.SPLIT_END_WORD or text in self.SPLIT_START_WORD or lowertext in self.NO_SPLIT:
            return [span]

        # Skip if it looks like a URL
        if text.startswith('http://') or text.startswith('ftp://') or text.startswith('www.'):
            return [span]

        # Split full stop at end of final token (allow certain characters to follow) unless ellipsis
        if self.split_last_stop and nextspan is None and text not in self.NO_SPLIT_STOP and not text[-3:] == '...':
            if text[-1] == '.':
                return self._split_span(span, -1)
            ind = text.rfind('.')
            if ind > -1 and all(t in '\'‘’"“”)]}' for t in text[ind + 1:]):
                return self._split_span(span, ind, 1)

        # Split off certain sequences at the end of a word
        for spl in self.SPLIT_END_WORD:
            if text.endswith(spl) and len(text) > len(spl) and text[-len(spl) - 1].isalpha():
                return self._split_span(span, -len(spl), 0)

        # Split off certain sequences at the start of a word
        for spl in self.SPLIT_START_WORD:
            if text.startswith(spl) and len(text) > len(spl) and text[-len(spl) - 1].isalpha():
                return self._split_span(span, len(spl), 0)

        # Split around certain sequences
        for spl in self.SPLIT:
            ind = text.find(spl)
            if ind > -1:
                return self._split_span(span, ind, len(spl))

        # Split around certain sequences unless followed by a digit
        for spl in self.SPLIT_NO_DIGIT:
            ind = text.rfind(spl)
            if ind > -1 and (len(text) <= ind + len(spl) or not text[ind + len(spl)].isdigit()):
                return self._split_span(span, ind, len(spl))

        # Characters to split around, but with exceptions
        for i, char in enumerate(text):
            if char == '-':
                before = lowertext[:i]
                after = lowertext[i + 1:]
                # By default we split on hyphens
                split = True
                if before in self.NO_SPLIT_PREFIX or after in self.NO_SPLIT_SUFFIX:
                    split = False  # Don't split if prefix or suffix in list
                elif not before.strip(self.NO_SPLIT_CHARS) or not after.strip(self.NO_SPLIT_CHARS):
                    split = False  # Don't split if prefix or suffix entirely consist of certain characters
                if split:
                    return self._split_span(span, i, 1)

        # Split contraction words
        for contraction in self.CONTRACTIONS:
            if lowertext == contraction[0]:
                return self._split_span(span, contraction[1])

        additional_regex_handled = self.handle_additional_regex(s, span, nextspan, additional_regex)
        if additional_regex_handled is not None:
            return additional_regex_handled
        return [span]
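    # Illustrative sketch (not part of the library): _split_span cuts a span at
    # a relative index, optionally reserving `length` characters for a middle
    # span. Zero-width spans are filtered out later by span_tokenize.
    #
    #     >>> WordTokenizer()._split_span((10, 20), 3, 1)
    #     [(10, 13), (13, 14), (14, 20)]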
    def get_word_tokens(self, sentence, additional_regex=None):
        if not additional_regex:
            additional_regex = self.get_additional_regex(sentence)
        return sentence._tokens_for_spans(self.span_tokenize(sentence.text, additional_regex))
    def get_additional_regex(self, sentence):
        """
        Return any additional regular expressions used to further split the tokens.

        These regular expressions may be supplied contextually and on the fly by the
        sentence. For example, a sentence may have certain models associated with it,
        and those models may have associated dimensions. These dimensions tell the
        tokenizer what to do with high confidence: given a string like "12K", if a
        temperature is expected, the tokenizer will automatically split the value
        from the unit.

        :param chemdataextractor.doc.text.Sentence sentence: The sentence for which to get additional regex
        :returns: Expressions to further split the tokens
        :rtype: list of compiled regular expressions, or None
        """
        return None
    def handle_additional_regex(self, s, span, nextspan, additional_regex):
        text = s[span[0]:span[1]]
        if additional_regex:
            for regex in additional_regex:
                split_text = regex.search(text)
                if split_text:
                    groups = split_text.groupdict()
                    for group_name, group in six.iteritems(groups):
                        if group is not None:
                            group_length = len(group)
                            if 'split' in group_name and group_length != 0:
                                return self._split_span(span, group_length, 0)
        return None
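    # Illustrative sketch (hypothetical pattern, not part of the library): an
    # additional regex splits a span after the text captured by any non-empty
    # named group whose name contains 'split'. A pattern like the one below
    # separates '12' from 'K' in a token such as '12K'.
    #
    #     >>> splitter = re.compile(r'^(?P<split>\d+(\.\d+)?)K$')
    #     >>> WordTokenizer().span_tokenize('heated to 12K', additional_regex=[splitter])
    #     [(0, 6), (7, 9), (10, 12), (12, 13)]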
    def span_tokenize(self, s, additional_regex=None):
        """Return a list of integer offsets that identify tokens in the given sentence."""
        # First get spans by splitting on all whitespace
        # Includes: \u0020 \u00A0 \u1680 \u180E \u2000 \u2001 \u2002 \u2003 \u2004 \u2005 \u2006 \u2007 \u2008 \u2009 \u200A \u202F \u205F \u3000
        spans = [(left, right) for left, right in regex_span_tokenize(s, r'\s+') if not left == right]
        i = 0
        # Recursively split spans according to rules
        while i < len(spans):
            subspans = self._subspan(s, spans[i], spans[i + 1] if i + 1 < len(spans) else None, additional_regex)
            spans[i:i + 1] = [subspan for subspan in subspans if subspan[1] - subspan[0] > 0]
            if len(subspans) == 1:
                i += 1
        return spans
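
# Illustrative sketch (using the deprecated ``tokenize`` helper for brevity):
# the final full stop is split off the last token by default.
#
#     >>> wt = WordTokenizer()
#     >>> wt.tokenize('The pH was 7.')
#     ['The', 'pH', 'was', '7', '.']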
class ChemWordTokenizer(WordTokenizer):
    """Word tokenizer for chemistry text."""

    #: Split before and after these sequences, wherever they occur, unless entire token is one of these sequences
    SPLIT = [
        '----',
        '––––',  # \u2013 en dash
        '————',  # \u2014 em dash
        '<--->',
        '---',
        '–––',  # \u2013 en dash
        '———',  # \u2014 em dash
        '<-->',
        '-->',
        '...',
        '--',
        '––',  # \u2013 en dash
        '——',  # \u2014 em dash
        # '``',
        # "''",
        # '->',  # Don't split around this if occurs within chemical name
        '<',
        # '>',  # Don't split around this if occurs within chemical name
        ').',  # Fix missing whitespace errors
        '.(',  # Fix missing whitespace errors
        '–',  # \u2013 en dash
        '—',  # \u2014 em dash
        '―',  # \u2015 horizontal bar
        '~',  # \u007e Tilde
        '⁓',  # \u2053 Swung dash
        '∼',  # \u223c Tilde operator
        '°',  # \u00b0 Degrees
        # ';',
        '@',
        '#',
        '$',
        '£',  # \u00a3
        '€',  # \u20ac
        '%',
        '&',
        '?',
        '!',
        '™',  # \u2122
        '®',  # \u00ae
        '…',  # \u2026
        '⋯',  # \u22ef Mid-line ellipsis
        '†',  # \u2020 Dagger
        '‡',  # \u2021 Double dagger
        '§',  # \u00a7 Section sign
        '¶',  # \u00b6 Pilcrow sign
        '≠',  # \u2260
        '≡',  # \u2261
        '≢',  # \u2262
        '≣',  # \u2263
        '≤',  # \u2264
        '≥',  # \u2265
        '≦',  # \u2266
        '≧',  # \u2267
        '≨',  # \u2268
        '≩',  # \u2269
        '≪',  # \u226a
        '≫',  # \u226b
        '≈',  # \u2248
        '=',
        '÷',  # \u00f7
        '×',  # \u00d7
        # '→',  # \u2192 # Don't split around this if occurs within chemical name
        '⇄',  # \u21c4
        '"',  # \u0022 Quote mark
        '“',  # \u201c
        '”',  # \u201d
        '„',  # \u201e
        '‟',  # \u201f
        '‘',  # \u2018 Left single quote
        # '’',  # \u2019 Right single quote - Regularly used as an apostrophe, so don't always split
        '‚',  # \u201a Single low quote
        '‛',  # \u201b Single reversed quote
        '`',  # \u0060
        '´',  # \u00b4
        # Primes
        # '′',  # \u2032
        # '″',  # \u2033
        # '‴',  # \u2034
        # '‵',  # \u2035
        # '‶',  # \u2036
        # '‷',  # \u2037
        # '⁗',  # \u2057
        # Brackets
        # '(',
        # '[',
        # '{',
        # '}',
        # ']',
        # ')',
        # Slashes
        # '/',  # \u002f Solidus
        # '⁄',  # \u2044 Fraction slash
        # '∕',  # \u2215 Division slash
        # Hyphens and Minuses
        # '-',  # \u002d Hyphen-minus
        # '−',  # \u2212 Minus
        # '‒',  # \u2012 Figure dash
        # '‐',  # \u2010 Hyphen
        # '‑',  # \u2011 Non-breaking hyphen
        # '+',  # \u002b Plus
        # '±',  # \u00b1 Plus/Minus
    ]

    #: Split before these sequences if they end a token
    SPLIT_END = [':', ',', '(TM)', '(R)', '(®)', '(™)', '(■)', '(◼)', '(●)', '(▲)', '(○)', '(◆)', '(▼)', '(⧫)',
                 '(△)', '(◇)', '(▽)', '(⬚)', '(×)', '(□)', '(•)', '’', '°C']

    #: Split before these sequences if they end a token, unless preceded by a digit
    SPLIT_END_NO_DIGIT = ['(aq)', '(aq.)', '(s)', '(l)', '(g)']

    #: Don't split around slash when both preceded and followed by these characters
    NO_SPLIT_SLASH = ['+', '-', '−']

    #: Regular expression that matches a numeric quantity with units
    QUANTITY_RE = re.compile(r'^((?P<split>\d\d\d)g|(?P<_split1>[-−]?\d+\.\d+|10[-−]\d+)(g|s|m|N|V)([-−]?[1-4])?|(?P<_split2>\d*[-−]?\d+\.?\d*)([pnµμm]A|[µμmk]g|[kM]J|m[lL]|[nµμm]?M|[nµμmc]m|kN|[mk]V|[mkMG]?W|[mnpμµ]s|Hz|[Mm][Oo][Ll](e|ar)?s?|k?Pa|ppm|min)([-−]?[1-4])?)$')

    #: Don't split on hyphen if the prefix matches this regular expression
    NO_SPLIT_PREFIX_ENDING = re.compile(r'''(^\(.*\)|^[\d,'"“”„‟‘’‚‛`´′″‴‵‶‷⁗Α-Ωα-ω]+|ano|ato|azo|boc|bromo|cbz|chloro|eno|fluoro|fmoc|ido|ino|io|iodo|mercapto|nitro|ono|oso|oxalo|oxo|oxy|phospho|telluro|tms|yl|ylen|ylene|yliden|ylidene|ylidyn|ylidyne)$''', re.U)

    #: Don't split on hyphen if prefix or suffix match this regular expression
    NO_SPLIT_CHEM = re.compile(r'([\-α-ω]|\d+,\d+|\d+[A-Z]|^d\d\d?$|acetic|acetyl|acid|acyl|anol|azo|benz|bromo|carb|cbz|chlor|cyclo|ethan|ethyl|fluoro|fmoc|gluc|hydro|idyl|indol|iene|ione|iodo|mercapto|n,n|nitro|noic|o,o|oxalo|oxo|oxy|oyl|onyl|phen|phth|phospho|pyrid|telluro|tetra|tms|ylen|yli|zole|alpha|beta|gamma|delta|epsilon|theta|kappa|lambda|sigma|omega)', re.U | re.I)

    #: Don't split on hyphen if the prefix is one of these sequences
    NO_SPLIT_PREFIX = {
        'e', 'a', 'u', 'x', 'agro', 'ante', 'anti', 'arch', 'be', 'bi', 'bio', 'co', 'counter', 'cross', 'cyber',
        'de', 'eco', 'ex', 'extra', 'inter', 'intra', 'macro', 'mega', 'micro', 'mid', 'mini', 'multi', 'neo',
        'non', 'over', 'pan', 'para', 'peri', 'post', 'pre', 'pro', 'pseudo', 'quasi', 're', 'semi', 'sub',
        'super', 'tri', 'ultra', 'un', 'uni', 'vice', 'aci', 'adeno', 'aldehydo', 'allo', 'alpha', 'altro',
        'ambi', 'aorto', 'arachno', 'as', 'beta', 'bis', 'catena', 'centi', 'chi', 'chiro', 'circum', 'cis',
        'closo', 'colo', 'conjuncto', 'conta', 'contra', 'cortico', 'cosa', 'cran', 'crypto', 'cyclo', 'deca',
        'deci', 'delta', 'demi', 'di', 'dis', 'dl', 'electro', 'endo', 'ennea', 'ent', 'epi', 'epsilon',
        'erythro', 'eta', 'exo', 'ferro', 'galacto', 'gamma', 'gastro', 'giga', 'gluco', 'glycero', 'graft',
        'gulo', 'hemi', 'hepta', 'hexa', 'homo', 'hydro', 'hypho', 'hypo', 'ideo', 'idio', 'in', 'infra',
        'iota', 'iso', 'judeo', 'kappa', 'keto', 'kis', 'lambda', 'lyxo', 'manno', 'medi', 'meso', 'meta',
        'milli', 'mono', 'mu', 'muco', 'musculo', 'myo', 'nano', 'neuro', 'nido', 'nitro', 'nona', 'nor',
        'novem', 'novi', 'nu', 'octa', 'octi', 'octo', 'omega', 'omicron', 'ortho', 'paleo', 'pelvi', 'penta',
        'pheno', 'phi', 'pi', 'pica', 'pneumo', 'poly', 'preter', 'psi', 'quadri', 'quater', 'quinque', 'recto',
        'rho', 'ribo', 'salpingo', 'scyllo', 'sec', 'sept', 'septi', 'sero', 'sesqui', 'sexi', 'sigma', 'sn',
        'soci', 'supra', 'sur', 'sym', 'syn', 'talo', 'tau', 'tele', 'ter', 'tera', 'tert', 'tetra', 'theta',
        'threo', 'trans', 'triangulo', 'tris', 'uber', 'unsym', 'upsilon', 'veno', 'ventriculo', 'xi', 'xylo',
        'zeta',
    }

    #: Split on hyphens followed by one of these sequences
    SPLIT_SUFFIX = {
        'absorption', 'abstinent', 'abstraction', 'abuse', 'accelerated', 'accepting', 'acclimated', 'acclimation',
        'acid', 'activated', 'activation', 'active', 'activity', 'addition', 'adducted', 'adducts', 'adequate',
        'adjusted', 'administrated', 'adsorption', 'affected', 'aged', 'alcohol', 'alcoholic', 'algae', 'alginate',
        'alkaline', 'alkylated', 'alkylation', 'alkyne', 'analogous', 'anesthetized', 'appended', 'armed',
        'aromatic', 'assay', 'assemblages', 'assisted', 'associated', 'atom', 'atoms', 'attenuated', 'attributed',
        'backbone', 'base', 'based', 'bearing', 'benzylation', 'binding', 'biomolecule', 'biotic', 'blocking',
        'blood', 'bond', 'bonded', 'bonding', 'bonds', 'boosted', 'bottle', 'bottled', 'bound', 'bridge',
        'bridged', 'buffer', 'buffered', 'caged', 'cane', 'capped', 'capturing', 'carrier', 'carrying',
        'catalysed', 'catalyzed', 'cation', 'caused', 'centered', 'challenged', 'chelating', 'cleaving', 'coated',
        'coating', 'coenzyme', 'competing', 'competitive', 'complex', 'complexes', 'compound', 'compounds',
        'concentration', 'conditioned', 'conditions', 'conducting', 'configuration', 'confirmed', 'conjugate',
        'conjugated', 'conjugates', 'connectivity', 'consuming', 'contained', 'containing', 'contaminated',
        'control', 'converting', 'coordinate', 'coordinated', 'copolymer', 'copolymers', 'core', 'cored',
        'cotransport', 'coupled', 'covered', 'crosslinked', 'cyclized', 'damaged', 'dealkylation', 'decocted',
        'decorated', 'deethylation', 'deficiency', 'deficient', 'defined', 'degrading', 'demethylated',
        'demethylation', 'dendrimer', 'density', 'dependant', 'dependence', 'dependent', 'deplete', 'depleted',
        'depleting', 'depletion', 'depolarization', 'depolarized', 'deprived', 'derivatised', 'derivative',
        'derivatives', 'derivatized', 'derived', 'desorption', 'detected', 'devalued', 'dextran', 'dextrans',
        'diabetic', 'dimensional', 'dimer', 'distribution', 'divalent', 'domain', 'dominated', 'donating',
        'donor', 'dopant', 'doped', 'doping', 'dosed', 'dot', 'drinking', 'driven', 'drug', 'drugs', 'dye',
        'edge', 'efficiency', 'electrodeposited', 'electrolyte', 'elevating', 'elicited', 'embedded', 'emersion',
        'emitting', 'encapsulated', 'encapsulating', 'enclosed', 'enhanced', 'enhancing', 'enriched',
        'enrichment', 'enzyme', 'epidermal', 'equivalents', 'etched', 'ethanolamine', 'evoked', 'exchange',
        'excimer', 'excluder', 'expanded', 'experimental', 'exposed', 'exposure', 'expressing', 'extract',
        'extraction', 'fed', 'finger', 'fixed', 'fixing', 'flanking', 'flavonoid', 'fluorescence', 'formation',
        'forming', 'fortified', 'free', 'function', 'functionalised', 'functionalized', 'functionalyzed',
        'fused', 'gas', 'gated', 'generating', 'glucuronidating', 'glycoprotein', 'glycosylated', 'glycosylation',
        'gradient', 'grafted', 'group', 'groups', 'halogen', 'heterocyclic', 'homologues', 'hydrogel',
        'hydrolyzing', 'hydroxylated', 'hydroxylation', 'hydroxysteroid', 'immersion', 'immobilized',
        'immunoproteins', 'impregnated', 'imprinted', 'inactivated', 'increased', 'increasing', 'incubated',
        'independent', 'induce', 'induced', 'inducible', 'inducing', 'induction', 'influx', 'inhibited',
        'inhibitor', 'inhibitory', 'initiated', 'injected', 'insensitive', 'insulin', 'integrated', 'interlinked',
        'intermediate', 'intolerant', 'intoxicated', 'ion', 'ions', 'island', 'isomer', 'isomers', 'knot',
        'label', 'labeled', 'labeling', 'labelled', 'laden', 'lamp', 'laser', 'layer', 'layers', 'lesioned',
        'ligand', 'ligated', 'like', 'limitation', 'limited', 'limiting', 'lined', 'linked', 'linker', 'lipid',
        'lipids', 'lipoprotein', 'liposomal', 'liposomes', 'liquid', 'liver', 'loaded', 'loading', 'locked',
        'loss', 'lowering', 'lubricants', 'luminance', 'luminescence', 'maintained', 'majority', 'making',
        'mannosylated', 'material', 'mediated', 'metabolizing', 'metal', 'metallized', 'methylation', 'migrated',
        'mimetic', 'mimicking', 'mixed', 'mixture', 'mode', 'model', 'modified', 'modifying', 'modulated',
        'moiety', 'molecule', 'monoadducts', 'monomer', 'mutated', 'nanogel', 'nanoparticle', 'nanotube', 'need',
        'negative', 'nitrosated', 'nitrosation', 'nitrosylation', 'nmr', 'noncompetitive', 'normalized',
        'nuclear', 'nucleoside', 'nucleosides', 'nucleotide', 'nucleotides', 'nutrition', 'olefin', 'olefins',
        'oligomers', 'omitted', 'only', 'outcome', 'overload', 'oxidation', 'oxidized', 'oxo-mediated',
        'oxygenation', 'page', 'paired', 'pathway', 'patterned', 'peptide', 'permeabilized', 'permeable',
        'phase', 'phospholipids', 'phosphopeptide', 'phosphorylated', 'pillared', 'placebo', 'planted', 'plasma',
        'polymer', 'polymers', 'poor', 'porous', 'position', 'positive', 'postlabeling', 'precipitated',
        'preferring', 'pretreated', 'primed', 'produced', 'producing', 'production', 'promoted', 'promoting',
        'protected', 'protein', 'proteomic', 'protonated', 'provoked', 'purified', 'radical', 'reacting',
        'reaction', 'reactive', 'reagents', 'rearranged', 'receptor', 'receptors', 'recognition',
        'redistribution', 'redox', 'reduced', 'reducing', 'reduction', 'refractory', 'refreshed', 'regenerating',
        'regulated', 'regulating', 'regulatory', 'related', 'release', 'releasing', 'replete', 'requiring',
        'resistance', 'resistant', 'resitant', 'response', 'responsive', 'responsiveness', 'restricted',
        'resulted', 'retinal', 'reversible', 'ribosylated', 'ribosylating', 'ribosylation', 'rich', 'right',
        'ring', 'saturated', 'scanning', 'scavengers', 'scavenging', 'sealed', 'secreting', 'secretion',
        'seeking', 'selective', 'selectivity', 'semiconductor', 'sensing', 'sensitive', 'sensitized', 'soluble',
        'solution', 'solvent', 'sparing', 'specific', 'spiked', 'stabilised', 'stabilized', 'stabilizing',
        'stable', 'stained', 'steroidal', 'stimulated', 'stimulating', 'storage', 'stressed', 'stripped',
        'substituent', 'substituted', 'substitution', 'substrate', 'sufficient', 'sugar', 'sugars',
        'supplemented', 'supported', 'suppressed', 'surface', 'susceptible', 'sweetened', 'synthesizing',
        'tagged', 'target', 'telopeptide', 'terminal', 'terminally', 'terminated', 'termini', 'terminus',
        'ternary', 'terpolymer', 'tertiary', 'tested', 'testes', 'tethered', 'tetrabrominated', 'tolerance',
        'tolerant', 'toxicity', 'toxin', 'tracer', 'transfected', 'transfer', 'transition', 'transport',
        'transporter', 'treated', 'treating', 'treatment', 'triggered', 'turn', 'type', 'unesterified',
        'untreated', 'vacancies', 'vacancy', 'variable', 'water', 'yeast', 'yield', 'zwitterion',
    }

    #: Don't split these sequences.
    NO_SPLIT = {'°c'}
    def get_additional_regex(self, sentence):
        additional_regex = [self.QUANTITY_RE]
        quantity_re = sentence.quantity_re
        if quantity_re:
            additional_regex.append(quantity_re)
        return additional_regex
    def _closing_bracket_index(self, text, bpair=('(', ')')):
        """Return the index of the closing bracket that matches the opening bracket at the start of the text."""
        level = 1
        for i, char in enumerate(text[1:]):
            if char == bpair[0]:
                level += 1
            elif char == bpair[1]:
                level -= 1
            if level == 0:
                return i + 1

    def _opening_bracket_index(self, text, bpair=('(', ')')):
        """Return the index of the opening bracket that matches the closing bracket at the end of the text."""
        level = 1
        for i, char in enumerate(reversed(text[:-1])):
            if char == bpair[1]:
                level += 1
            elif char == bpair[0]:
                level -= 1
            if level == 0:
                return len(text) - i - 2

    def _is_number(self, text):
        """Return True if the text is a number."""
        try:
            float(text)
            return True
        except ValueError:
            return False

    def _is_saccharide_arrow(self, before, after):
        """Return True if the arrow is in a chemical name."""
        if (before and after and before[-1].isdigit() and after[0].isdigit()
                and before.rstrip('0123456789').endswith('(') and after.lstrip('0123456789').startswith(')-')):
            return True
        else:
            return False

    def _subspan(self, s, span, nextspan, additional_regex):
        """Recursively subdivide spans based on a series of rules."""
        if additional_regex is None:
            additional_regex = [self.QUANTITY_RE]
        text = s[span[0]:span[1]]
        lowertext = text.lower()

        # Skip if only a single character or a split sequence
        if span[1] - span[0] < 2 or text in self.SPLIT or text in self.SPLIT_END_WORD or text in self.SPLIT_START_WORD or lowertext in self.NO_SPLIT:
            return [span]

        # Skip if it looks like a URL
        if text.startswith('http://') or text.startswith('ftp://') or text.startswith('www.'):
            return [span]

        # Split full stop at end of final token (allow certain characters to follow) unless ellipsis
        if self.split_last_stop and nextspan is None and text not in self.NO_SPLIT_STOP and not text[-3:] == '...':
            if text[-1] == '.':
                return self._split_span(span, -1)
            ind = text.rfind('.')
            if ind > -1 and all(t in '\'‘’"“”)]}' for t in text[ind + 1:]):
                return self._split_span(span, ind, 1)

        # Split off certain sequences at the end of a token
        for spl in self.SPLIT_END:
            if text.endswith(spl) and len(text) > len(spl):
                return self._split_span(span, -len(spl), 0)

        # Split off certain sequences at the end of a word
        for spl in self.SPLIT_END_WORD:
            if text.endswith(spl) and len(text) > len(spl) and text[-len(spl) - 1].isalpha():
                return self._split_span(span, -len(spl), 0)

        # Split off certain sequences at the start of a word
        for spl in self.SPLIT_START_WORD:
            if text.startswith(spl) and len(text) > len(spl) and text[-len(spl) - 1].isalpha():
                return self._split_span(span, len(spl), 0)

        # Split around certain sequences
        for spl in self.SPLIT:
            ind = text.find(spl)
            if ind > -1:
                return self._split_span(span, ind, len(spl))

        # Split around certain sequences unless followed by a digit
        # - We skip this because of difficulty with chemical names.
        # for spl in self.SPLIT_NO_DIGIT:
        #     ind = text.rfind(spl)
        #     if ind > -1 and (len(text) <= ind + len(spl) or not text[ind + len(spl)].isdigit()):
        #         return self._split_span(span, ind, len(spl))

        # Split off certain sequences at the end of a token unless preceded by a digit
        for spl in self.SPLIT_END_NO_DIGIT:
            if text.endswith(spl) and len(text) > len(spl) and not text[-len(spl) - 1].isdigit():
                return self._split_span(span, -len(spl), 0)

        # Regular bracket at both start and end: break off both provided they correspond
        if text.startswith('(') and text.endswith(')') and self._closing_bracket_index(text) == len(text) - 1:
            return self._split_span(span, 1, len(text) - 2)

        # Split things like IR(KBr)
        if text.startswith('IR(') and text.endswith(')'):
            return self._split_span(span, 2, 1)

        # Split things like \d+\.\d+([a-z]+) e.g. UV-vis/IR peaks with bracketed strength/shape
        m = re.match(r'^(\d+\.\d+|\d{3,})(\([a-z]+\))$', text, re.I)
        if m:
            return self._split_span(span, m.start(2), 1)

        # Split brackets off start and end if the corresponding bracket isn't within the token
        for bpair in [('(', ')'), ('{', '}'), ('[', ']')]:
            # level = bracket_level(text, open=[bpair[0]], close=[bpair[1]])
            # Bracket at start, bracket level > 0: break it off
            if text.startswith(bpair[0]) and self._closing_bracket_index(text, bpair=bpair) is None:
                return self._split_span(span, 1, 0)
            # Bracket at end, bracket level < 0: break it off
            if text.endswith(bpair[1]) and self._opening_bracket_index(text, bpair=bpair) is None:
                return self._split_span(span, -1, 0)

        # TODO: Consider splitting around comma in limited circumstances. Mainly to fix whitespace errors.

        # Characters to split around, but with exceptions
        for i, char in enumerate(text):
            before = text[:i]
            after = text[i + 1:]
            if char in {':', ';'}:
                # Split around colon unless it looks like we're in a chemical name
                if not (before and after and after[0].isdigit() and before.rstrip('′\'')[-1:].isdigit() and '-' in after) and not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)):
                    return self._split_span(span, i, 1)
            elif char in {'x', '+', '−'}:
                # Split around x, +, − (\u2212 minus) between two numbers or at start followed by numbers
                if (i == 0 or self._is_number(before)) and self._is_number(after):
                    return self._split_span(span, i, 1)
                # Also split around − (\u2212 minus) between two letters
                if char == '−' and before and before[-1].isalpha() and after and after[0].isalpha():
                    return self._split_span(span, i, 1)
            elif char == '±':
                # Split around ± unless surrounded by brackets
                if not (before and after and before[-1] == '(' and after[0] == ')'):
                    return self._split_span(span, i, 1)
            elif char == '/':
                # Split around / unless '+/-' or '-/-' etc.
                if not (before and after and before[-1] in self.NO_SPLIT_SLASH and after[0] in self.NO_SPLIT_SLASH):
                    return self._split_span(span, i, 1)
            elif char == '>':
                # Split if the preceding character is not -
                if not (before and before[-1] == '-'):
                    return self._split_span(span, i, 1)
                # If preceding is -, split around -> unless in chemical name
                if before and before[-1] == '-':
                    if not text == '->' and not self._is_saccharide_arrow(before[:-1], after):
                        return self._split_span(span, i - 1, 2)
            elif char == '→' and not self._is_saccharide_arrow(before, after):
                # Split around → unless in chemical name
                return self._split_span(span, i, 1)
            elif char == '(' and self._is_number(before) and not '(' in after and not ')' in after:
                # Split around open bracket after a number
                return self._split_span(span, i, 1)
            elif char == '-':
                lowerbefore = lowertext[:i]
                lowerafter = lowertext[i + 1:]
                # Always split on -of-the- -to- -in- -by- -of- -or- -and- -per- -the-
                if lowerafter[:7] == 'of-the-':
                    return [
                        (span[0], span[0] + i),
                        (span[0] + i, span[0] + i + 1),
                        (span[0] + i + 1, span[0] + i + 3),
                        (span[0] + i + 3, span[0] + i + 4),
                        (span[0] + i + 4, span[0] + i + 7),
                        (span[0] + i + 7, span[0] + i + 8),
                        (span[0] + i + 8, span[1]),
                    ]
                if lowerafter[:5] in {'on-a-', 'of-a-'}:
                    return [
                        (span[0], span[0] + i),
                        (span[0] + i, span[0] + i + 1),
                        (span[0] + i + 1, span[0] + i + 3),
                        (span[0] + i + 3, span[0] + i + 4),
                        (span[0] + i + 4, span[0] + i + 5),
                        (span[0] + i + 5, span[0] + i + 6),
                        (span[0] + i + 6, span[1]),
                    ]
                if lowerafter[:3] in {'to-', 'in-', 'by-', 'of-', 'or-', 'on-'}:
                    return [
                        (span[0], span[0] + i),
                        (span[0] + i, span[0] + i + 1),
                        (span[0] + i + 1, span[0] + i + 3),
                        (span[0] + i + 3, span[0] + i + 4),
                        (span[0] + i + 4, span[1]),
                    ]
                if lowerafter[:4] in {'and-', 'per-', 'the-'}:
                    return [
                        (span[0], span[0] + i),
                        (span[0] + i, span[0] + i + 1),
                        (span[0] + i + 1, span[0] + i + 4),
                        (span[0] + i + 4, span[0] + i + 5),
                        (span[0] + i + 5, span[1]),
                    ]
                # By default we split on hyphens
                split = True
                if lowerafter == 'nmr':
                    split = True  # Always split NMR off end
                elif bracket_level(text) == 0 and (not bracket_level(after) == 0 or not bracket_level(before) == 0):
                    split = False  # Don't split if within brackets
                elif after and after[0] == '>':
                    split = False  # Don't split if followed by >
                elif lowerbefore in self.NO_SPLIT_PREFIX or lowerafter in self.NO_SPLIT_SUFFIX:
                    split = False  # Don't split if prefix or suffix in list
                elif self.NO_SPLIT_PREFIX_ENDING.search(lowerbefore):
                    split = False  # Don't split if prefix ends with pattern
                elif lowerafter in self.SPLIT_SUFFIX:
                    split = True  # Do split if suffix in list
                elif len(before) <= 1 or len(after) <= 2:
                    split = False  # Don't split if not at least 2 chars before and 3 after
                elif self.NO_SPLIT_CHEM.search(lowerbefore) or self.NO_SPLIT_CHEM.search(lowerafter):
                    split = False  # Don't split if prefix or suffix match chem regex
                if split:
                    return self._split_span(span, i, 1)

        # TODO: Known error cases:
        # [³H]-choline
        # S,S'-...
        # 1,4-di-substituted
        # 11-β - hydroxysteroid
        # Spelt out greek: 11beta - hydroxysteroid
        # ...-N-substituted like 2,5-dimethyl-N-substituted pyrroles
        # 4-(2-Butyl-6,7-dichloro-2-cyclopentyl-indan-1-on-5-yl) oxobutyric acid
        # Adenosine - monophosphate
        # Consistency for amino acids: Arg-Arg and Arg-Arg-Asp... probably always split
        # D,L-α-peptide?
        # N'-formylkynurenine
        # poly(D,L-lactic acid)?
        # poly(methyl metha-acrylate)?
        # Poly(N-alkyl Acrylamide)
        # poly(N-isopropyl acrylamide)
        # R,S-lorazepam
        # S,S-dioxide

        # Split units off the end of a numeric value
        # quantity = self.QUANTITY_RE.search(text)
        # if quantity:
        #     return self._split_span(span, len(quantity.group(6) or quantity.group(3) or quantity.group(2)), 0)

        # Split pH off the start of a numeric value
        if text.startswith('pH') and self._is_number(text[2:]):
            return self._split_span(span, 2, 0)

        # Split contraction words
        for contraction in self.CONTRACTIONS:
            if lowertext == contraction[0]:
                return self._split_span(span, contraction[1])

        additional_regex_handled = self.handle_additional_regex(s, span, nextspan, additional_regex)
        if additional_regex_handled is not None:
            return additional_regex_handled

        if nextspan:
            nexttext = s[nextspan[0]:nextspan[1]]
            # Split NMR isotope whitespace errors (joined with previous sentence full stop)
            if nexttext == 'NMR':
                ind = text.rfind('.')
                if ind > -1 and text[ind + 1:] in {'1H', '13C', '15N', '31P', '19F', '11B', '29Si', '17O', '73Ge', '195Pt', '33S', '13C{1H}'}:
                    return self._split_span(span, ind, 1)
        return [span]
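
# Illustrative sketch (via the deprecated ``tokenize`` helper for brevity): the
# chemistry-aware tokenizer splits the value from a trailing unit via SPLIT_END,
# then keeps '°C' whole because '°c' is in NO_SPLIT, whereas the generic
# WordTokenizer would split around the degree sign.
#
#     >>> cwt = ChemWordTokenizer()
#     >>> cwt.tokenize('Stirred at 25°C in EtOH.')
#     ['Stirred', 'at', '25', '°C', 'in', 'EtOH', '.']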
class FineWordTokenizer(WordTokenizer):
    """Word tokenizer that also splits around hyphens and all colons."""

    #: Split before and after these sequences, wherever they occur, unless entire token is one of these sequences
    SPLIT = [
        '----',
        '––––',  # \u2013 en dash
        '————',  # \u2014 em dash
        '<--->',
        '---',
        '–––',  # \u2013 en dash
        '———',  # \u2014 em dash
        '<-->',
        '-->',
        '...',
        '--',
        '––',  # \u2013 en dash
        '——',  # \u2014 em dash
        '``',
        "''",
        '->',
        '<',
        '>',
        '–',  # \u2013 en dash
        '—',  # \u2014 em dash
        '―',  # \u2015 horizontal bar
        '~',  # \u007e Tilde
        '⁓',  # \u2053 Swung dash
        '∼',  # \u223c Tilde operator
        '°',  # \u00b0 Degrees
        ';',
        '@',
        '#',
        '$',
        '£',  # \u00a3
        '€',  # \u20ac
        '%',
        '&',
        '?',
        '!',
        '™',  # \u2122
        '®',  # \u00ae
        '…',  # \u2026
        '⋯',  # \u22ef Mid-line ellipsis
        '†',  # \u2020 Dagger
        '‡',  # \u2021 Double dagger
        '§',  # \u00a7 Section sign
        '¶',  # \u00b6 Pilcrow sign
        '≠',  # \u2260
        '≡',  # \u2261
        '≢',  # \u2262
        '≣',  # \u2263
        '≤',  # \u2264
        '≥',  # \u2265
        '≦',  # \u2266
        '≧',  # \u2267
        '≨',  # \u2268
        '≩',  # \u2269
        '≪',  # \u226a
        '≫',  # \u226b
        '≈',  # \u2248
        '=',
        '÷',  # \u00f7
        '×',  # \u00d7
        '→',  # \u2192
        '⇄',  # \u21c4
        '"',  # \u0022 Quote mark
        '“',  # \u201c
        '”',  # \u201d
        '„',  # \u201e
        '‟',  # \u201f
        '‘',  # \u2018 Left single quote
        '’',  # \u2019 Right single quote
        '‚',  # \u201a Single low quote
        '‛',  # \u201b Single reversed quote
        '`',  # \u0060
        '´',  # \u00b4
        # Primes
        '′',  # \u2032
        '″',  # \u2033
        '‴',  # \u2034
        '‵',  # \u2035
        '‶',  # \u2036
        '‷',  # \u2037
        '⁗',  # \u2057
        # Brackets
        '(',
        '[',
        '{',
        '}',
        ']',
        ')',
        # Slashes
        '/',  # \u002f Solidus
        '⁄',  # \u2044 Fraction slash
        '∕',  # \u2215 Division slash
        # Hyphens and Minuses
        '-',  # \u002d Hyphen-minus
        '−',  # \u2212 Minus
        '‒',  # \u2012 Figure dash
        '‐',  # \u2010 Hyphen
        '‑',  # \u2011 Non-breaking hyphen
        '+',  # \u002b Plus
        '±',  # \u00b1 Plus/Minus
        ':',
    ]

    #: Split around these sequences unless they are followed by a digit
    SPLIT_NO_DIGIT = [',']

    #: Don't split these sequences.
    NO_SPLIT = set()

    #: Don't split around hyphens with these prefixes
    NO_SPLIT_PREFIX = set()

    #: Don't split around hyphens with these suffixes.
    NO_SPLIT_SUFFIX = set()

    def _subspan(self, s, span, nextspan, additional_regex):
        """Recursively subdivide spans based on a series of rules."""
        # Split on boundaries between greek and non-greek characters
        text = s[span[0]:span[1]]
        for i, char in enumerate(text):
            if i < len(text) - 1:
                nextchar = text[i + 1]
                if (char in GREEK and nextchar not in GREEK) or (char not in GREEK and nextchar in GREEK):
                    return [(span[0], span[0] + i + 1), (span[0] + i + 1, span[1])]
        # Perform all normal WordTokenizer splits
        return super(FineWordTokenizer, self)._subspan(s, span, nextspan, additional_regex)
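
# Illustrative sketch (via the deprecated ``tokenize`` helper for brevity):
# the fine tokenizer cuts at Greek/non-Greek boundaries and at every hyphen.
#
#     >>> fwt = FineWordTokenizer()
#     >>> fwt.tokenize('αβ-peptide')
#     ['αβ', '-', 'peptide']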
class BertWordTokenizer(ChemWordTokenizer):
    """
    Word tokenizer for BERT, with hooks to override its choices where needed.

    Concrete overrides used in CDE include not splitting when a decimal point appears
    in the middle of a number, and splitting values from their units.
    """

    do_not_split = []
    do_not_split_if_in_num = [".", ","]
    def __init__(self, split_last_stop=True, path=None, lowercase=True):
        super().__init__(split_last_stop)
        if path is None:
            path = find_data('models/scibert_uncased_vocab-1.0.txt')
        self.tokenizer = BertWordPieceTokenizer(path, lowercase=lowercase)
    def span_tokenize(self, s, additional_regex=None):
        output = self.tokenizer.encode(str(s))
        # Drop the [CLS] and [SEP] special tokens at either end
        offsets = output.offsets[1:-1]
        given_tokens = output.tokens[1:-1]
        current_span = (0, 0)
        spans = []
        i = 0
        zipped = [el for el in zip(offsets, given_tokens)]
        while i < len(zipped):
            offset, token = zipped[i]
            # If the symbol is in do_not_split_if_in_num, is part of a word (i.e. not
            # surrounded by whitespace) and is followed by digits, don't split it
            if (s[offset[0]: offset[1]] in self.do_not_split_if_in_num
                    and offset[0] == current_span[1]
                    and i < len(zipped) - 1
                    and zipped[i + 1][0][0] == offset[1]
                    and re.match(r'\d+$', s[zipped[i + 1][0][0]: zipped[i + 1][0][1]])):
                i += 1
                offset, token = zipped[i]
                current_span = (current_span[0], offset[1])
            # If the symbol is in do_not_split and it's part of a word, i.e. it's not
            # surrounded by whitespace, then don't split it
            elif (s[offset[0]: offset[1]] in self.do_not_split
                    and offset[0] == current_span[1]
                    and i < len(zipped) - 1
                    and zipped[i + 1][0][0] == offset[1]):
                i += 1
                offset, token = zipped[i]
                current_span = (current_span[0], offset[1])
            # Prevent splitting of negative numbers, but allow splitting of ranges such as 0.5-1.0
            # and cases like 5-Bromo-6-Penda...
            elif (s[offset[0]: offset[1]] == '-'
                    and i < len(zipped) - 1
                    and zipped[i + 1][0][0] == offset[1]
                    and re.match(r'\d+$', s[zipped[i + 1][0][0]: zipped[i + 1][0][1]])
                    and (i == 0 or not (zipped[i - 1][0][1] == offset[0]
                                        and re.match(r'\d+$', s[zipped[i - 1][0][0]: zipped[i - 1][0][1]])))
                    and (i >= len(zipped) - 2 or not (zipped[i + 2][0][0] == zipped[i + 1][0][1]
                                                      and s[zipped[i + 2][0][0]: zipped[i + 2][0][1]] == '-'))):
                i += 1
                if current_span != (0, 0):
                    spans.append(current_span)
                current_span = offset
                offset, token = zipped[i]
                current_span = (current_span[0], offset[1])
            # If the token is a subword, as defined by BERT, then merge it with the previous token
            elif len(token) > 2 and token[:2] == '##':
                current_span = (current_span[0], offset[1])
            # Otherwise, start a new span
            else:
                if current_span != (0, 0):
                    spans.append(current_span)
                current_span = offset
            i += 1
        spans.append(current_span)
        # Perform additional tokenisation as required by the additional regex
        if additional_regex is not None:
            i = 0
            while i < len(spans):
                subspans = self.handle_additional_regex(s, spans[i], spans[i + 1] if i + 1 < len(spans) else None, additional_regex)
                if subspans is None:
                    subspans = [spans[i]]
                spans[i:i + 1] = [subspan for subspan in subspans if subspan[1] - subspan[0] > 0]
                if len(subspans) == 1:
                    i += 1
        return spans
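
# Illustrative sketch (requires the huggingface ``tokenizers`` package and the
# SciBERT vocabulary file from CDE's model data; exact splits depend on the
# WordPiece pre-tokenisation): subwords ('##...') are merged back into whole
# words, and a decimal point followed by digits does not split, so a value
# like '2.5' would be expected to come back as a single span.
#
#     >>> bwt = BertWordTokenizer()
#     >>> s = 'heated to 2.5 K'
#     >>> [s[a:b] for a, b in bwt.span_tokenize(s)]
#     ['heated', 'to', '2.5', 'K']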