# -*- coding: utf-8 -*-
"""
Word and sentence tokenizers.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod
from deprecation import deprecated
import logging
import re
import six
from ..text import bracket_level, GREEK
from ..data import load_model, find_data
from lxml import etree
from tokenizers import BertWordPieceTokenizer
log = logging.getLogger(__name__)
class BaseTokenizer(six.with_metaclass(ABCMeta)):
"""Abstract base class from which all Tokenizer classes inherit.
Subclasses must implement a ``span_tokenize(text)`` method that returns a list of integer offset tuples that
identify tokens in the text.
"""
@deprecated(deprecated_in="2.0", details="Deprecated in favour of looking at the tokens from the Sentence object.")
def tokenize(self, s):
"""Return a list of token strings from the given sentence.
:param string s: The sentence string to tokenize.
:rtype: iter(str)
"""
return [s[start:end] for start, end in self.span_tokenize(s)]
@abstractmethod
def span_tokenize(self, s):
"""Return a list of integer offsets that identify tokens in the given sentence.
:param string s: The sentence string to tokenize.
:rtype: iter(tuple(int, int))
"""
return
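# A minimal sketch of a concrete subclass, assuming whitespace-only splitting
# is acceptable; a subclass only has to provide span_tokenize (here using the
# regex_span_tokenize helper defined below):
#
#     class WhitespaceTokenizer(BaseTokenizer):
#         def span_tokenize(self, s):
#             return regex_span_tokenize(s, r'\s+')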
def regex_span_tokenize(s, regex):
"""Return spans that identify tokens in s split using regex."""
left = 0
for m in re.finditer(regex, s, re.U):
right, next_left = m.span()  # renamed to avoid shadowing the builtin ``next``
if right != 0:
yield left, right
left = next_left
yield left, len(s)
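# Illustrative behaviour (a sketch): matches of the regex act as separators,
# and the final span always extends to len(s), so trailing text is kept.
#
#     >>> list(regex_span_tokenize('a  b', r'\s+'))
#     [(0, 1), (3, 4)]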
class SentenceTokenizer(BaseTokenizer):
"""Sentence tokenizer that uses the Punkt algorithm by Kiss & Strunk (2006)."""
model = 'models/punkt_english.pickle' # This is available from NLTK
def __init__(self, model=None):
self.model = model if model is not None else self.model
self._tokenizer = None
log.debug('%s: Initializing with %s' % (self.__class__.__name__, self.model))
def get_sentences(self, text):
spans = self.span_tokenize(text.text)
return text._sentences_from_spans(spans)
def span_tokenize(self, s):
"""Return a list of integer offsets that identify sentences in the given text.
:param string s: The text to tokenize into sentences.
:rtype: iter(tuple(int, int))
"""
if self._tokenizer is None:
self._tokenizer = load_model(self.model)
# for debug in tokenizer.debug_decisions(s):
# log.debug(format_debug_decision(debug))
return self._tokenizer.span_tokenize(s)
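# Example usage (a sketch; assumes the punkt model has been downloaded, e.g.
# with the ``cde data download`` command):
#
#     >>> st = SentenceTokenizer()
#     >>> list(st.span_tokenize('First sentence. Second one.'))
#     [(0, 15), (16, 27)]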
class ChemSentenceTokenizer(SentenceTokenizer):
"""Sentence tokenizer that uses the Punkt algorithm by Kiss & Strunk (2006), trained on chemistry text."""
model = 'models/punkt_chem-1.0.pickle'
class WordTokenizer(BaseTokenizer):
"""Standard word tokenizer for generic English text."""
#: Split before and after these sequences, wherever they occur, unless entire token is one of these sequences
SPLIT = [
'----',
'––––', # \u2013 en dash
'————', # \u2014 em dash
'<--->',
'---',
'–––', # \u2013 en dash
'———', # \u2014 em dash
'<-->',
'-->',
'...',
'--',
'––', # \u2013 en dash
'——', # \u2014 em dash
'``',
"''",
'->',
'<',
'>',
'–', # \u2013 en dash
'—', # \u2014 em dash
'―', # \u2015 horizontal bar
'~', # \u007e Tilde
'⁓', # \u2053 Swung dash
'∼', # \u223c Tilde operator
'°', # \u00b0 Degrees
';',
'@',
'#',
'$',
'£', # \u00a3
'€', # \u20ac
'%',
'&',
'?',
'!',
'™', # \u2122
'®', # \u00ae
'…', # \u2026
'⋯', # \u22ef Mid-line ellipsis
'†', # \u2020 Dagger
'‡', # \u2021 Double dagger
'§', # \u00a7 Section sign
'¶', # \u00b6 Pilcrow sign
'≠', # \u2260
'≡', # \u2261
'≢', # \u2262
'≣', # \u2263
'≤', # \u2264
'≥', # \u2265
'≦', # \u2266
'≧', # \u2267
'≨', # \u2268
'≩', # \u2269
'≪', # \u226a
'≫', # \u226b
'≈', # \u2248
'=',
'÷', # \u00f7
'×', # \u00d7
'→', # \u2192
'⇄', # \u21c4
'"', # \u0022 Quote mark
'“', # \u201c
'”', # \u201d
'„', # \u201e
'‟', # \u201f
'‘', # \u2018 Left single quote
# '’', # \u2019 Right single quote - Regularly used as an apostrophe, so don't always split
'‚', # \u201a Single low quote
'‛', # \u201b Single reversed quote
'`', # \u0060
'´', # \u00b4
# Primes
'′', # \u2032
'″', # \u2033
'‴', # \u2034
'‵', # \u2035
'‶', # \u2036
'‷', # \u2037
'⁗', # \u2057
# Brackets
'(',
'[',
'{',
'}',
']',
')',
# Slashes
'/', # \u002f Solidus
'⁄', # \u2044 Fraction slash
'∕', # \u2215 Division slash
# Hyphens and Minuses
# '-', # \u002d Hyphen-minus
'−', # \u2212 Minus
'‒', # \u2012 figure dash
# '‐', # \u2010 Hyphen
# '‐', # \u2010 Hyphen
# '‑', # \u2011 Non-breaking hyphen
'+', # \u002b Plus
'±', # \u00b1 Plus/Minus
]
#: Split around these sequences unless they are followed by a digit
SPLIT_NO_DIGIT = [':', ',']
#: Split after these sequences if they start a word
SPLIT_START_WORD = ["''", "``", "'"]
#: Split before these sequences if they end a word
SPLIT_END_WORD = ["'s", "'m", "'d", "'ll", "'re", "'ve", "n't", "''", "'", "’s", "’m", "’d", "’ll", "’re", "’ve", "n’t", "’", "’’"]
#: Don't split full stop off last token if it is one of these sequences
NO_SPLIT_STOP = ['...', 'al.', 'Co.', 'Ltd.', 'Pvt.', 'A.D.', 'B.C.', 'B.V.', 'S.D.', 'U.K.', 'U.S.', 'r.t.']
#: Split these contractions at the specified index
CONTRACTIONS = [("cannot", 3), ("d'ye", 1), ("d’ye", 1), ("gimme", 3), ("gonna", 3), ("gotta", 3), ("lemme", 3),
("mor'n", 3), ("mor’n", 3), ("wanna", 3), ("'tis", 2), ("'twas", 2)]
#: Don't split these sequences.
NO_SPLIT = {
'mm-hm', 'mm-mm', 'o-kay', 'uh-huh', 'uh-oh', 'wanna-be'
}
#: Don't split around hyphens with these prefixes
NO_SPLIT_PREFIX = {
'e', 'a', 'u', 'x', 'agro', 'ante', 'anti', 'arch', 'be', 'bi', 'bio', 'co', 'counter', 'cross', 'cyber',
'de', 'eco', 'ex', 'extra', 'inter', 'intra', 'macro', 'mega', 'micro', 'mid', 'mini', 'multi', 'neo', 'non',
'over', 'pan', 'para', 'peri', 'post', 'pre', 'pro', 'pseudo', 'quasi', 're', 'semi', 'sub', 'super', 'tri',
'ultra', 'un', 'uni', 'vice'
}
#: Don't split around hyphens with these suffixes.
NO_SPLIT_SUFFIX = {
'esque', 'ette', 'fest', 'fold', 'gate', 'itis', 'less', 'most', 'o-torium', 'rama', 'wise'
}
#: Don't split around hyphens if only these characters before or after.
NO_SPLIT_CHARS = '0123456789,\'"“”„‟‘’‚‛`´′″‴‵‶‷⁗'
def __init__(self, split_last_stop=True):
#: Whether to split off the final full stop (unless preceded by NO_SPLIT_STOP). Default True.
self.split_last_stop = split_last_stop
def _split_span(self, span, index, length=0):
"""Split a span into two or three separate spans at certain indices."""
offset = span[1] + index if index < 0 else span[0] + index
# log.debug([(span[0], offset), (offset, offset + length), (offset + length, span[1])])
return [(span[0], offset), (offset, offset + length), (offset + length, span[1])]
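# Illustrative behaviour (a sketch): zero-length spans produced here are
# filtered out later by span_tokenize, so a length of 0 gives a two-way split.
#
#     >>> WordTokenizer()._split_span((10, 20), 3, 1)
#     [(10, 13), (13, 14), (14, 20)]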
def _subspan(self, s, span, nextspan, additional_regex):
"""Recursively subdivide spans based on a series of rules."""
text = s[span[0]:span[1]]
lowertext = text.lower()
# Skip if only a single character or a split sequence
if span[1] - span[0] < 2 or text in self.SPLIT or text in self.SPLIT_END_WORD or text in self.SPLIT_START_WORD or lowertext in self.NO_SPLIT:
return [span]
# Skip if it looks like URL
if text.startswith('http://') or text.startswith('ftp://') or text.startswith('www.'):
return [span]
# Split full stop at end of final token (allow certain characters to follow) unless ellipsis
if self.split_last_stop and nextspan is None and text not in self.NO_SPLIT_STOP and not text[-3:] == '...':
if text[-1] == '.':
return self._split_span(span, -1)
ind = text.rfind('.')
if ind > -1 and all(t in '\'‘’"“”)]}' for t in text[ind + 1:]):
return self._split_span(span, ind, 1)
# Split off certain sequences at the end of a word
for spl in self.SPLIT_END_WORD:
if text.endswith(spl) and len(text) > len(spl) and text[-len(spl) - 1].isalpha():
return self._split_span(span, -len(spl), 0)
# Split off certain sequences at the start of a word
for spl in self.SPLIT_START_WORD:
if text.startswith(spl) and len(text) > len(spl) and text[-len(spl) - 1].isalpha():
return self._split_span(span, len(spl), 0)
# Split around certain sequences
for spl in self.SPLIT:
ind = text.find(spl)
if ind > -1:
return self._split_span(span, ind, len(spl))
# Split around certain sequences unless followed by a digit
for spl in self.SPLIT_NO_DIGIT:
ind = text.rfind(spl)
if ind > -1 and (len(text) <= ind + len(spl) or not text[ind + len(spl)].isdigit()):
return self._split_span(span, ind, len(spl))
# Characters to split around, but with exceptions
for i, char in enumerate(text):
if char == '-':
before = lowertext[:i]
after = lowertext[i + 1:]
# By default we split on hyphens
split = True
if before in self.NO_SPLIT_PREFIX or after in self.NO_SPLIT_SUFFIX:
split = False # Don't split if prefix or suffix in list
elif not before.strip(self.NO_SPLIT_CHARS) or not after.strip(self.NO_SPLIT_CHARS):
split = False # Don't split if prefix or suffix entirely consist of certain characters
if split:
return self._split_span(span, i, 1)
# Split contraction words
for contraction in self.CONTRACTIONS:
if lowertext == contraction[0]:
return self._split_span(span, contraction[1])
additional_regex_handled = self.handle_additional_regex(s, span, nextspan, additional_regex)
if additional_regex_handled is not None:
return additional_regex_handled
return [span]
def get_word_tokens(self, sentence, additional_regex=None):
if not additional_regex:
additional_regex = self.get_additional_regex(sentence)
return sentence._tokens_for_spans(self.span_tokenize(sentence.text, additional_regex))
def get_additional_regex(self, sentence):
"""
Return any additional regular expressions used to further split tokens. These may be
supplied contextually, on the fly, by the sentence. For example, a sentence may have
certain models associated with it, and dimensions associated with those models; these
dimensions tell the tokenizer what to do with high confidence. For instance, given a
string like "12K", if a temperature is expected, the tokenizer will automatically
split the value from the unit.
:param chemdataextractor.doc.text.Sentence sentence: The sentence for which to get additional regex
:returns: Compiled expressions to further split the tokens, or None
:rtype: list of compiled regular expressions, or None
"""
return None
def handle_additional_regex(self, s, span, nextspan, additional_regex):
text = s[span[0]:span[1]]
if additional_regex:
for regex in additional_regex:
split_text = regex.search(text)
if split_text:
groups = split_text.groupdict()
for group_name, group in six.iteritems(groups):
if group is not None:
group_length = len(group)
if 'split' in group_name and group_length != 0:
return self._split_span(span, group_length, 0)
return None
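# Sketch of the contract: any named group whose name contains 'split' marks a
# cut at the end of that group. With a hypothetical pattern such as
#     re.compile(r'^(?P<split>\d+)K$')
# the token '12K' is divided into '12' and 'K', because the matched 'split'
# group ('12') has length 2.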
def span_tokenize(self, s, additional_regex=None):
"""Return a list of integer offsets that identify tokens in the given sentence.
:param string s: The sentence string to tokenize.
:param additional_regex: Optional regular expressions used to further split tokens.
:rtype: list(tuple(int, int))
"""
# First get spans by splitting on all whitespace
# Includes: \u0020 \u00A0 \u1680 \u180E \u2000 \u2001 \u2002 \u2003 \u2004 \u2005 \u2006 \u2007 \u2008 \u2009 \u200A \u202F \u205F \u3000
spans = [(left, right) for left, right in regex_span_tokenize(s, r'\s+') if left != right]
i = 0
# Recursively split spans according to rules
while i < len(spans):
subspans = self._subspan(s, spans[i], spans[i + 1] if i + 1 < len(spans) else None, additional_regex)
spans[i:i + 1] = [subspan for subspan in subspans if subspan[1] - subspan[0] > 0]
if len(subspans) == 1:
i += 1
return spans
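# Illustrative behaviour (a sketch; exact spans follow the rules above):
#
#     >>> wt = WordTokenizer()
#     >>> t = "The (quick) fox can't jump."
#     >>> [t[a:b] for a, b in wt.span_tokenize(t)]
#     ['The', '(', 'quick', ')', 'fox', 'ca', "n't", 'jump', '.']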
class ChemWordTokenizer(WordTokenizer):
"""Word Tokenizer for chemistry text."""
#: Split before and after these sequences, wherever they occur, unless entire token is one of these sequences
SPLIT = [
'----',
'––––', # \u2013 en dash
'————', # \u2014 em dash
'<--->',
'---',
'–––', # \u2013 en dash
'———', # \u2014 em dash
'<-->',
'-->',
'...',
'--',
'––', # \u2013 en dash
'——', # \u2014 em dash
# '``',
# "''",
# '->', # Don't split around this if occurs within chemical name
'<',
# '>', # Don't split around this if occurs within chemical name
').', # Fix missing whitespace errors
'.(', # Fix missing whitespace errors
'–', # \u2013 en dash
'—', # \u2014 em dash
'―', # \u2015 horizontal bar
'~', # \u007e Tilde
'⁓', # \u2053 Swung dash
'∼', # \u223c Tilde operator
'°', # \u00b0 Degrees
# ';',
'@',
'#',
'$',
'£', # \u00a3
'€', # \u20ac
'%',
'&',
'?',
'!',
'™', # \u2122
'®', # \u00ae
'…', # \u2026
'⋯', # \u22ef Mid-line ellipsis
'†', # \u2020 Dagger
'‡', # \u2021 Double dagger
'§', # \u00a7 Section sign
'¶', # \u00b6 Pilcrow sign
'≠', # \u2260
'≡', # \u2261
'≢', # \u2262
'≣', # \u2263
'≤', # \u2264
'≥', # \u2265
'≦', # \u2266
'≧', # \u2267
'≨', # \u2268
'≩', # \u2269
'≪', # \u226a
'≫', # \u226b
'≈', # \u2248
'=',
'÷', # \u00f7
'×', # \u00d7
# '→', # \u2192 # Don't split around this if occurs within chemical name
'⇄', # \u21c4
'"', # \u0022 Quote mark
'“', # \u201c
'”', # \u201d
'„', # \u201e
'‟', # \u201f
'‘', # \u2018 Left single quote
# '’', # \u2019 Right single quote - Regularly used as an apostrophe, so don't always split
'‚', # \u201a Single low quote
'‛', # \u201b Single reversed quote
'`', # \u0060
'´', # \u00b4
# Primes
# '′', # \u2032
# '″', # \u2033
# '‴', # \u2034
# '‵', # \u2035
# '‶', # \u2036
# '‷', # \u2037
# '⁗', # \u2057
# Brackets
# '(',
# '[',
# '{',
# '}',
# ']',
# ')',
# Slashes
# '/', # \u002f Solidus
# '⁄', # \u2044 Fraction slash
# '∕', # \u2215 Division slash
# Hyphens and Minuses
# '-', # \u002d Hyphen-minus
# '−', # \u2212 Minus
# '‒', # \u2012 figure dash
# '‐', # \u2010 Hyphen
# '‑', # \u2011 Non-breaking hyphen
# '+', # \u002b Plus
# '±', # \u00b1 Plus/Minus
]
#: Split before these sequences if they end a token
SPLIT_END = [':', ',', '(TM)', '(R)', '(®)', '(™)', '(■)', '(◼)', '(●)', '(▲)', '(○)', '(◆)', '(▼)', '(⧫)', '(△)', '(◇)', '(▽)', '(⬚)', '(×)', '(□)', '(•)', '’', '°C']
#: Split before these sequences if they end a token, unless preceded by a digit
SPLIT_END_NO_DIGIT = ['(aq)', '(aq.)', '(s)', '(l)', '(g)']
#: Don't split around slash when both preceded and followed by these characters
NO_SPLIT_SLASH = ['+', '-', '−']
#: Regular expression that matches a numeric quantity with units
QUANTITY_RE = re.compile(r'^((?P<split>\d\d\d)g|(?P<_split1>[-−]?\d+\.\d+|10[-−]\d+)(g|s|m|N|V)([-−]?[1-4])?|(?P<_split2>\d*[-−]?\d+\.?\d*)([pnµμm]A|[µμmk]g|[kM]J|m[lL]|[nµμm]?M|[nµμmc]m|kN|[mk]V|[mkMG]?W|[mnpμµ]s|Hz|[Mm][Oo][Ll](e|ar)?s?|k?Pa|ppm|min)([-−]?[1-4])?)$')
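# Sketch of what QUANTITY_RE does: the leading value is bound to a group whose
# name contains 'split', so handle_additional_regex cuts between value and
# unit, e.g. '3.5mL' into '3.5' and 'mL', and '0.1µM' into '0.1' and 'µM'.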
#: Don't split on hyphen if the prefix matches this regular expression
NO_SPLIT_PREFIX_ENDING = re.compile(r'(^\(.*\)|^[\d,\'"“”„‟‘’‚‛`´′″‴‵‶‷⁗Α-Ωα-ω]+|ano|ato|azo|boc|bromo|cbz|chloro|eno|fluoro|fmoc|ido|ino|io|iodo|mercapto|nitro|ono|oso|oxalo|oxo|oxy|phospho|telluro|tms|yl|ylen|ylene|yliden|ylidene|ylidyn|ylidyne)$', re.U)
#: Don't split on hyphen if prefix or suffix match this regular expression
NO_SPLIT_CHEM = re.compile(r'([\-α-ω]|\d+,\d+|\d+[A-Z]|^d\d\d?$|acetic|acetyl|acid|acyl|anol|azo|benz|bromo|carb|cbz|chlor|cyclo|ethan|ethyl|fluoro|fmoc|gluc|hydro|idyl|indol|iene|ione|iodo|mercapto|n,n|nitro|noic|o,o|oxalo|oxo|oxy|oyl|onyl|phen|phth|phospho|pyrid|telluro|tetra|tms|ylen|yli|zole|alpha|beta|gamma|delta|epsilon|theta|kappa|lambda|sigma|omega)', re.U | re.I)
#: Don't split on hyphen if the prefix is one of these sequences
NO_SPLIT_PREFIX = {
'e', 'a', 'u', 'x', 'agro', 'ante', 'anti', 'arch', 'be', 'bi', 'bio', 'co', 'counter', 'cross', 'cyber',
'de', 'eco', 'ex', 'extra', 'inter', 'intra', 'macro', 'mega', 'micro', 'mid', 'mini', 'multi', 'neo', 'non',
'over', 'pan', 'para', 'peri', 'post', 'pre', 'pro', 'pseudo', 'quasi', 're', 'semi', 'sub', 'super', 'tri',
'ultra', 'un', 'uni', 'vice',
'aci', 'adeno', 'aldehydo', 'allo', 'alpha', 'altro', 'ambi', 'aorto', 'arachno', 'as', 'beta', 'bis', 'catena',
'centi', 'chi', 'chiro', 'circum', 'cis', 'closo', 'colo', 'conjuncto', 'conta', 'contra', 'cortico', 'cosa',
'counter', 'cran', 'crypto', 'cyclo', 'deca', 'deci', 'delta', 'demi', 'di', 'dis', 'dl', 'eco', 'electro',
'endo', 'ennea', 'ent', 'epi', 'epsilon', 'erythro', 'eta', 'exo', 'ferro', 'galacto', 'gamma', 'gastro',
'giga', 'gluco', 'glycero', 'graft', 'gulo', 'hemi', 'hepta', 'hexa', 'homo', 'hydro', 'hypho', 'hypo', 'ideo',
'idio', 'in', 'infra', 'iota', 'iso', 'judeo', 'kappa', 'keto', 'kis', 'lambda', 'lyxo', 'manno', 'medi',
'meso', 'meta', 'milli', 'mono', 'mu', 'muco', 'musculo', 'myo', 'nano', 'neuro', 'nido', 'nitro', 'nona',
'nor', 'novem', 'novi', 'nu', 'octa', 'octi', 'octo', 'omega', 'omicron', 'ortho', 'paleo', 'pelvi', 'penta',
'pheno', 'phi', 'pi', 'pica', 'pneumo', 'poly', 'preter', 'psi', 'quadri', 'quater', 'quinque', 'recto', 'rho',
'ribo', 'salpingo', 'scyllo', 'sec', 'sept', 'septi', 'sero', 'sesqui', 'sexi', 'sigma', 'sn', 'soci', 'supra',
'sur', 'sym', 'syn', 'talo', 'tau', 'tele', 'ter', 'tera', 'tert', 'tetra', 'theta', 'threo', 'trans',
'triangulo', 'tris', 'uber', 'unsym', 'upsilon', 'veno', 'ventriculo', 'xi', 'xylo', 'zeta',
}
#: Split on hyphens followed by one of these sequences
SPLIT_SUFFIX = {
'absorption', 'abstinent', 'abstraction', 'abuse', 'accelerated', 'accepting', 'acclimated', 'acclimation',
'acid', 'activated', 'activation', 'active', 'activity', 'addition', 'adducted', 'adducts', 'adequate',
'adjusted', 'administrated', 'adsorption', 'affected', 'aged', 'alcohol', 'alcoholic', 'algae', 'alginate',
'alkaline', 'alkylated', 'alkylation', 'alkyne', 'analogous', 'anesthetized', 'appended', 'armed', 'aromatic',
'assay', 'assemblages', 'assisted', 'associated', 'atom', 'atoms', 'attenuated', 'attributed', 'backbone',
'base', 'based', 'bearing', 'benzylation', 'binding', 'biomolecule', 'biotic', 'blocking', 'blood', 'bond',
'bonded', 'bonding', 'bonds', 'boosted', 'bottle', 'bottled', 'bound', 'bridge', 'bridged', 'buffer',
'buffered', 'caged', 'cane', 'capped', 'capturing', 'carrier', 'carrying', 'catalysed', 'catalyzed', 'cation',
'caused', 'centered', 'challenged', 'chelating', 'cleaving', 'coated', 'coating', 'coenzyme', 'competing',
'competitive', 'complex', 'complexes', 'compound', 'compounds', 'concentration', 'conditioned', 'conditions',
'conducting', 'configuration', 'confirmed', 'conjugate', 'conjugated', 'conjugates', 'connectivity',
'consuming', 'contained', 'containing', 'contaminated', 'control', 'converting', 'coordinate', 'coordinated',
'copolymer', 'copolymers', 'core', 'cored', 'cotransport', 'coupled', 'covered', 'crosslinked', 'cyclized',
'damaged', 'dealkylation', 'decocted', 'decorated', 'deethylation', 'deficiency', 'deficient', 'defined',
'degrading', 'demethylated', 'demethylation', 'dendrimer', 'density', 'dependant', 'dependence', 'dependent',
'deplete', 'depleted', 'depleting', 'depletion', 'depolarization', 'depolarized', 'deprived', 'derivatised',
'derivative', 'derivatives', 'derivatized', 'derived', 'desorption', 'detected', 'devalued', 'dextran',
'dextrans', 'diabetic', 'dimensional', 'dimer', 'distribution', 'divalent', 'domain', 'dominated',
'donating', 'donor', 'dopant', 'doped', 'doping', 'dosed', 'dot', 'drinking', 'driven', 'drug', 'drugs', 'dye',
'edge', 'efficiency', 'electrodeposited', 'electrolyte', 'elevating', 'elicited', 'embedded', 'emersion',
'emitting', 'encapsulated', 'encapsulating', 'enclosed', 'enhanced', 'enhancing', 'enriched', 'enrichment',
'enzyme', 'epidermal', 'equivalents', 'etched', 'ethanolamine', 'evoked', 'exchange', 'excimer', 'excluder',
'expanded', 'experimental', 'exposed', 'exposure', 'expressing', 'extract', 'extraction', 'fed', 'finger', 'fixed', 'fixing',
'flanking', 'flavonoid', 'fluorescence', 'formation', 'forming', 'fortified', 'free', 'function',
'functionalised', 'functionalized', 'functionalyzed', 'fused', 'gas', 'gated', 'generating', 'glucuronidating',
'glycoprotein', 'glycosylated', 'glycosylation', 'gradient', 'grafted', 'group', 'groups', 'halogen',
'heterocyclic', 'homologues', 'hydrogel', 'hydrolyzing', 'hydroxylated', 'hydroxylation', 'hydroxysteroid',
'immersion', 'immobilized', 'immunoproteins', 'impregnated', 'imprinted', 'inactivated', 'increased',
'increasing', 'incubated', 'independent', 'induce', 'induced', 'inducible', 'inducing', 'induction', 'influx',
'inhibited', 'inhibitor', 'inhibitory', 'initiated', 'injected', 'insensitive', 'insulin', 'integrated',
'interlinked', 'intermediate', 'intolerant', 'intoxicated', 'ion', 'ions', 'island', 'isomer', 'isomers',
'knot', 'label', 'labeled', 'labeling', 'labelled', 'laden', 'lamp', 'laser', 'layer', 'layers', 'lesioned',
'ligand', 'ligated', 'like', 'limitation', 'limited', 'limiting', 'lined', 'linked', 'linker', 'lipid',
'lipids', 'lipoprotein', 'liposomal', 'liposomes', 'liquid', 'liver', 'loaded', 'loading', 'locked', 'loss',
'lowering', 'lubricants', 'luminance', 'luminescence', 'maintained', 'majority', 'making', 'mannosylated',
'material', 'mediated', 'metabolizing', 'metal', 'metallized', 'methylation', 'migrated', 'mimetic',
'mimicking', 'mixed', 'mixture', 'mode', 'model', 'modified', 'modifying', 'modulated', 'moiety', 'molecule',
'monoadducts', 'monomer', 'mutated', 'nanogel', 'nanoparticle', 'nanotube', 'need', 'negative', 'nitrosated',
'nitrosation', 'nitrosylation', 'nmr', 'noncompetitive', 'normalized', 'nuclear', 'nucleoside', 'nucleosides',
'nucleotide', 'nucleotides', 'nutrition', 'olefin', 'olefins', 'oligomers', 'omitted', 'only',
'outcome', 'overload', 'oxidation', 'oxidized', 'oxo-mediated', 'oxygenation', 'page', 'paired', 'pathway',
'patterned', 'peptide', 'permeabilized', 'permeable', 'phase', 'phospholipids', 'phosphopeptide',
'phosphorylated', 'pillared', 'placebo', 'planted', 'plasma', 'polymer', 'polymers', 'poor', 'porous',
'position', 'positive', 'postlabeling', 'precipitated', 'preferring', 'pretreated', 'primed', 'produced',
'producing', 'production', 'promoted', 'promoting', 'protected', 'protein', 'proteomic', 'protonated',
'provoked', 'purified', 'radical', 'reacting', 'reaction', 'reactive', 'reagents', 'rearranged', 'receptor',
'receptors', 'recognition', 'redistribution', 'redox', 'reduced', 'reducing', 'reduction', 'refractory',
'refreshed', 'regenerating', 'regulated', 'regulating', 'regulatory', 'related', 'release', 'releasing',
'replete', 'requiring', 'resistance', 'resistant', 'resitant', 'response', 'responsive', 'responsiveness',
'restricted', 'resulted', 'retinal', 'reversible', 'ribosylated', 'ribosylating', 'ribosylation', 'rich',
'right', 'ring', 'saturated', 'scanning', 'scavengers', 'scavenging', 'sealed', 'secreting', 'secretion',
'seeking', 'selective', 'selectivity', 'semiconductor', 'sensing', 'sensitive', 'sensitized', 'soluble',
'solution', 'solvent', 'sparing', 'specific', 'spiked', 'stabilised', 'stabilized', 'stabilizing', 'stable',
'stained', 'steroidal', 'stimulated', 'stimulating', 'storage', 'stressed', 'stripped', 'substituent',
'substituted', 'substitution', 'substrate', 'sufficient', 'sugar', 'sugars', 'supplemented', 'supported',
'suppressed', 'surface', 'susceptible', 'sweetened', 'synthesizing', 'tagged', 'target', 'telopeptide',
'terminal', 'terminally', 'terminated', 'termini', 'terminus', 'ternary', 'terpolymer', 'tertiary', 'tested',
'testes', 'tethered', 'tetrabrominated', 'tolerance', 'tolerant', 'toxicity', 'toxin', 'tracer', 'transfected',
'transfer', 'transition', 'transport', 'transporter', 'treated', 'treating', 'treatment', 'triggered',
'turn', 'type', 'unesterified', 'untreated', 'vacancies', 'vacancy', 'variable', 'water', 'yeast', 'yield',
'zwitterion'
}
NO_SPLIT = {'°c'}
def get_additional_regex(self, sentence):
additional_regex = [self.QUANTITY_RE]
quantity_re = sentence.quantity_re
if quantity_re:
additional_regex.append(quantity_re)
# print('quantity re added')
return additional_regex
def _closing_bracket_index(self, text, bpair=('(', ')')):
"""Return the index of the closing bracket that matches the opening bracket at the start of the text."""
level = 1
for i, char in enumerate(text[1:]):
if char == bpair[0]:
level += 1
elif char == bpair[1]:
level -= 1
if level == 0:
return i + 1
def _opening_bracket_index(self, text, bpair=('(', ')')):
"""Return the index of the opening bracket that matches the closing bracket at the end of the text."""
level = 1
for i, char in enumerate(reversed(text[:-1])):
if char == bpair[1]:
level += 1
elif char == bpair[0]:
level -= 1
if level == 0:
return len(text) - i - 2
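# Illustrative behaviour of the bracket helpers (a sketch; both return None
# when no matching bracket exists within the token):
#
#     >>> ChemWordTokenizer()._closing_bracket_index('(a(b)c)')
#     6
#     >>> ChemWordTokenizer()._opening_bracket_index('a(b)')
#     1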
def _is_number(self, text):
"""Return True if the text is a number."""
try:
float(text)
return True
except ValueError:
return False
def _is_saccharide_arrow(self, before, after):
"""Return True if the arrow is in a chemical name."""
if (before and after and before[-1].isdigit() and after[0].isdigit() and
before.rstrip('0123456789').endswith('(') and after.lstrip('0123456789').startswith(')-')):
return True
else:
return False
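# Sketch: in a glycosidic linkage such as 'Gal-(1→4)-Glc', the text around the
# arrow is before='Gal-(1' and after='4)-Glc', so this returns True and the
# arrow is kept inside the chemical name.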
def _subspan(self, s, span, nextspan, additional_regex):
"""Recursively subdivide spans based on a series of rules."""
if additional_regex is None:
additional_regex = [self.QUANTITY_RE]
text = s[span[0]:span[1]]
lowertext = text.lower()
# Skip if only a single character or a split sequence
if span[1] - span[0] < 2 or text in self.SPLIT or text in self.SPLIT_END_WORD or text in self.SPLIT_START_WORD or lowertext in self.NO_SPLIT:
return [span]
# Skip if it looks like URL
if text.startswith('http://') or text.startswith('ftp://') or text.startswith('www.'):
return [span]
# Split full stop at end of final token (allow certain characters to follow) unless ellipsis
if self.split_last_stop and nextspan is None and text not in self.NO_SPLIT_STOP and not text[-3:] == '...':
if text[-1] == '.':
return self._split_span(span, -1)
ind = text.rfind('.')
if ind > -1 and all(t in '\'‘’"“”)]}' for t in text[ind + 1:]):
return self._split_span(span, ind, 1)
# Split off certain sequences at the end of a token
for spl in self.SPLIT_END:
if text.endswith(spl) and len(text) > len(spl):
return self._split_span(span, -len(spl), 0)
# Split off certain sequences at the end of a word
for spl in self.SPLIT_END_WORD:
if text.endswith(spl) and len(text) > len(spl) and text[-len(spl) - 1].isalpha():
return self._split_span(span, -len(spl), 0)
# Split off certain sequences at the start of a word
for spl in self.SPLIT_START_WORD:
if text.startswith(spl) and len(text) > len(spl) and text[-len(spl) - 1].isalpha():
return self._split_span(span, len(spl), 0)
# Split around certain sequences
for spl in self.SPLIT:
ind = text.find(spl)
if ind > -1:
return self._split_span(span, ind, len(spl))
# Split around certain sequences unless followed by a digit
# - We skip this because of difficulty with chemical names.
# for spl in self.SPLIT_NO_DIGIT:
# ind = text.rfind(spl)
# if ind > -1 and (len(text) <= ind + len(spl) or not text[ind + len(spl)].isdigit()):
# return self._split_span(span, ind, len(spl))
# Split off certain sequences at the end of a token unless preceded by a digit
for spl in self.SPLIT_END_NO_DIGIT:
if text.endswith(spl) and len(text) > len(spl) and not text[-len(spl) - 1].isdigit():
return self._split_span(span, -len(spl), 0)
# Regular Bracket at both start and end, break off both provided they correspond
if text.startswith('(') and text.endswith(')') and self._closing_bracket_index(text) == len(text) - 1:
return self._split_span(span, 1, len(text)-2)
# Split things like IR(KBr)
if text.startswith('IR(') and text.endswith(')'):
return self._split_span(span, 2, 1)
# Split things like \d+\.\d+([a-z]+) e.g. UV-vis/IR peaks with bracketed strength/shape
m = re.match(r'^(\d+\.\d+|\d{3,})(\([a-z]+\))$', text, re.I)
if m:
return self._split_span(span, m.start(2), 1)
# Split brackets off start and end if the corresponding bracket isn't within token
for bpair in [('(', ')'), ('{', '}'), ('[', ']')]:
#level = bracket_level(text, open=[bpair[0]], close=[bpair[1]])
# Bracket at start, bracketlevel > 0, break it off
if text.startswith(bpair[0]) and self._closing_bracket_index(text, bpair=bpair) is None:
return self._split_span(span, 1, 0)
# Bracket at end, bracketlevel < 0, break it off
if text.endswith(bpair[1]) and self._opening_bracket_index(text, bpair=bpair) is None:
return self._split_span(span, -1, 0)
# TODO: Consider splitting around comma in limited circumstances. Mainly to fix whitespace errors.
# Characters to split around, but with exceptions
for i, char in enumerate(text):
before = text[:i]
after = text[i+1:]
if char in {':', ';'}:
# Split around colon unless it looks like we're in a chemical name
if not (before and after and after[0].isdigit() and before.rstrip('′\'')[-1:].isdigit() and '-' in after) and not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)):
return self._split_span(span, i, 1)
elif char in {'x', '+', '−'}:
# Split around x, +, − (\u2212 minus) between two numbers or at start followed by numbers
if (i == 0 or self._is_number(before)) and self._is_number(after):
return self._split_span(span, i, 1)
# Also split around − (\u2212 minus) between two letters
if char == '−' and before and before[-1].isalpha() and after and after[0].isalpha():
return self._split_span(span, i, 1)
elif char == '±':
# Split around ± unless surrounded by brackets
if not (before and after and before[-1] == '(' and after[0] == ')'):
return self._split_span(span, i, 1)
elif char == '/':
# Split around / unless '+/-' or '-/-' etc.
if not (before and after and before[-1] in self.NO_SPLIT_SLASH and after[0] in self.NO_SPLIT_SLASH):
return self._split_span(span, i, 1)
elif char == '>':
if not (before and before[-1] == '-'):
# Split if preceding is not -
return self._split_span(span, i, 1)
if before and before[-1] == '-':
# If preceding is -, split around -> unless in chemical name
if not text == '->' and not self._is_saccharide_arrow(before[:-1], after):
return self._split_span(span, i-1, 2)
elif char == '→' and not self._is_saccharide_arrow(before, after):
# Split around → unless in chemical name
return self._split_span(span, i, 1)
elif char == '(' and self._is_number(before) and '(' not in after and ')' not in after:
# Split around open bracket after a number
return self._split_span(span, i, 1)
elif char == '-':
lowerbefore = lowertext[:i]
lowerafter = lowertext[i+1:]
# Always split connecting words surrounded by hyphens: -of-the-, -on-a-, -of-a-, -to-, -in-, -by-, -of-, -or-, -on-, -and-, -per-, -the-
if lowerafter[:7] == 'of-the-':
return [(span[0], span[0] + i), (span[0] + i, span[0] + i + 1), (span[0] + i + 1, span[0] + i + 3), (span[0] + i + 3, span[0] + i + 4), (span[0] + i + 4, span[0] + i + 7), (span[0] + i + 7, span[0] + i + 8), (span[0] + i + 8, span[1])]
if lowerafter[:5] in {'on-a-', 'of-a-'}:
return [(span[0], span[0] + i), (span[0] + i, span[0] + i + 1), (span[0] + i + 1, span[0] + i + 3), (span[0] + i + 3, span[0] + i + 4), (span[0] + i + 4, span[0] + i + 5), (span[0] + i + 5, span[0] + i + 6), (span[0] + i + 6, span[1])]
if lowerafter[:3] in {'to-', 'in-', 'by-', 'of-', 'or-', 'on-'}:
return [(span[0], span[0] + i), (span[0] + i, span[0] + i + 1), (span[0] + i + 1, span[0] + i + 3), (span[0] + i + 3, span[0] + i + 4), (span[0] + i + 4, span[1])]
if lowerafter[:4] in {'and-', 'per-', 'the-'}:
return [(span[0], span[0] + i), (span[0] + i, span[0] + i + 1), (span[0] + i + 1, span[0] + i + 4), (span[0] + i + 4, span[0] + i + 5), (span[0] + i + 5, span[1])]
# By default we split on hyphens
split = True
if lowerafter == 'nmr':
split = True # Always split NMR off end
elif bracket_level(text) == 0 and (not bracket_level(after) == 0 or not bracket_level(before) == 0):
split = False # Don't split if within brackets
elif after and after[0] == '>':
split = False # Don't split if followed by >
elif lowerbefore in self.NO_SPLIT_PREFIX or lowerafter in self.NO_SPLIT_SUFFIX:
split = False # Don't split if prefix or suffix in list
elif self.NO_SPLIT_PREFIX_ENDING.search(lowerbefore):
split = False # Don't split if prefix ends with pattern
elif lowerafter in self.SPLIT_SUFFIX:
split = True # Do split if suffix in list
elif len(before) <= 1 or len(after) <= 2:
split = False # Don't split if not at least 2 char before and 3 after
elif self.NO_SPLIT_CHEM.search(lowerbefore) or self.NO_SPLIT_CHEM.search(lowerafter):
split = False # Don't split if prefix or suffix match chem regex
if split:
return self._split_span(span, i, 1)
# TODO: Errors:
# [³H]-choline
# S,S'-...
# 1,4-di-substituted
# 11-β - hydroxysteroid
# Spelt out greek: 11beta - hydroxysteroid
# ...-N-substituted like 2,5-dimethyl-N-substituted pyrroles
# 4-(2-Butyl-6,7-dichloro-2-cyclopentyl-indan-1-on-5-yl) oxobutyric acid
# Adenosine - monophosphate
# Consistency for amino acids: Arg-Arg and Arg-Arg-Asp... probably always split
# D,L-α-peptide?
# N'-formylkynurenine
# poly(D,L-lactic acid )?
# poly(methyl metha-acrylate )?
# Poly(N - alkyl Acrylamide )
# poly(N - isopropyl acrylamide )
# R,S - lorazepam
# S,S - dioxide
# Split units off the end of a numeric value
# quantity = self.QUANTITY_RE.search(text)
# if quantity:
# return self._split_span(span, len(quantity.group(6) or quantity.group(3) or quantity.group(2)), 0)
# Split pH off the start of a numeric value
if text.startswith('pH') and self._is_number(text[2:]):
return self._split_span(span, 2, 0)
# Split contraction words
for contraction in self.CONTRACTIONS:
if lowertext == contraction[0]:
return self._split_span(span, contraction[1])
additional_regex_handled = self.handle_additional_regex(s, span, nextspan, additional_regex)
if additional_regex_handled is not None:
return additional_regex_handled
if nextspan:
nexttext = s[nextspan[0]:nextspan[1]]
# Split NMR isotope whitespace errors (joined with previous sentence full stop)
if nexttext == 'NMR':
ind = text.rfind('.')
if ind > -1 and text[ind + 1:] in {'1H', '13C', '15N', '31P', '19F', '11B', '29Si', '17O', '73Ge', '195Pt', '33S', '13C{1H}'}:
return self._split_span(span, ind, 1)
return [span]
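# Illustrative behaviour (a sketch; exact spans follow the rules above):
#
#     >>> cwt = ChemWordTokenizer()
#     >>> t = 'Mp of 2,2′-bipyridine: 70°C'
#     >>> [t[a:b] for a, b in cwt.span_tokenize(t)]
#     ['Mp', 'of', '2,2′-bipyridine', ':', '70', '°C']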
class FineWordTokenizer(WordTokenizer):
"""Word tokenizer that also splits around hyphens and all colons."""
#: Split before and after these sequences, wherever they occur, unless entire token is one of these sequences
SPLIT = [
'----',
'––––', # \u2013 en dash
'————', # \u2014 em dash
'<--->',
'---',
'–––', # \u2013 en dash
'———', # \u2014 em dash
'<-->',
'-->',
'...',
'--',
'––', # \u2013 en dash
'——', # \u2014 em dash
'``',
"''",
'->',
'<',
'>',
'–', # \u2013 en dash
'—', # \u2014 em dash
'―', # \u2015 horizontal bar
'~', # \u007e Tilde
'⁓', # \u2053 Swung dash
'∼', # \u223c Tilde operator
'°', # \u00b0 Degrees
';',
'@',
'#',
'$',
'£', # \u00a3
'€', # \u20ac
'%',
'&',
'?',
'!',
'™', # \u2122
'®', # \u00ae
'…', # \u2026
'⋯', # \u22ef Mid-line ellipsis
'†', # \u2020 Dagger
'‡', # \u2021 Double dagger
'§', # \u00a7 Section sign
'¶', # \u00b6 Pilcrow sign
'≠', # \u2260
'≡', # \u2261
'≢', # \u2262
'≣', # \u2263
'≤', # \u2264
'≥', # \u2265
'≦', # \u2266
'≧', # \u2267
'≨', # \u2268
'≩', # \u2269
'≪', # \u226a
'≫', # \u226b
'≈', # \u2248
'=',
'÷', # \u00f7
'×', # \u00d7
'→', # \u2192
'⇄', # \u21c4
'"', # \u0022 Quote mark
'“', # \u201c
'”', # \u201d
'„', # \u201e
'‟', # \u201f
'‘', # \u2018 Left single quote
'’', # \u2019 Right single quote
'‚', # \u201a Single low quote
'‛', # \u201b Single reversed quote
'`', # \u0060
'´', # \u00b4
# Primes
'′', # \u2032
'″', # \u2033
'‴', # \u2034
'‵', # \u2035
'‶', # \u2036
'‷', # \u2037
'⁗', # \u2057
# Brackets
'(',
'[',
'{',
'}',
']',
')',
# Slashes
'/', # \u002f Solidus
'⁄', # \u2044 Fraction slash
'∕', # \u2215 Division slash
# Hyphens and Minuses
'-', # \u002d Hyphen-minus
'−', # \u2212 Minus
'‒', # \u2012 figure dash
'‐', # \u2010 Hyphen
'‑', # \u2011 Non-breaking hyphen
'+', # \u002b Plus
'±', # \u00b1 Plus/Minus
':',
]
#: Split around these sequences unless they are followed by a digit
SPLIT_NO_DIGIT = [',']
NO_SPLIT = set()
#: Don't split around hyphens with these prefixes
NO_SPLIT_PREFIX = set()
#: Don't split around hyphens with these suffixes.
NO_SPLIT_SUFFIX = set()
def _subspan(self, s, span, nextspan, additional_regex):
"""Recursively subdivide spans based on a series of rules."""
# Split on boundaries between greek and non-greek
text = s[span[0]:span[1]]
for i, char in enumerate(text):
if i < len(text) - 1:
nextchar = text[i + 1]
if (char in GREEK and nextchar not in GREEK) or (char not in GREEK and nextchar in GREEK):
return [(span[0], span[0] + i + 1), (span[0] + i + 1, span[1])]
# Perform all normal WordTokenizer splits
return super(FineWordTokenizer, self)._subspan(s, span, nextspan, additional_regex)
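# Sketch: 'β-alanine' is first split at the greek/latin boundary into 'β' and
# '-alanine'; the inherited rules then split the hyphen off as well, giving
# ['β', '-', 'alanine'].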
class BertWordTokenizer(ChemWordTokenizer):
"""
A word tokenizer backed by BERT's WordPiece tokenizer, with some additional allowances
so that its choices can be overridden. Concrete overrides used in CDE include not
splitting when a decimal point appears in the middle of a number, and splitting values
from their units.
"""
do_not_split = []
do_not_split_if_in_num = [".", ","]
def __init__(self, split_last_stop=True, path=None, lowercase=True):
super().__init__(split_last_stop)
if path is None:
path = find_data('models/scibert_uncased_vocab-1.0.txt')
self.tokenizer = BertWordPieceTokenizer(path, lowercase=lowercase)
def span_tokenize(self, s, additional_regex=None):
output = self.tokenizer.encode(str(s))
# Drop the special [CLS] and [SEP] tokens that BERT adds at either end
offsets = output.offsets[1: -1]
given_tokens = output.tokens[1: -1]
current_span = (0, 0)
spans = []
i = 0
zipped = list(zip(offsets, given_tokens))
while i < len(zipped):
offset, token = zipped[i]
# If the symbol is in do_not_split_if_in_num, directly follows the current
# span, and is itself followed by a run of digits (e.g. a decimal point
# inside a number), don't split it
if (s[offset[0]: offset[1]] in self.do_not_split_if_in_num and offset[0] == current_span[1]
and i < len(zipped) - 1 and zipped[i + 1][0][0] == offset[1]
and re.match(r"\d+$", s[zipped[i + 1][0][0]: zipped[i + 1][0][1]])):
i += 1
offset, token = zipped[i]
current_span = (current_span[0], offset[1])
# If symbol is in do_not_split and it's part of a word, i.e. it's not surrounded
# by whitespace, then don't split it
elif (s[offset[0]: offset[1]] in self.do_not_split and offset[0] == current_span[1]
and i < len(zipped) - 1 and zipped[i + 1][0][0] == offset[1]):
i += 1
offset, token = zipped[i]
current_span = (current_span[0], offset[1])
# Prevent splitting of negative numbers, but allow splitting of ranges such as 0.5-1.0
# and cases like 5-Bromo-6-Penda...
elif (s[offset[0]: offset[1]] == "-"
and i < len(zipped) - 1 and zipped[i + 1][0][0] == offset[1]
and re.match(r"\d+$", s[zipped[i + 1][0][0]: zipped[i + 1][0][1]])
and (i == 0
or not (zipped[i - 1][0][1] == offset[0]
and re.match(r"\d+$", s[zipped[i - 1][0][0]: zipped[i - 1][0][1]])))
and (i >= len(zipped) - 2
or not (zipped[i + 2][0][0] == zipped[i + 1][0][1]
and s[zipped[i + 2][0][0]: zipped[i + 2][0][1]] == "-"))):
i += 1
if current_span != (0, 0):
spans.append(current_span)
current_span = offset
offset, token = zipped[i]
current_span = (current_span[0], offset[1])
# If the token is a subword, as defined by BERT, then merge it with the previous token
elif len(token) > 2 and token[:2] == "##":
current_span = (current_span[0], offset[1])
# Otherwise, split it
else:
if current_span != (0, 0):
spans.append(current_span)
current_span = offset
i += 1
spans.append(current_span)
# Perform additional tokenisation as required by the additional regex
if additional_regex is not None:
i = 0
while i < len(spans):
subspans = self.handle_additional_regex(s, spans[i], spans[i + 1] if i + 1 < len(spans) else None, additional_regex)
if subspans is None:
subspans = [spans[i]]
spans[i:i + 1] = [subspan for subspan in subspans if subspan[1] - subspan[0] > 0]
if len(subspans) == 1:
i += 1
return spans
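# Illustrative usage (a sketch; assumes the SciBERT vocab file is available
# via find_data):
#
#     bwt = BertWordTokenizer()
#     spans = bwt.span_tokenize('The melting point was 146.5K')
#
# WordPiece '##' subwords are merged back into whole tokens, and '146.5' stays
# together because the '.' sits between digits; passing additional_regex (e.g.
# a quantity pattern) would further split the value from the unit.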