Source code for chemdataextractor.nlp.abbrev

# -*- coding: utf-8 -*-
"""
Abbreviation detection.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from ..text import bracket_level


# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)


class AbbreviationDetector(object):
    """Detect abbreviation definitions in a list of tokens.

    Similar to the algorithm in Schwartz & Hearst 2003.

    All spans handled by this class are ``(start, end)`` tuples of token
    indices with an exclusive end, i.e. ``tokens[start:end]``.
    """
    # TODO: Extend to Greek characters (custom method instead of .isalnum())

    #: Minimum abbreviation length
    abbr_min = 3
    #: Maximum abbreviation length
    abbr_max = 10
    #: String equivalents to use when detecting abbreviations.
    abbr_equivs = []

    def __init__(self, abbr_min=None, abbr_max=None, abbr_equivs=None):
        """Create a detector, optionally overriding the class-level settings.

        :param abbr_min: Minimum abbreviation length, or None to keep the class default.
        :param abbr_max: Maximum abbreviation length, or None to keep the class default.
        :param abbr_equivs: List of ``(before, after)`` string pairs, or None to keep
            the class default.
        """
        self.abbr_min = abbr_min if abbr_min is not None else self.abbr_min
        self.abbr_max = abbr_max if abbr_max is not None else self.abbr_max
        self.abbr_equivs = abbr_equivs if abbr_equivs is not None else self.abbr_equivs

    def _is_allowed_abbr(self, tokens):
        """Return True if the given tokens form an allowed abbreviation.

        :param tokens: List of token strings that make up the candidate abbreviation.
        """
        num_hyph = tokens.count("-")
        # Abbreviations should contain at most 2 tokens. A hyphenated word such as
        # ab-cd is split into three tokens, so subtract the hyphen itself and the
        # extra token produced by the split for every hyphen.
        if len(tokens) - 2 * num_hyph > 2:
            return False
        abbr_text = ''.join(tokens)
        # Check the number of characters (hyphens excluded) and that brackets are
        # balanced or absent.
        if not (self.abbr_min <= len(abbr_text) - num_hyph <= self.abbr_max and bracket_level(abbr_text) == 0):
            return False
        if not (abbr_text[0].isalnum() and any(c.isalpha() for c in abbr_text)):
            return False
        # Disallow property values: int or float followed by "g", "ml"/"mL" or "cm".
        # TODO: generalize to any units
        if re.match(r'^\d+(\.\d+)?(g|m[lL]|cm)$', abbr_text):
            return False
        return True

    def _max_long_length(self, abbr):
        """Return the upper limit on the number of tokens in the full name.

        :param abbr: List of token strings that make up the abbreviation.
        """
        abbr_len = len(''.join(abbr))
        return min(abbr_len + 5, abbr_len * 2)

    def _get_candidates(self, tokens):
        """Find candidate pairs of abbreviation span and full-name span.

        :param tokens: List of token strings.
        :returns: List of ``(abbr_span, long_span)`` tuples.
        """
        candidates = []
        bracket_spans = []
        for i, t1 in enumerate(tokens):
            # Find potential abbreviation and full names in the form short_name=long_name
            if t1 == '=':
                # abbr = long; only a single-token abbreviation is allowed here.
                abbr_span = (i - 1, i)
                abbr = tokens[abbr_span[0]:abbr_span[1]]
                if i > 0 and self._is_allowed_abbr(abbr):
                    long_span = self._get_long_span(tokens, abbr_span, start=i + 1)
                    if long_span:
                        candidates.append((abbr_span, long_span))
            # Find potential abbreviation and full names in the form long_name (short_name)
            if t1 == '(':
                for j, t2 in enumerate(tokens[i + 1:]):
                    if t2 in {')', ';', ','}:
                        # Span of the tokens between '(' and the first closing
                        # delimiter; tokens[span[1]] is that delimiter.
                        bracket_spans.append((i + 1, i + j + 1))
                        break

        for span in bracket_spans:
            inside = tokens[span[0]:span[1]]
            closer = tokens[span[1]]
            if self._is_allowed_abbr(inside):
                # long ( abbr ) or ; or ,
                long_span = self._get_long_span(tokens, span, end=span[0] - 1)
                if long_span:
                    candidates.append((span, long_span))
            elif closer == ')':
                # abbr ( long )
                if span[0] - 1 > 0 and self._is_allowed_abbr([tokens[span[0] - 2]]):
                    abbr_span = (span[0] - 2, span[0] - 1)
                    long_span = self._get_long_span(tokens, abbr_span, start=span[0], end=span[1])
                    if long_span:
                        candidates.append((abbr_span, long_span))
            elif closer == ',':
                # ( long , abbr ) where abbr is one or two tokens.
                # NOTE(review): abbr is not passed through _is_allowed_abbr in this
                # branch - confirm that this is intentional.
                for j, t2 in enumerate(tokens[span[1] + 2:span[1] + 4]):
                    if t2 == ')':
                        abbr_span = (span[1] + 1, span[1] + 2 + j)
                        long_span = self._get_long_span(tokens, abbr_span, start=span[0], end=span[1])
                        if long_span:
                            candidates.append((abbr_span, long_span))
                        break
        return candidates

    def _get_long_span(self, tokens, abbr_span, start=None, end=None):
        """Find the span of the full name that belongs to an abbreviation.

        If both ``start`` and ``end`` are given, that exact span is checked. If
        only ``end`` is given, the span is grown backwards from ``end`` one token
        at a time. If only ``start`` is given, the span is grown forwards from
        ``start`` one token at a time, up to and including the last token.

        :param tokens: List of token strings.
        :param abbr_span: ``(start, end)`` span of the abbreviation in ``tokens``.
        :param start: Fixed index of the first token of the full name, or None.
        :param end: Fixed exclusive end index of the full name, or None.
        :returns: ``(start, end)`` span of the full name, or None if no valid
            full name was found within the maximum allowed length.
        """
        abbr = tokens[abbr_span[0]:abbr_span[1]]
        # Get the maximum allowed number of tokens
        max_length = self._max_long_length(abbr)

        if start is not None and end is not None:
            if end - start <= max_length and self._is_valid_long(abbr, tokens[start:end]):
                return start, end
            return None

        if end is not None:
            # Expand long backwards from end
            i = 1
            while True:
                long_tokens = tokens[end - i:end]
                num_hyph = long_tokens.count("-")
                # ab-cd-ef should be counted as 1 token: subtract the hyphens and
                # the extra tokens produced by splitting on them.
                if len(long_tokens) - 2 * num_hyph > max_length:
                    return None
                if self._is_valid_long(abbr, long_tokens):
                    return (end - i, end)
                i += 1
                if i > end:
                    return None

        if start is not None:
            # Expand long forwards from start
            i = 1
            while True:
                long_tokens = tokens[start:start + i]
                num_hyph = long_tokens.count("-")
                # Same hyphen-adjusted token count as in the backward expansion.
                if len(long_tokens) - 2 * num_hyph > max_length:
                    return None
                if self._is_valid_long(abbr, long_tokens):
                    return (start, start + i)
                i += 1
                # Stop once the slice would run past the final token. Using ">"
                # rather than "==" lets the span that ends on the last token be
                # tested, and guarantees termination when fewer than two tokens
                # follow ``start``.
                if start + i > len(tokens):
                    return None

        return None

    def _is_valid_long(self, abbr, tokens):
        """Return True if a span of tokens is a valid long name for the abbreviation.

        :param abbr: List of token strings that make up the abbreviation.
        :param tokens: List of token strings that make up the candidate long name.
        """

        def _is_valid(abbr, long):
            """Match abbreviation characters right-to-left against the long string."""
            # Disallowed characters - @ typically in emails
            if '@' in long:
                return False
            l_i = len(long) - 1
            for a_i in range(len(abbr) - 1, -1, -1):
                current = abbr[a_i].lower()
                # Ignore non-alphanumeric
                # TODO: Greek!
                if not current.isalnum():
                    continue
                # The letters in an abbreviation should appear in the long name in
                # the same order as in the abbreviation. The first abbreviation
                # character must additionally match at a position that is not
                # preceded by an alphanumeric character.
                while (l_i >= 0 and not long[l_i].lower() == current) or \
                        (a_i == 0 and l_i > 0 and long[l_i - 1].isalnum()):
                    l_i -= 1
                if l_i < 0:
                    return False
                l_i -= 1
            return True

        abbr = ''.join(abbr)
        # Join tokens with spaces, but attach hyphens directly to their neighbours
        # so that a split "ab", "-", "cd" reads as "ab-cd" again. Every non-hyphen
        # token is followed by a space, including the last one.
        spaced_tokens = []
        for i, token in enumerate(tokens):
            if token != "-":
                spaced_tokens += [token, " "]
            else:
                if i != 0 and spaced_tokens[-1] == " ":
                    spaced_tokens.pop(-1)
                spaced_tokens.append("-")
        longs = {"".join(spaced_tokens)}
        # Add a variant of every known long string for each string equivalent.
        for before, after in self.abbr_equivs:
            longs.update({long.replace(before, after) for long in longs})
        return any(_is_valid(abbr, long) for long in longs)

    def _filter_candidates(self, tokens, candidates):
        """Discard if long shorter than abbr, or if abbr token(s) are in the long token(s).

        :param tokens: List of token strings.
        :param candidates: List of ``(abbr_span, long_span)`` tuples.
        :returns: List of the ``(abbr_span, long_span)`` tuples that were kept.
        """
        results = []
        for abbr_span, long_span in candidates:
            abbr = tokens[abbr_span[0]:abbr_span[1]]
            long = tokens[long_span[0]:long_span[1]]
            if not all(a in long for a in abbr) and len(''.join(long)) > len(''.join(abbr)):
                results.append((abbr_span, long_span))
        return results

    def detect(self, tokens):
        """Return a (abbr, long) pair for each abbreviation definition.

        :param tokens: List of token strings.
        :returns: List of ``(abbr_tokens, long_tokens)`` tuples of token lists.
        """
        return [
            (tokens[abbr_span[0]:abbr_span[1]], tokens[long_span[0]:long_span[1]])
            for abbr_span, long_span in self.detect_spans(tokens)
        ]

    def detect_spans(self, tokens):
        """Return (abbr_span, long_span) pair for each abbreviation definition.

        abbr_span and long_span are (int, int) spans defining token ranges.

        :param tokens: List of token strings.
        """
        candidates = self._get_candidates(tokens)
        return self._filter_candidates(tokens, candidates)
class ChemAbbreviationDetector(AbbreviationDetector):
    """Chemistry-aware abbreviation detector.

    This abbreviation detector has an additional list of string equivalents
    (e.g. Silver = Ag) that improve abbreviation detection on chemistry texts.
    """

    #: Minimum abbreviation length
    abbr_min = 3

    #: Maximum abbreviation length, was 10 in Hearst & Schwartz for biological terminologies.
    # The Schwartz paper's test phrases are mainly of the form "Protein data bank (PDB)".
    # The long name is a span of everyday English words that forms a term, where most
    # tokens contribute only one character to the short name.
    # Chemical names follow a different convention, e.g. Acetyl = Ac: a simple prefix
    # leads to 2-4 characters in the short name. As a result, a complex molecule
    # containing 3-4 moieties will have a short name overshooting 10 characters.
    # Now 14 in ChemAbbreviationDetector.
    abbr_max = 14

    #: String equivalents to use when detecting abbreviations.
    # Each entry is a (text in the long name, replacement text) pair.
    abbr_equivs = [
        ('silver', 'Ag'),
        ('gold', 'Au'),
        ('mercury', 'Hg'),
        ('lead', 'Pb'),
        ('tin', 'Sn'),
        ('tungsten', 'W'),
        ('iron', 'Fe'),
        ('sodium', 'Na'),
        ('potassium', 'K'),
        ('copper', 'Cu'),
        ('sulfate', 'SO4'),
        ('methanol', 'MeOH'),
        ('ethanol', 'EtOH'),
        ('hydroxy', 'OH'),
        ('hexadecyltrimethylammonium bromide', 'CTAB'),
        ('cytarabine', 'Ara-C'),
        ('hydroxylated', 'OH'),
        ('hydrogen peroxide', 'H2O2'),
        ('quartz', 'SiO2'),
        ('amino', 'NH2'),
        ('ammonia', 'NH3'),
        ('ammonium', 'NH4'),
        ('methyl', 'CH3'),
        ('nitro', 'NO2'),
        ('potassium carbonate', 'K2CO3'),
        ('carbonate', 'CO3'),
        ('borohydride', 'BH4'),
        ('triethylamine', 'NEt3'),
        ('triethylamine', 'Et3N'),
    ]