# -*- coding: utf-8 -*-
"""
Abbreviation detection.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re
from ..text import bracket_level
log = logging.getLogger(__name__)
class AbbreviationDetector(object):
    """Detect abbreviation definitions in a list of tokens.

    Similar to the algorithm in Schwartz & Hearst 2003.

    The following token patterns are recognised:

    * ``abbr = long``
    * ``long ( abbr )`` (the bracket may also be closed by ``;`` or ``,``)
    * ``abbr ( long )``
    * ``( long , abbr )``

    All spans handled by this class are ``(start, end)`` tuples of token indices with ``end`` exclusive.
    """

    # TODO: Extend to Greek characters (custom method instead of .isalnum())

    #: Minimum abbreviation length
    abbr_min = 3
    #: Maximum abbreviation length
    abbr_max = 10
    #: String equivalents to use when detecting abbreviations.
    abbr_equivs = []

    # Property values such as "5g", "2.5mL" or "10cm": an int or float followed by "g", "ml"/"mL" or "cm".
    # TODO: generalize to any units
    _property_value_re = re.compile(r'^\d+(\.\d+)?(g|m[lL]|cm)$')

    def __init__(self, abbr_min=None, abbr_max=None, abbr_equivs=None):
        """Create a detector, optionally overriding the class-level settings.

        :param abbr_min: Minimum abbreviation length; the class attribute is used when None.
        :param abbr_max: Maximum abbreviation length; the class attribute is used when None.
        :param abbr_equivs: List of ``(before, after)`` string pairs substituted into candidate long names;
            the class attribute is used when None.
        """
        self.abbr_min = abbr_min if abbr_min is not None else self.abbr_min
        self.abbr_max = abbr_max if abbr_max is not None else self.abbr_max
        self.abbr_equivs = abbr_equivs if abbr_equivs is not None else self.abbr_equivs

    @staticmethod
    def _count_words(tokens):
        """Return the token count with every hyphenated compound counted as a single token.

        "ab-cd-ef" arrives as the tokens ``ab - cd - ef``, so each hyphen stands for two surplus tokens
        (the hyphen itself and the extra word part).
        """
        return len(tokens) - 2 * tokens.count("-")

    def _is_allowed_abbr(self, tokens):
        """Return True if the given tokens form an allowed abbreviation.

        :param tokens: List of token strings making up the candidate abbreviation.
        """
        num_hyph = tokens.count("-")
        # Abbreviations should contain at most 2 tokens once hyphen splitting is accounted for
        if len(tokens) - 2 * num_hyph > 2:
            return False
        abbr_text = ''.join(tokens)
        # Nothing to inspect; also protects the abbr_text[0] lookup below when abbr_min <= 0
        if not abbr_text:
            return False
        # Hyphens do not count towards the abbreviation length
        if not self.abbr_min <= len(abbr_text) - num_hyph <= self.abbr_max:
            return False
        # Only accept text with balanced brackets or no brackets
        # NOTE(review): relies on bracket_level (imported from ..text) returning 0 in that case - confirm
        if bracket_level(abbr_text) != 0:
            return False
        # Must start with an alphanumeric character and contain at least one letter
        if not abbr_text[0].isalnum() or not any(c.isalpha() for c in abbr_text):
            return False
        # Disallow property values
        if self._property_value_re.match(abbr_text):
            return False
        return True

    def _max_long_length(self, abbr):
        """Return the upper limit on the number of tokens in the full name for the given abbreviation tokens."""
        abbr_len = len(''.join(abbr))
        return min(abbr_len + 5, abbr_len * 2)

    def _get_candidates(self, tokens):
        """Find candidate pairs of abbreviations and full names.

        :param tokens: List of token strings.
        :returns: List of ``(abbr_span, long_span)`` tuples.
        """
        candidates = []
        bracket_spans = []
        closers = {')', ';', ','}
        for i, token in enumerate(tokens):
            if token == '=' and i > 0:
                # abbr = long ; only a single-token abbreviation is allowed here
                abbr_span = (i - 1, i)
                if self._is_allowed_abbr(tokens[i - 1:i]):
                    long_span = self._get_long_span(tokens, abbr_span, start=i + 1)
                    if long_span:
                        candidates.append((abbr_span, long_span))
            elif token == '(':
                # Remember the bracket content up to the first closing token; the span end is the index of
                # that closing token, so tokens[span[1]] tells which token closed it
                for close in range(i + 1, len(tokens)):
                    if tokens[close] in closers:
                        bracket_spans.append((i + 1, close))
                        break

        for span in bracket_spans:
            inner_start, inner_end = span
            closer = tokens[inner_end]
            if self._is_allowed_abbr(tokens[inner_start:inner_end]):
                # long ( abbr ) or ; or , -- the long name ends just before the opening bracket
                long_span = self._get_long_span(tokens, span, end=inner_start - 1)
                if long_span:
                    candidates.append((span, long_span))
            elif closer == ')':
                # abbr ( long ) -- requires a token in front of the opening bracket
                if inner_start - 1 > 0 and self._is_allowed_abbr([tokens[inner_start - 2]]):
                    abbr_span = (inner_start - 2, inner_start - 1)
                    long_span = self._get_long_span(tokens, abbr_span, start=inner_start, end=inner_end)
                    if long_span:
                        candidates.append((abbr_span, long_span))
            elif closer == ',':
                # ( long , abbr ) -- the abbreviation is the one or two tokens between the comma and
                # the first closing bracket
                for j, t2 in enumerate(tokens[inner_end + 2:inner_end + 4]):
                    if t2 == ')':
                        abbr_span = (inner_end + 1, inner_end + 2 + j)
                        long_span = self._get_long_span(tokens, abbr_span, start=inner_start, end=inner_end)
                        if long_span:
                            candidates.append((abbr_span, long_span))
                        break
        return candidates

    def _get_long_span(self, tokens, abbr_span, start=None, end=None):
        """Find the span of the full name that belongs to an abbreviation.

        :param tokens: List of token strings.
        :param abbr_span: ``(start, end)`` span of the abbreviation within tokens.
        :param start: Fixed first token index of the full name, or None to search for it.
        :param end: Fixed end token index (exclusive) of the full name, or None to search for it.
        :returns: ``(start, end)`` span of the shortest valid full name, or None if there is none. With both
            start and end given, only that exact span is tested. With neither given, None is returned.
        """
        abbr = tokens[abbr_span[0]:abbr_span[1]]
        # Get the maximum allowed number of tokens
        max_length = self._max_long_length(abbr)

        if start is not None and end is not None:
            if end - start <= max_length and self._is_valid_long(abbr, tokens[start:end]):
                return start, end
            return None

        if end is not None:
            # Expand long backwards from end, one token at a time, down to the first token
            for begin in range(end - 1, -1, -1):
                long_tokens = tokens[begin:end]
                if self._count_words(long_tokens) > max_length:
                    return None
                if self._is_valid_long(abbr, long_tokens):
                    return begin, end
            return None

        if start is not None:
            # Expand long forwards from start, one token at a time, up to and including the last token.
            # Bounding the loop by len(tokens) guarantees termination when start is at or past the end.
            for stop in range(start + 1, len(tokens) + 1):
                long_tokens = tokens[start:stop]
                if self._count_words(long_tokens) > max_length:
                    return None
                if self._is_valid_long(abbr, long_tokens):
                    return start, stop
            return None

        return None

    @staticmethod
    def _abbr_matches(abbr, long_text):
        """Return True if the alphanumeric characters of abbr occur, in order, in long_text.

        Both strings are scanned from right to left and compared case-insensitively. The first character of
        the abbreviation must match a character that is not preceded by an alphanumeric character.
        """
        # Disallowed characters - @ typically in emails
        if '@' in long_text:
            return False
        l_i = len(long_text) - 1
        for a_i in range(len(abbr) - 1, -1, -1):
            current = abbr[a_i].lower()
            # Ignore non-alphanumeric  # TODO: Greek!
            if not current.isalnum():
                continue
            # Move left until the character matches; for the first abbreviation character additionally
            # keep moving while the position is preceded by an alphanumeric character
            while ((l_i >= 0 and long_text[l_i].lower() != current)
                   or (a_i == 0 and l_i > 0 and long_text[l_i - 1].isalnum())):
                l_i -= 1
            if l_i < 0:
                # Ran out of long name before every abbreviation character was matched
                return False
            l_i -= 1
        return True

    def _is_valid_long(self, abbr, tokens):
        """Return True if a span of tokens is a valid long name for the abbreviation.

        :param abbr: List of abbreviation token strings.
        :param tokens: List of candidate long name token strings.
        """
        abbr_text = ''.join(abbr)
        # Join tokens with spaces, except that hyphens are attached directly to their neighbours
        pieces = []
        for token in tokens:
            if token != "-":
                pieces += [token, " "]
            else:
                if pieces and pieces[-1] == " ":
                    pieces.pop()
                pieces.append("-")
        # Every equivalent is substituted into all variants collected so far; originals are kept
        longs = {"".join(pieces)}
        for before, after in self.abbr_equivs:
            longs.update({text.replace(before, after) for text in longs})
        return any(self._abbr_matches(abbr_text, text) for text in longs)

    def _filter_candidates(self, tokens, candidates):
        """Discard if long shorter than abbr, or if abbr token(s) are in the long token(s).

        :param tokens: List of token strings.
        :param candidates: List of ``(abbr_span, long_span)`` tuples.
        :returns: The candidates that pass both checks, in their original order.
        """
        results = []
        for abbr_span, long_span in candidates:
            abbr_tokens = tokens[abbr_span[0]:abbr_span[1]]
            long_tokens = tokens[long_span[0]:long_span[1]]
            has_new_token = any(a not in long_tokens for a in abbr_tokens)
            if has_new_token and len(''.join(long_tokens)) > len(''.join(abbr_tokens)):
                results.append((abbr_span, long_span))
        return results

    def detect(self, tokens):
        """Return a (abbr, long) pair for each abbreviation definition.

        abbr and long are the slices of tokens covered by the spans from :meth:`detect_spans`.
        """
        return [
            (tokens[abbr_span[0]:abbr_span[1]], tokens[long_span[0]:long_span[1]])
            for abbr_span, long_span in self.detect_spans(tokens)
        ]

    def detect_spans(self, tokens):
        """Return (abbr_span, long_span) pair for each abbreviation definition.

        abbr_span and long_span are (int, int) spans defining token ranges.
        """
        candidates = self._get_candidates(tokens)
        return self._filter_candidates(tokens, candidates)
class ChemAbbreviationDetector(AbbreviationDetector):
    """Chemistry-aware abbreviation detector.

    This abbreviation detector has an additional list of string equivalents (e.g. silver = Ag) that improve
    abbreviation detection on chemistry texts.
    """

    #: Minimum abbreviation length
    abbr_min = 3

    #: Maximum abbreviation length, was 10 in Hearst & Schwartz for biological terminologies.
    # The Schwartz paper's test phrases are mainly of the form "Protein data bank (PDB)".
    # The long name is a span of everyday English words that forms a term, where most tokens contribute
    # only one character to the short name.
    # Chemical names follow a different convention, e.g. Acetyl = Ac: a simple prefix contributes 2-4
    # characters to the short name.
    # As a result, a complex molecule containing 3-4 moieties will have a short name overshooting 10 characters.
    # Now 14 in ChemAbbreviationDetector.
    abbr_max = 14

    #: String equivalents to use when detecting abbreviations.
    # Each entry is a (before, after) pair.
    abbr_equivs = [
        ('silver', 'Ag'),
        ('gold', 'Au'),
        ('mercury', 'Hg'),
        ('lead', 'Pb'),
        ('tin', 'Sn'),
        ('tungsten', 'W'),
        ('iron', 'Fe'),
        ('sodium', 'Na'),
        ('potassium', 'K'),
        ('copper', 'Cu'),
        ('sulfate', 'SO4'),
        ('methanol', 'MeOH'),
        ('ethanol', 'EtOH'),
        ('hydroxy', 'OH'),
        ('hexadecyltrimethylammonium bromide', 'CTAB'),
        ('cytarabine', 'Ara-C'),
        ('hydroxylated', 'OH'),
        ('hydrogen peroxide', 'H2O2'),
        ('quartz', 'SiO2'),
        ('amino', 'NH2'),
        ('ammonia', 'NH3'),
        ('ammonium', 'NH4'),
        ('methyl', 'CH3'),
        ('nitro', 'NO2'),
        ('potassium carbonate', 'K2CO3'),
        ('carbonate', 'CO3'),
        ('borohydride', 'BH4'),
        ('triethylamine', 'NEt3'),
        ('triethylamine', 'Et3N'),
    ]