# -*- coding: utf-8 -*-
"""
Part-of-speech tagging.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
from .lexicon import ChemLexicon
from .tag import ApTagger, CrfTagger, POS_TAG_TYPE
log = logging.getLogger(__name__)


#: Complete set of POS tags. Ordered by decreasing frequency in WSJ corpus.
TAGS = [
'NN', # NN : 174028
'IN', # IN : 132241
'NNP', # NNP : 115653
'DT', # DT : 101067
'NNS', # NNS : 74257
'JJ', # JJ : 71238
',', # , : 60488
'.', # . : 48689
'CD', # CD : 47449
'RB', # RB : 40004
'VBD', # VBD : 37236
'VB', # VB : 32781
'CC', # CC : 29607
'VBN', # VBN : 26807
'VBZ', # VBZ : 26335
'PRP', # PRP : 21368
'VBG', # VBG : 18693
'TO', # TO : 16252
'VBP', # VBP : 15370
'HYPH', # HYPH : 14789
'MD', # MD : 12010
'POS', # POS : 10844
'PRP$', # PRP$ : 10252
'$', # $ : 9217
'``', # `` : 8879
'\'\'', # '' : 8649
':', # : : 6074
'WDT', # WDT : 5824
'JJR', # JJR : 4370
'RP', # RP : 3509
'NNPS', # NNPS : 3186
'WP', # WP : 2885
'WRB', # WRB : 2629
'RBR', # RBR : 2189
'JJS', # JJS : 2129
'-RRB-', # -RRB- : 1689
'-LRB-', # -LRB- : 1672
'EX', # EX : 1094
'RBS', # RBS : 946
'PDT', # PDT : 504
'SYM', # SYM : 379
'FW', # FW : 279
'WP$', # WP$ : 219
'UH', # UH : 127
'LS', # LS : 102
'NFP', # NFP : 14
'AFX', # AFX : 4
]
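
# A hedged sanity sketch (illustrative only; assumes the taggers below expose
# a ``tag(tokens)`` method returning (token, tag) pairs): every emitted tag
# should be drawn from TAGS, e.g.
#
#     assert all(tag in TAGS for _, tag in ChemApPosTagger().tag(['benzene', 'melts']))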


class ApPosTagger(ApTagger):
"""Greedy Averaged Perceptron POS tagger trained on WSJ corpus.
"""
model = 'models/pos_ap_wsj_nocluster-1.0.pickle'
tag_type = POS_TAG_TYPE
clusters = False

    def _get_features(self, i, context, prev, prev2):
"""Map tokens into a feature representation."""
w = self.lexicon[context[i]]
features = [
'bias',
'w:shape=%s' % w.shape,
'w:lower=%s' % w.lower,
'p1:tag=%s' % prev,
'p2:tag=%s' % prev2,
'p1:tag+w:lower=%s+%s' % (prev, w.lower),
'p1:tag+p2:tag=%s+%s' % (prev, prev2),
]
if w.like_number:
features.append('w:like_number')
elif w.is_punct:
features.append('w:is_punct')
elif w.like_url:
features.append('w:like_url')
else:
features.extend([
'w:suffix2=%s' % w.lower[-2:],
'w:suffix3=%s' % w.lower[-3:],
'w:suffix4=%s' % w.lower[-4:],
'w:suffix5=%s' % w.lower[-5:],
'w:prefix1=%s' % w.lower[:1],
'w:prefix2=%s' % w.lower[:2],
'w:prefix3=%s' % w.lower[:3],
])
if w.is_alpha:
features.append('w:is_alpha')
elif w.is_hyphenated:
features.append('w:is_hyphenated')
if w.is_upper:
features.append('w:is_upper')
elif w.is_lower:
features.append('w:is_lower')
elif w.is_title:
features.append('w:is_title')
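        # Cluster features: prefixes of the token's hierarchical (Brown-style)
        # cluster bitstring at several lengths, letting the model back off from
        # fine-grained to coarse word classes.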
if self.clusters and w.cluster:
features.extend([
'w:cluster4=%s' % w.cluster[:4],
'w:cluster6=%s' % w.cluster[:6],
'w:cluster10=%s' % w.cluster[:10],
'w:cluster20=%s' % w.cluster[:20],
])
# Add features for previous tokens if present
if i > 0:
p1 = self.lexicon[context[i-1]]
features.extend([
'p1:lower=%s' % p1.lower,
'p1:shape=%s' % p1.shape,
])
if not (p1.like_number or p1.is_punct or p1.like_url):
features.append('p1:suffix3=%s' % p1.lower[-3:])
if self.clusters and p1.cluster:
features.extend([
'p1:cluster4=%s' % p1.cluster[:4],
'p1:cluster6=%s' % p1.cluster[:6],
'p1:cluster10=%s' % p1.cluster[:10],
'p1:cluster20=%s' % p1.cluster[:20],
])
if i > 1:
p2 = self.lexicon[context[i-2]]
features.extend([
'p2:lower=%s' % p2.lower,
'p2:shape=%s' % p2.shape,
])
if self.clusters and p2.cluster:
features.extend([
'p2:cluster4=%s' % p2.cluster[:4],
'p2:cluster6=%s' % p2.cluster[:6],
'p2:cluster10=%s' % p2.cluster[:10],
'p2:cluster20=%s' % p2.cluster[:20],
])
# Add features for next tokens if present
end = len(context) - 1
if i < end:
n1 = self.lexicon[context[i+1]]
features.extend([
'n1:lower=%s' % n1.lower,
                'n1:shape=%s' % n1.shape,
])
if not (n1.like_number or n1.is_punct or n1.like_url):
features.append('n1:suffix3=%s' % n1.lower[-3:])
if self.clusters and n1.cluster:
features.extend([
'n1:cluster4=%s' % n1.cluster[:4],
'n1:cluster6=%s' % n1.cluster[:6],
'n1:cluster10=%s' % n1.cluster[:10],
'n1:cluster20=%s' % n1.cluster[:20],
])
if i < end - 1:
n2 = self.lexicon[context[i+2]]
features.extend([
'n2:lower=%s' % n2.lower,
                'n2:shape=%s' % n2.shape,
])
if self.clusters and n2.cluster:
features.extend([
'n2:cluster4=%s' % n2.cluster[:4],
'n2:cluster6=%s' % n2.cluster[:6],
'n2:cluster10=%s' % n2.cluster[:10],
'n2:cluster20=%s' % n2.cluster[:20],
])
# Add position features
if i == 0:
features.append('-firsttoken-')
elif i == 1:
features.append('-secondtoken-')
elif i == end - 1:
features.append('-secondlasttoken-')
elif i == end:
features.append('-lasttoken-')
return features
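
# A minimal usage sketch (hedged: assumes the ApTagger base class exposes a
# ``tag(tokens)`` method that lazily loads ``model``), e.g.
#
#     tagger = ApPosTagger()
#     tagger.tag(['The', 'reaction', 'proceeded', 'smoothly', '.'])
#     # -> [('The', 'DT'), ('reaction', 'NN'), ('proceeded', 'VBD'), ('smoothly', 'RB'), ('.', '.')]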


class ChemApPosTagger(ApPosTagger):
"""Greedy Averaged Perceptron POS tagger trained on both WSJ and GENIA corpora.
Uses features based on word clusters from chemistry text.
"""
model = 'models/pos_ap_wsj_genia-1.0.pickle'
lexicon = ChemLexicon()
tag_type = POS_TAG_TYPE
clusters = True
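
# ChemApPosTagger is the variant most users reach through the high-level
# document API (a hedged sketch; assumes ``chemdataextractor.doc.Sentence``
# uses it as the default POS tagger), e.g.
#
#     from chemdataextractor.doc import Sentence
#     Sentence('UV-vis spectra were recorded in EtOH.').pos_tagged_tokens
#     # -> [('UV-vis', 'JJ'), ('spectra', 'NNS'), ('were', 'VBD'), ...]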


class CrfPosTagger(CrfTagger):
    """Conditional Random Field POS tagger trained on WSJ corpus."""
model = 'models/pos_crf_wsj_nocluster-1.0.pickle'
tag_type = POS_TAG_TYPE
clusters = False

    def _get_features(self, tokens, i):
        """Map the token at position i into a feature representation."""
token = tokens[i]
w = self.lexicon[token]
features = [
'w.shape=%s' % w.shape,
# 'w.normalized=%s' % w.normalized,
'w.lower=%s' % w.lower,
'w.length=%s' % w.length,
]
if w.like_number:
features.append('w.like_number')
elif w.is_punct:
features.append('w.is_punct')
# elif w.like_url:
# features.append('w.like_url')
else:
features.extend([
'w.suffix1=%s' % w.lower[-1:],
'w.suffix2=%s' % w.lower[-2:],
'w.suffix3=%s' % w.lower[-3:],
'w.suffix4=%s' % w.lower[-4:],
'w.suffix5=%s' % w.lower[-5:],
'w.prefix1=%s' % w.lower[:1],
'w.prefix2=%s' % w.lower[:2],
'w.prefix3=%s' % w.lower[:3],
'w.prefix4=%s' % w.lower[:4],
'w.prefix5=%s' % w.lower[:5],
])
if w.is_alpha:
features.append('w.is_alpha')
elif w.is_hyphenated:
features.append('w.is_hyphenated')
if w.is_upper:
features.append('w.is_upper')
elif w.is_lower:
features.append('w.is_lower')
elif w.is_title:
features.append('w.is_title')
if self.clusters and w.cluster:
features.extend([
'w.cluster4=%s' % w.cluster[:4],
'w.cluster6=%s' % w.cluster[:6],
'w.cluster10=%s' % w.cluster[:10],
'w.cluster20=%s' % w.cluster[:20],
])
# Add features for previous tokens if present
if i > 0:
p1token = tokens[i-1]
p1 = self.lexicon[p1token]
features.extend([
'p1.lower=%s' % p1.lower,
'p1.lower=%s+w.lower=%s' % (p1.lower, w.lower),
'p1.shape=%s' % p1.shape,
])
if not (p1.like_number or p1.is_punct or p1.like_url):
                features.append('p1:suffix3=%s' % p1.lower[-3:])  # ':' rather than '.' kept as-is; the pickled model was presumably trained with this feature name
if self.clusters and p1.cluster:
features.extend([
'p1.cluster4=%s' % p1.cluster[:4],
'p1.cluster6=%s' % p1.cluster[:6],
'p1.cluster10=%s' % p1.cluster[:10],
'p1.cluster20=%s' % p1.cluster[:20],
])
if i > 1:
p2token = tokens[i-2]
p2 = self.lexicon[p2token]
features.extend([
'p2.lower=%s' % p2.lower,
'p2.lower=%s+p1.lower=%s' % (p2.lower, p1.lower),
'p2.lower=%s+p1.lower=%s+w.lower=%s' % (p2.lower, p1.lower, w.lower),
'p2.shape=%s' % p2.shape,
])
if self.clusters and p2.cluster:
features.extend([
'p2.cluster4=%s' % p2.cluster[:4],
'p2.cluster6=%s' % p2.cluster[:6],
'p2.cluster10=%s' % p2.cluster[:10],
'p2.cluster20=%s' % p2.cluster[:20],
])
# Add features for next tokens if present
end = len(tokens) - 1
if i < end:
n1token = tokens[i+1]
n1 = self.lexicon[n1token]
features.extend([
'n1.lower=%s' % n1.lower,
'w.lower=%s+n1.lower=%s' % (w.lower, n1.lower),
'n1.shape=%s' % n1.shape,
])
if not (n1.like_number or n1.is_punct or n1.like_url):
features.append('n1.suffix3=%s' % n1.lower[-3:])
if self.clusters and n1.cluster:
features.extend([
'n1.cluster4=%s' % n1.cluster[:4],
'n1.cluster6=%s' % n1.cluster[:6],
'n1.cluster10=%s' % n1.cluster[:10],
'n1.cluster20=%s' % n1.cluster[:20],
])
if i < end - 1:
n2token = tokens[i+2]
n2 = self.lexicon[n2token]
features.extend([
'n2.lower=%s' % n2.lower,
'n1.lower=%s+n2.lower=%s' % (n1.lower, n2.lower),
'w.lower=%s+n1.lower=%s+n2.lower=%s' % (w.lower, n1.lower, n2.lower),
'n2.shape=%s' % n2.shape,
])
if self.clusters and n2.cluster:
features.extend([
'n2.cluster4=%s' % n2.cluster[:4],
'n2.cluster6=%s' % n2.cluster[:6],
'n2.cluster10=%s' % n2.cluster[:10],
'n2.cluster20=%s' % n2.cluster[:20],
])
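        # Add position features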
if i == 0:
features.append('-firsttoken-')
elif i == 1:
features.append('-secondtoken-')
elif i == end - 1:
features.append('-secondlasttoken-')
elif i == end:
features.append('-lasttoken-')
return features
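
# For reference, a hedged sketch of the feature strings _get_features yields
# for the middle token of a three-token window (exact values depend on the
# Lexicon implementation), e.g.
#
#     CrfPosTagger()._get_features(['the', 'melting', 'point'], 1)[:3]
#     # -> ['w.shape=...', 'w.lower=melting', 'w.length=7']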


class ChemCrfPosTagger(CrfPosTagger):
    """Conditional Random Field POS tagger trained on both WSJ and GENIA corpora.

    Uses features based on word clusters from chemistry text.
    """
model = 'models/pos_crf_wsj_genia-1.0.pickle'
tag_type = POS_TAG_TYPE
lexicon = ChemLexicon()
clusters = True
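

if __name__ == '__main__':
    # Hedged smoke test (assumes the pickled models ship with the installed
    # package and that taggers expose ``tag(tokens)`` returning (token, tag)
    # pairs, as the ApTagger/CrfTagger base classes suggest).
    tokens = ['The', 'melting', 'point', 'of', 'benzene', 'is', 'low', '.']
    for tagger_cls in (ApPosTagger, ChemApPosTagger, CrfPosTagger, ChemCrfPosTagger):
        print(tagger_cls.__name__, tagger_cls().tag(tokens))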