# Source code for chemdataextractor.nlp.corpus

# -*- coding: utf-8 -*-
"""
Tools for reading and writing text corpora.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import gc

from nltk.corpus import ChunkedCorpusReader, TaggedCorpusReader, PlaintextCorpusReader, BracketParseCorpusReader
from nltk.corpus.reader.util import read_line_block, tagged_treebank_para_block_reader
from nltk.tokenize import RegexpTokenizer


class LazyCorpusLoader(object):
    """Derived from NLTK LazyCorpusLoader.

    A stand-in proxy for a corpus: construction is cheap, and the real
    ``CorpusReader`` is only instantiated on first attribute access, at
    which point this object transforms itself into the loaded corpus.
    """

    def __init__(self, name, reader_cls, *args, **kwargs):
        """Record the reader class and its constructor arguments; do not load.

        :param name: Corpus name, used in ``repr()`` and to rebuild the
            proxy on ``_unload``.
        :param reader_cls: A ``nltk.corpus.reader.api.CorpusReader``
            subclass to instantiate lazily.
        :param args: Positional arguments forwarded to ``reader_cls``.
        :param kwargs: Keyword arguments forwarded to ``reader_cls``.
        """
        # Imported lazily so merely importing this module does not pull in
        # the nltk corpus machinery.
        from nltk.corpus.reader.api import CorpusReader
        assert issubclass(reader_cls, CorpusReader)
        self.__name = self.__name__ = name
        self.__reader_cls = reader_cls
        self.__args = args
        self.__kwargs = kwargs

    def __load(self):
        # Instantiate the real corpus reader.
        corpus = self.__reader_cls(*self.__args, **self.__kwargs)

        # Capture constructor state in locals before we lose our own
        # __dict__ below; the _unload closure needs them to rebuild a
        # fresh lazy proxy.
        args, kwargs = self.__args, self.__kwargs
        name, reader_cls = self.__name, self.__reader_cls

        # Transform this object into the loaded corpus by adopting the
        # corpus's __dict__ and __class__. From here on, attribute access
        # goes straight to the real reader.
        self.__dict__ = corpus.__dict__
        self.__class__ = corpus.__class__

        def _unload(self):
            # Revert to an unloaded lazy proxy, then collect the (now
            # unreferenced) reader's resources.
            lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs)
            self.__dict__ = lazy_reader.__dict__
            self.__class__ = lazy_reader.__class__
            gc.collect()

        # _make_bound_method is needed because after the __class__ swap a
        # plain function attribute would not receive ``self``.
        self._unload = _make_bound_method(_unload, self)

    def __getattr__(self, attr):
        # Guard against introspection (e.g. inspect.isclass) probing
        # __bases__, which would otherwise trigger a spurious load.
        if attr == '__bases__':
            raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'")
        self.__load()
        # Not circular: __load() has replaced self.__class__, so this
        # getattr resolves against the real corpus reader.
        return getattr(self, attr)

    def __repr__(self):
        return '<%s in %r (not loaded yet)>' % (self.__reader_cls.__name__, '.../corpora/' + self.__name)

    def _unload(self):
        # Nothing to do: the corpus was never loaded. __load() shadows
        # this with a real unloader once loading happens.
        pass
def _make_bound_method(func, self):
    """Magic for creating bound methods (used for _unload).

    Works on both Python 2 (which requires the owning class as a third
    argument to the method type) and Python 3 (which does not).
    """
    class Foo(object):
        def meth(self):
            pass
    f = Foo()
    bound_method = type(f.meth)
    try:
        return bound_method(func, self, self.__class__)
    except TypeError:  # python3
        return bound_method(func, self)


def _read_chemdner_line_block(stream):
    """Read up to 20 CHEMDNER lines from *stream*.

    Each line is expected to be tab-separated ``pmid\\ttitle\\tabstract``;
    the PMID is discarded and the stripped title and abstract are appended
    to the returned token list. Returns early (possibly empty) at EOF.
    """
    toks = []
    for _ in range(20):
        line = stream.readline()
        if not line:
            return toks
        pmid, title, abstract = line.split('\t')
        toks.append(title.strip())
        toks.append(abstract.strip())
    return toks


#: Entire WSJ corpus (English News Text Treebank: Penn Treebank Revised, LDC2015T13)
wsj = LazyCorpusLoader(
    'wsj',  # fixed: was mistakenly registered as 'wsj_training'
    BracketParseCorpusReader,
    'data/eng_news_txt_tbnk-ptb_revised/data/penntree',
    r'\d\d/wsj_.*\.tree',
    encoding='ascii'
)

#: WSJ corpus sections 0-18 (English News Text Treebank: Penn Treebank Revised, LDC2015T13)
wsj_training = LazyCorpusLoader(
    'wsj_training',
    BracketParseCorpusReader,
    'data/eng_news_txt_tbnk-ptb_revised/data/penntree',
    r'(00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18)/wsj_.*\.tree',
    encoding='ascii'
)

#: WSJ corpus sections 19-21 (English News Text Treebank: Penn Treebank Revised, LDC2015T13)
wsj_development = LazyCorpusLoader(
    'wsj_development',
    BracketParseCorpusReader,
    'data/eng_news_txt_tbnk-ptb_revised/data/penntree',
    r'(19|20|21)/wsj_.*\.tree',
    encoding='ascii'
)

#: WSJ corpus sections 22-24 (English News Text Treebank: Penn Treebank Revised, LDC2015T13)
wsj_evaluation = LazyCorpusLoader(
    'wsj_evaluation',
    BracketParseCorpusReader,
    'data/eng_news_txt_tbnk-ptb_revised/data/penntree',
    r'(22|23|24)/wsj_.*\.tree',
    encoding='ascii'
)

#: WSJ corpus sections 0-18 (treebank2)
treebank2_training = LazyCorpusLoader(
    'treebank2_training',
    ChunkedCorpusReader,
    'data/wsj-pos-training',
    r'wsj_.*\.pos',
    # Sentence boundary: whitespace following a "/." tag, unless inside a
    # chunk bracket.
    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    encoding='ascii'
)

#: WSJ corpus sections 19-21 (treebank2)
treebank2_development = LazyCorpusLoader(
    'treebank2_development',
    ChunkedCorpusReader,
    'data/wsj-pos-development',
    r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    encoding='ascii'
)

#: WSJ corpus sections 22-24 (treebank2)
treebank2_evaluation = LazyCorpusLoader(
    'treebank2_evaluation',
    ChunkedCorpusReader,
    'data/wsj-pos-evaluation',
    r'wsj_.*\.pos',
    sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True),
    para_block_reader=tagged_treebank_para_block_reader,
    encoding='ascii'
)

#: First 80% of GENIA POS-tagged corpus
genia_training = LazyCorpusLoader(
    'genia_training',
    TaggedCorpusReader,
    'data/genia-pos-training',
    'genia-pos-training.txt',
    # One token per line; sentences separated by a line of 20 '=' signs.
    word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
    sent_tokenizer=RegexpTokenizer('====================\n', gaps=True)
)

#: Last 20% of GENIA POS-tagged corpus
genia_evaluation = LazyCorpusLoader(
    'genia_evaluation',
    TaggedCorpusReader,
    'data/genia-pos-evaluation',
    'genia-pos-evaluation.txt',
    word_tokenizer=RegexpTokenizer(r'\n', gaps=True),
    sent_tokenizer=RegexpTokenizer('====================\n', gaps=True)
)

#: Full MedPost POS-tagged corpus
medpost = LazyCorpusLoader(
    'medpost',
    TaggedCorpusReader,
    'data/medpost',
    r'tag_.+\.pos',  # raw string: '\.' is an invalid escape in a plain str
)

#: MedPost POS-tagged corpus, training split
medpost_training = LazyCorpusLoader(
    'medpost_training',
    TaggedCorpusReader,
    'data/medpost-pos-training',
    'medpost-pos-training.txt',
)

#: MedPost POS-tagged corpus, evaluation split
medpost_evaluation = LazyCorpusLoader(
    'medpost_evaluation',
    TaggedCorpusReader,
    'data/medpost-pos-evaluation',
    'medpost-pos-evaluation.txt',
)

#: ChemDataExtractor normalized token corpus
cde_tokensc = LazyCorpusLoader(
    'cde_tokensc',
    PlaintextCorpusReader,
    'data/cde-tokens',
    'cde-tokens-norm.txt',
    # Space-separated tokens, one sentence per line.
    word_tokenizer=RegexpTokenizer(r' ', gaps=True),
    sent_tokenizer=RegexpTokenizer('\n', gaps=True),
    para_block_reader=read_line_block
)

#: CHEMDNER named-entity training corpus
chemdner_training = LazyCorpusLoader(
    'chemdner_training',
    PlaintextCorpusReader,
    'data/cde-ner',
    'training.txt',
    word_tokenizer=RegexpTokenizer(r' ', gaps=True),
    sent_tokenizer=RegexpTokenizer('\n', gaps=True),
    para_block_reader=_read_chemdner_line_block
)