# -*- coding: utf-8 -*-
"""
Tools for loading and caching data files.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import io
import logging
import os
import appdirs
import requests
import six
from .config import config
from .errors import ModelNotFoundError
from .utils import python_2_unicode_compatible, ensure_dir
log = logging.getLogger(__name__)
SERVER_ROOT = 'http://data.chemdataextractor.org/'
@python_2_unicode_compatible
class Package(object):
    """Data package."""

    def __init__(self, path):
        self.path = path

    @property
    def remote_path(self):
        """Full URL of this package on the remote data server."""
        return SERVER_ROOT + self.path

    @property
    def local_path(self):
        """Absolute path to this package within the local data directory."""
        return find_data(self.path, warn=False)
    def remote_exists(self):
        """Return True if this package is available on the remote data server."""
        r = requests.get(self.remote_path)
        if r.status_code in {400, 401, 403, 404}:
            return False
        return True

    def local_exists(self):
        """Return True if this package exists in the local data directory."""
        if os.path.isfile(self.local_path):
            return True
        return False
    def download(self, force=False):
        """Download this package to the local data directory.

        Return True if the file was downloaded, or False if an up-to-date copy already exists.
        If force is True, download the file even if a local copy of the same size exists.
        """
        log.debug('Considering %s', self.remote_path)
        ensure_dir(os.path.dirname(self.local_path))
        r = requests.get(self.remote_path, stream=True)
        r.raise_for_status()
        # Check if already downloaded
        if self.local_exists():
            # Skip if existing, unless the file size has changed (assumes the server sends Content-Length)
            if not force and os.path.getsize(self.local_path) == int(r.headers['content-length']):
                log.debug('Skipping existing: %s', self.local_path)
                return False
            else:
                log.debug('File size mismatch for %s', self.local_path)
        log.info('Downloading %s to %s', self.remote_path, self.local_path)
        with io.open(self.local_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):  # 1 MB chunks
                if chunk:
                    f.write(chunk)
        return True
    def __repr__(self):
        return '<Package: %s>' % self.path

    def __str__(self):
        return '<Package: %s>' % self.path
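
# Illustrative usage sketch (doctest-style, not executed on import): build a
# Package from a server-relative path and fetch it only if an up-to-date local
# copy is not already present. The path is one of the PACKAGES entries below.
#
#     >>> pkg = Package('models/punkt_chem-1.0.pickle')
#     >>> pkg.remote_path
#     'http://data.chemdataextractor.org/models/punkt_chem-1.0.pickle'
#     >>> pkg.download()  # False if an up-to-date local copy already exists
#     True
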
#: Current active data packages
PACKAGES = [
    Package('models/cem_crf-1.0.pickle'),
    Package('models/cem_crf_chemdner_cemp-1.0.pickle'),
    Package('models/cem_dict_cs-1.0.pickle'),
    Package('models/cem_dict-1.0.pickle'),
    Package('models/clusters_chem1500-1.0.pickle'),
    Package('models/pos_ap_genia_nocluster-1.0.pickle'),
    Package('models/pos_ap_genia-1.0.pickle'),
    Package('models/pos_ap_wsj_genia_nocluster-1.0.pickle'),
    Package('models/pos_ap_wsj_genia-1.0.pickle'),
    Package('models/pos_ap_wsj_nocluster-1.0.pickle'),
    Package('models/pos_ap_wsj-1.0.pickle'),
    Package('models/pos_crf_genia_nocluster-1.0.pickle'),
    Package('models/pos_crf_genia-1.0.pickle'),
    Package('models/pos_crf_wsj_genia_nocluster-1.0.pickle'),
    Package('models/pos_crf_wsj_genia-1.0.pickle'),
    Package('models/pos_crf_wsj_nocluster-1.0.pickle'),
    Package('models/pos_crf_wsj-1.0.pickle'),
    Package('models/punkt_chem-1.0.pickle')
]
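
# A minimal sketch of bulk-downloading every active package, roughly what the
# `cde data download` command is expected to do (an assumption based on the log
# messages in this module, not a statement of the CLI's exact behaviour):
#
#     >>> results = [package.download() for package in PACKAGES]
#     >>> sum(results)  # number of packages that were actually fetched
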
def get_data_dir():
    """Return the path to the data directory."""
    # Use the data_dir config value if set, otherwise the OS-dependent data directory given by appdirs
    return config.get('data_dir', appdirs.user_data_dir('ChemDataExtractor'))
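
# For reference: when no data_dir is set in the config, appdirs typically
# resolves to ~/.local/share/ChemDataExtractor on Linux and
# ~/Library/Application Support/ChemDataExtractor on macOS (the path below is
# only an example).
#
#     >>> get_data_dir()
#     '/home/username/.local/share/ChemDataExtractor'
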
def find_data(path, warn=True):
    """Return the absolute path to a data file within the data directory."""
    full_path = os.path.join(get_data_dir(), path)
    if warn and not os.path.isfile(full_path):
        for package in PACKAGES:
            if path == package.path:
                log.warning('%s doesn\'t exist. Run `cde data download` to get it.', path)
                break
    return full_path
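
# Example (the local username is hypothetical): find_data simply joins the data
# directory and the relative path; it never raises if the file is missing, it
# only logs a warning when warn is True.
#
#     >>> find_data('models/punkt_chem-1.0.pickle')
#     '/home/username/.local/share/ChemDataExtractor/models/punkt_chem-1.0.pickle'
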
#: A dictionary used to cache models so they only need to be loaded once.
_model_cache = {}

def load_model(path):
    """Load a model from a pickle file in the data directory. Cached so each model is only loaded once."""
    abspath = find_data(path)
    cached = _model_cache.get(abspath)
    if cached is not None:
        log.debug('Using cached copy of %s', path)
        return cached
    log.debug('Loading model %s', path)
    try:
        with io.open(abspath, 'rb') as f:
            model = six.moves.cPickle.load(f)
    except IOError:
        raise ModelNotFoundError('Could not load %s. Have you run `cde data download`?' % path)
    _model_cache[abspath] = model
    return model
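
# Illustrative usage (the model path is taken from PACKAGES above): the first
# call unpickles the file from the data directory, and repeated calls for the
# same path return the cached object.
#
#     >>> tokenizer_model = load_model('models/punkt_chem-1.0.pickle')
#     >>> load_model('models/punkt_chem-1.0.pickle') is tokenizer_model
#     True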