Source code for chemdataextractor.scrape.fields

# -*- coding: utf-8 -*-
"""
Fields to define on an entity.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

import dateutil.parser
import six

from .base import BaseField
from ..text.processors import strip_querystring


# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)


class StringField(BaseField):
    """A text field with optional whitespace and case normalisation."""

    def __init__(self, selection, lower=False, upper=False, strip=False, **kwargs):
        """
        :param selection: Forwarded unchanged to the ``BaseField`` initialiser.
        :param bool lower: (Optional) Whether to lowercase the string. Default False.
        :param bool upper: (Optional) Whether to uppercase the string. Default False.
        :param bool strip: (Optional) Whether to strip whitespace from start/end. Default False.
        :param kwargs: Any other keyword arguments, forwarded to the ``BaseField`` initialiser.
        """
        super(StringField, self).__init__(selection, **kwargs)
        self.lower, self.upper, self.strip = lower, upper, strip

    def process(self, value):
        """Run the parent processing, then apply the enabled string transforms.

        :param value: The raw value to process.
        :returns: The transformed string, or None if the parent processing returned None.
        """
        text = super(StringField, self).process(value)
        if text is None:
            return None
        # Fixed order: strip, then lower, then upper. When both case flags are
        # set the uppercase form is what comes out, because it is applied last.
        if self.strip:
            text = text.strip()
        if self.lower:
            text = text.lower()
        if self.upper:
            text = text.upper()
        return text
class UrlField(StringField):
    """A string field that can optionally drop the querystring from its value."""

    def __init__(self, selection, strip_querystring=False, **kwargs):
        """
        :param selection: Forwarded unchanged to the ``StringField`` initialiser.
        :param strip_querystring: (Optional) Whether to remove the querystring. Default False.
        :param kwargs: Any other keyword arguments, forwarded to the ``StringField`` initialiser.
        """
        # The flag is stored before delegating to the parent initialiser.
        self.strip_querystring = strip_querystring
        super(UrlField, self).__init__(selection, **kwargs)

    def process(self, value):
        """Apply the ``StringField`` processing, then optionally strip the querystring.

        :param value: The raw value to process.
        :returns: The processed URL string, or None if the parent processing returned None.
        """
        url = super(UrlField, self).process(value)
        if url is None or not self.strip_querystring:
            return url
        # The bare name below is the module-level helper imported at the top of
        # the file, not the boolean flag stored on the instance.
        return strip_querystring(url)
class EntityField(BaseField):
    """A field whose value is obtained by scraping another, embedded Entity."""

    def __init__(self, entity, selection, **kwargs):
        """
        :param entity: The embedded entity.
        :param selection: Forwarded unchanged to the ``BaseField`` initialiser.
        :param kwargs: Any other keyword arguments, forwarded to the ``BaseField`` initialiser.
        """
        # The entity is stored before delegating to the parent initialiser.
        self.entity = entity
        super(EntityField, self).__init__(selection, **kwargs)

    def scrape(self, selector, cleaner=None, processor=None):
        """Scrape the embedded entity from the selector and post-process the result.

        :param selector: Passed straight through to the embedded entity's ``scrape``.
        :param cleaner: Accepted but not used by this method.
        :param processor: Forwarded to ``_post_scrape``.
        :returns: Whatever ``_post_scrape`` returns for the scraped value.
        """
        # ``selection`` and ``xpath`` are not assigned in this class; presumably
        # the BaseField initialiser sets them -- confirm against BaseField.
        embedded = self.entity
        scraped = embedded.scrape(selector, root=self.selection, xpath=self.xpath)
        return self._post_scrape(scraped, processor=processor)
class IntField(BaseField):
    """An integer number field."""

    def process(self, value):
        """Convert value to an int.

        :param value: The raw value to convert.
        :returns: ``int(value)``, or None if the conversion raises ValueError or TypeError.
        """
        try:
            number = int(value)
        except (ValueError, TypeError):
            # Unconvertible input is reported as "no value" rather than an error.
            return None
        return number
class FloatField(BaseField):
    """A floating point number field."""

    def process(self, value):
        """Convert value to a float.

        :param value: The raw value to convert.
        :returns: ``float(value)``, or None if the conversion raises ValueError or TypeError.
        """
        try:
            number = float(value)
        except (ValueError, TypeError):
            # Unconvertible input is reported as "no value" rather than an error.
            return None
        return number
class BoolField(BaseField):
    """A boolean field type.

    The value is tested against two regular expressions: one for truthy text
    and one for falsy text. Both are applied with ``match``, so they are
    anchored at the start of the value only.
    """

    def __init__(self, selection, true=re.compile('true|yes|1', re.I),
                 false=re.compile('false|no|0', re.I), **kwargs):
        """
        :param selection: Forwarded unchanged to the ``BaseField`` initialiser.
        :param true: Regular expression match that evaluates to True. May be a
            pattern string (compiled here with ``re.U``) or a compiled pattern.
        :param false: Regular expression match that evaluates to False. May be a
            pattern string (compiled here with ``re.U``) or a compiled pattern.
        :param kwargs: Any other keyword arguments, forwarded to the ``BaseField`` initialiser.
        """
        # Strings are compiled; anything else is assumed to be a compiled pattern
        # already and is stored as given.
        self.true = re.compile(true, re.U) if isinstance(true, six.string_types) else true
        self.false = re.compile(false, re.U) if isinstance(false, six.string_types) else false
        super(BoolField, self).__init__(selection, **kwargs)

    def process(self, value):
        """Convert value to a bool.

        The ``true`` pattern is tried first, so it wins if both would match.

        NOTE: because ``match`` only anchors at the start, the default patterns
        also accept longer values that merely begin with a keyword (for example
        any text starting with "no" evaluates to False).

        :param value: The raw text to interpret.
        :returns: True or False when the respective pattern matches; None when
            neither matches or when value is not text (e.g. None).
        """
        try:
            if self.true.match(value):
                return True
            if self.false.match(value):
                return False
        except TypeError:
            # ``match`` raises TypeError for None and other non-text input.
            # Report "no value", as IntField and FloatField do, instead of
            # letting the error escape.
            return None
        return None
class DateTimeField(BaseField):
    """A datetime field. Depends on python-dateutil."""

    def process(self, value):
        """Parse value into a datetime using ``dateutil.parser.parse``.

        :param value: The raw text to parse.
        :returns: The parsed datetime, or None when value is None, empty,
            a bare number strictly between 32 and 9999, or cannot be parsed.
        """
        if value is None or value == '':
            return None
        try:
            # Ignore year-only values. Presumably this avoids dateutil filling
            # in the missing month and day with defaults -- confirm if changing
            # the bounds.
            if 32 < float(value) < 9999:
                return None
        except (TypeError, ValueError):
            # Not a plain number (ValueError) or not text at all (TypeError).
            # TypeError was previously uncaught here and escaped to the caller.
            pass
        try:
            return dateutil.parser.parse(value)
        except (TypeError, ValueError, OverflowError):
            # dateutil documents OverflowError for dates exceeding the largest
            # valid C integer; treat it like any other unparseable value.
            return None

    def serialize(self, value):
        """Return value's ISO 8601 representation as a text string.

        :param value: An object providing ``isoformat()``.
        :returns: The result of ``value.isoformat()`` converted with ``six.text_type``.
        """
        return six.text_type(value.isoformat())