# Source code for chemdataextractor.scrape.fields

# -*- coding: utf-8 -*-
"""
Fields to define on an entity.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

import dateutil.parser
import six

from .base import BaseField
from ..text.processors import strip_querystring


# Module-level logger, named after this module's import path.
log = logging.getLogger(__name__)


class StringField(BaseField):
    """A field whose value is text, with optional whitespace/case normalisation."""

    def __init__(self, selection, lower=False, upper=False, strip=False, **kwargs):
        """
        Record the normalisation flags and hand everything else to the base field.

        :param selection: Passed unchanged to the base field constructor.
        :param bool lower: (Optional) Whether to lowercase the string. Default False.
        :param bool upper: (Optional) Whether to uppercase the string. Default False.
        :param bool strip: (Optional) Whether to strip whitespace from start/end. Default False.
        :param kwargs: Remaining keyword arguments, forwarded to the base field constructor.
        """
        super(StringField, self).__init__(selection, **kwargs)
        self.strip = strip
        self.lower = lower
        self.upper = upper

    def process(self, value):
        """
        Run the base processing, then apply the enabled string transformations.

        Transformations are applied in the order strip, lower, upper, so when both
        ``lower`` and ``upper`` are set the result is uppercase.

        :param value: The raw value to process.
        :returns: The transformed string, or None if base processing produced None.
        """
        text = super(StringField, self).process(value)
        if text is None:
            # Nothing to normalise.
            return text
        if self.strip:
            text = text.strip()
        if self.lower:
            text = text.lower()
        if self.upper:
            text = text.upper()
        return text


class UrlField(StringField):
    """A string field for URLs that can optionally drop the querystring."""

    def __init__(self, selection, strip_querystring=False, **kwargs):
        """
        Record the querystring flag, then initialise the underlying string field.

        :param selection: Passed unchanged to the parent constructor.
        :param strip_querystring: (Optional) Whether to remove the querystring. Default False.
        :param kwargs: Remaining keyword arguments, forwarded to the parent constructor.
        """
        # Inside this method the parameter shadows the imported ``strip_querystring``
        # function; only the flag is stored here.
        self.strip_querystring = strip_querystring
        super(UrlField, self).__init__(selection, **kwargs)

    def process(self, value):
        """
        Apply the string-field processing, then optionally strip the querystring.

        :param value: The raw value to process.
        :returns: The processed URL, or None if the parent processing produced None.
        """
        url = super(UrlField, self).process(value)
        if url is None or not self.strip_querystring:
            return url
        # Here the name resolves to the module-level function imported from
        # ``..text.processors``, not to the instance flag.
        return strip_querystring(url)


class EntityField(BaseField):
    """A field whose value is produced by scraping another, embedded Entity."""

    def __init__(self, entity, selection, **kwargs):
        """
        Remember the embedded entity, then initialise the base field.

        :param entity: The embedded entity.
        :param selection: Passed unchanged to the base field constructor.
        :param kwargs: Remaining keyword arguments, forwarded to the base field constructor.
        """
        self.entity = entity
        super(EntityField, self).__init__(selection, **kwargs)

    def scrape(self, selector, cleaner=None, processor=None):
        """
        Scrape the value for this field from the selector.

        The embedded entity does the scraping, rooted at this field's selection;
        the result is then passed through ``_post_scrape``.

        :param selector: The selector to scrape from.
        :param cleaner: Accepted but not used by this method.
        :param processor: Forwarded to ``_post_scrape``.
        :returns: Whatever ``_post_scrape`` returns for the scraped entity value.
        """
        scraped = self.entity.scrape(
            selector,
            root=self.selection,
            xpath=self.xpath,
        )
        return self._post_scrape(scraped, processor=processor)


class IntField(BaseField):
    """A field holding an integer number."""

    def process(self, value):
        """
        Convert value to an int.

        :param value: The raw value to convert.
        :returns: ``int(value)``, or None when the conversion raises
            ValueError or TypeError.
        """
        try:
            number = int(value)
        except (ValueError, TypeError):
            # Unconvertible input is reported as "no value" rather than an error.
            number = None
        return number


class FloatField(BaseField):
    """A field holding a floating point number."""

    def process(self, value):
        """
        Convert value to a float.

        :param value: The raw value to convert.
        :returns: ``float(value)``, or None when the conversion raises
            ValueError or TypeError.
        """
        try:
            number = float(value)
        except (ValueError, TypeError):
            # Unconvertible input is reported as "no value" rather than an error.
            number = None
        return number


class BoolField(BaseField):
    """A boolean field type."""

    def __init__(self, selection, true=re.compile('true|yes|1', re.I), false=re.compile('false|no|0', re.I), **kwargs):
        """
        Store the true/false patterns, then initialise the base field.

        :param selection: Passed unchanged to the base field constructor.
        :param true: Regular expression match that evaluates to True. May be a
            compiled pattern or a pattern string; a string is compiled with
            ``re.U`` only, so it is case-sensitive unlike the default pattern.
        :param false: Regular expression match that evaluates to False. Same
            accepted forms as ``true``.
        :param kwargs: Remaining keyword arguments, forwarded to the base field constructor.
        """
        self.true = re.compile(true, re.U) if isinstance(true, six.string_types) else true
        self.false = re.compile(false, re.U) if isinstance(false, six.string_types) else false
        super(BoolField, self).__init__(selection, **kwargs)

    def process(self, value):
        """
        Convert value to a bool by matching it against the true/false patterns.

        The patterns are applied with ``match``, i.e. anchored at the start of
        the value only. The ``true`` pattern is tried first.

        :param value: The raw value to interpret.
        :returns: True or False according to the first pattern that matches, or
            None when neither matches or when value cannot be matched at all
            (e.g. None or another non-string).
        """
        try:
            if self.true.match(value):
                return True
            if self.false.match(value):
                return False
        except TypeError:
            # ``match`` rejects None and other non-string input. Treat that as
            # "no value", the same way the numeric and datetime fields do,
            # instead of letting the exception escape.
            return None
        return None


class DateTimeField(BaseField):
    """A datetime field. Depends on python-dateutil."""

    def process(self, value):
        """
        Parse value into a datetime using ``dateutil.parser.parse``.

        :param value: The raw value to parse.
        :returns: The parsed datetime, or None when value is an empty string,
            is a bare number strictly between 32 and 9999 (treated as a
            year-only value), or cannot be parsed.
        """
        if value == '':
            return None
        try:
            # Ignore year-only values
            if 32 < float(value) < 9999:
                return None
        except (TypeError, ValueError, OverflowError):
            # Not a plain number (including None and other non-strings): fall
            # through to the parser, which has its own guard below. TypeError
            # must be caught here too, otherwise a None value would raise
            # before reaching that guard.
            pass
        try:
            return dateutil.parser.parse(value)
        except (TypeError, ValueError, OverflowError):
            # dateutil documents OverflowError for dates exceeding the largest
            # valid C integer; treat it like any other unparseable value.
            return None

    def serialize(self, value):
        """
        Serialize a datetime to text.

        :param value: An object providing ``isoformat()``.
        :returns: The result of ``value.isoformat()`` as a text string.
        """
        return six.text_type(value.isoformat())