Source code for chemdataextractor.reader.plaintext

# -*- coding: utf-8 -*-
"""
Plain text document reader.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import re

import six

from ..doc.document import Document
from .base import BaseReader
from ..text import get_encoding


class PlainTextReader(BaseReader):
    """Reader that turns plain text into a Document, one element per paragraph.

    Paragraph boundaries are blank lines: two consecutive line breaks that are
    separated by nothing other than spaces or tabs.
    """

    def detect(self, fstring, fname=None):
        """Decide whether this reader should attempt the given input.

        Only the filename is inspected; ``fstring`` is not examined.

        :param fstring: The file contents (unused by this check).
        :param fname: Optional filename. When it contains a ``.``, the text
            after the last ``.`` is treated as the extension.
        :returns: ``False`` if the extension is exactly ``pdf``, ``html`` or
            ``xml`` (the comparison is case-sensitive); ``True`` otherwise,
            including when no filename is given or it has no ``.``.
        """
        # Without a usable extension there is nothing to rule the file out.
        if fname is None or '.' not in fname:
            return True
        suffix = fname.rsplit('.', 1)[1]
        return suffix not in {'pdf', 'html', 'xml'}

    def parse(self, fstring):
        """Split the input text into paragraphs and wrap them in a Document.

        :param fstring: The text to parse. Binary input is decoded first,
            using the encoding reported by ``get_encoding``.
        :returns: A ``Document`` constructed with one positional argument per
            paragraph string.
        """
        text = fstring
        if isinstance(text, six.binary_type):
            text = text.decode(get_encoding(text))
        # A paragraph break is a blank line, recognised separately for each of
        # the three line-ending styles (\r\n, \r, \n).
        chunks = re.split(r'\r\n[ \t]*\r\n|\r[ \t]*\r|\n[ \t]*\n', text)
        # Each piece is stripped of surrounding whitespace; pieces that end up
        # empty are still passed through to Document.
        paragraphs = [chunk.strip() for chunk in chunks]
        return Document(*paragraphs)