# Source code for chemdataextractor.parse.actions

# -*- coding: utf-8 -*-
"""
Actions to perform during parsing.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from lxml.builder import E
from lxml.etree import strip_tags

from ..text import HYPHENS


log = logging.getLogger(__name__)


def flatten(tokens, start, result):
    """Collapse each result element down to its text content.

    Every element in ``result`` is passed to ``lxml.etree.strip_tags`` with
    the ``'*'`` tag wildcard. The elements are modified in place.

    :param tokens: The tokens being parsed (not used by this action).
    :param start: The start index of the match (not used by this action).
    :param result: Iterable of lxml elements produced by the parser.
    :returns: The same ``result`` object that was passed in.
    """
    for element in result:
        # '*' is the wildcard tag name, so every nested tag is targeted.
        strip_tags(element, '*')
    return result


def join(tokens, start, result):
    """Combine the text of all result elements into one space-separated element.

    The ``text`` of every element in ``result`` and of all their descendants
    is collected in iteration order, skipping nodes whose ``text`` is None.

    :param tokens: The tokens being parsed (not used by this action).
    :param start: The start index of the match (not used by this action).
    :param result: Sequence of lxml elements produced by the parser.
    :returns: A one-item list holding a new element that has the tag of the
        first result and the collected text joined with single spaces, or
        None when ``result`` is empty.
    """
    if not result:
        return None
    pieces = [
        node.text
        for element in result
        for node in element.iter()
        if node.text is not None
    ]
    return [E(result[0].tag, ' '.join(pieces))]


def merge(tokens, start, result):
    """Combine the text of all result elements into one element, with no separator.

    The ``text`` of every element in ``result`` and of all their descendants
    is collected in iteration order, skipping nodes whose ``text`` is None.

    :param tokens: The tokens being parsed (not used by this action).
    :param start: The start index of the match (not used by this action).
    :param result: Sequence of lxml elements produced by the parser.
    :returns: A one-item list holding a new element that has the tag of the
        first result and the collected text concatenated directly, or None
        when ``result`` is empty.
    """
    if not result:
        return None
    pieces = [
        node.text
        for element in result
        for node in element.iter()
        if node.text is not None
    ]
    return [E(result[0].tag, ''.join(pieces))]


def strip_stop(tokens, start, result):
    """Remove a trailing full stop from the text of each result element.

    Every element in ``result`` and all of its descendants are visited. When
    a node's ``text`` ends with ``'.'``, exactly one trailing character is
    removed. The elements are modified in place.

    :param tokens: The tokens being parsed (not used by this action).
    :param start: The start index of the match (not used by this action).
    :param result: Iterable of elements produced by the parser.
    :returns: The same ``result`` object that was passed in.
    """
    for e in result:
        for child in e.iter():
            text = child.text
            # A node with no text content has text None (e.g. an element that
            # only wraps children); skip it rather than fail on .endswith.
            if text is not None and text.endswith('.'):
                child.text = text[:-1]
    return result


def fix_whitespace(tokens, start, result):
    """Tidy whitespace around punctuation in the text of each result element.

    Can be used to remove whitespace tokenization artefacts. Every element in
    ``result`` and all of its descendants are visited; nodes whose ``text`` is
    None or empty are left untouched. For the rest, the substitutions below
    are applied in a fixed order and the outcome is written back to the node.

    :param tokens: The tokens being parsed (not used by this action).
    :param start: The start index of the match (not used by this action).
    :param result: Iterable of elements produced by the parser.
    :returns: The same ``result`` object that was passed in.
    """
    # Literal (old, new) substitutions applied after the hyphen handling.
    # Order matters: the space-on-both-sides forms run before the one-sided
    # bracket forms, and the repeated " ) " rule is kept on purpose so the
    # sequence stays exactly as it has always been.
    bracket_rules = (
        (" : ", ":"),
        (" ) ", ")"),
        (" ( ", "("),
        (" ) ", ")"),
        (" / ", "/"),
        (" [ ", "["),
        (" ] ", "]"),
        ("( ", "("),
        (" )", ")"),
        ("[ ", "["),
        (" ]", "]"),
    )
    for element in result:
        for node in element.iter():
            text = node.text
            if not text:
                # Nothing to clean up on nodes without text.
                continue
            text = text.replace(' , ', ', ')
            for hyphen in HYPHENS:
                text = text.replace(' %s ' % hyphen, '%s' % hyphen)
            # Close up a single character stranded between two hyphens.
            text = re.sub(r'- (.) -', r'-\1-', text)
            text = text.replace(" -", "-")
            for old, new in bracket_rules:
                text = text.replace(old, new)
            node.text = text
    return result