# Source code for chemdataextractor.parse.actions

# -*- coding: utf-8 -*-
"""
Actions to perform during parsing.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import re

from lxml.builder import E
from lxml.etree import strip_tags

from ..text import HYPHENS


log = logging.getLogger(__name__)


def flatten(tokens, start, result):
    """Strip every descendant tag from each result element, keeping the text.

    ``lxml.etree.strip_tags`` is applied with the ``'*'`` wildcard to each
    element in turn, so the elements are modified in place.

    :param tokens: Unused by this action.
    :param start: Unused by this action.
    :param result: Iterable of elements to flatten.
    :returns: The same ``result`` object that was passed in.
    """
    for element in result:
        strip_tags(element, '*')
    return result
def join(tokens, start, result):
    """Collapse all result elements into one element with space-separated text.

    The text of every element and every descendant (in ``iter()`` order) is
    collected, skipping nodes whose text is ``None``, and joined with single
    spaces. The new element reuses the tag of the first result element.

    :param tokens: Unused by this action.
    :param start: Unused by this action.
    :param result: Sequence of elements to combine.
    :returns: A one-item list holding the combined element, or ``None`` when
        ``result`` is empty.
    """
    if len(result) == 0:
        # Nothing to combine; there is no first element to take a tag from.
        return None
    pieces = [
        node.text
        for element in result
        for node in element.iter()
        if node.text is not None
    ]
    return [E(result[0].tag, ' '.join(pieces))]
def merge(tokens, start, result):
    """Collapse all result elements into one element with concatenated text.

    The text of every element and every descendant (in ``iter()`` order) is
    collected, skipping nodes whose text is ``None``, and concatenated with
    no separator. The new element reuses the tag of the first result element.

    :param tokens: Unused by this action.
    :param start: Unused by this action.
    :param result: Sequence of elements to combine.
    :returns: A one-item list holding the combined element, or ``None`` when
        ``result`` is empty.
    """
    if len(result) == 0:
        # Nothing to combine; there is no first element to take a tag from.
        return None
    pieces = [
        node.text
        for element in result
        for node in element.iter()
        if node.text is not None
    ]
    return [E(result[0].tag, ''.join(pieces))]
def strip_stop(tokens, start, result):
    """Remove one trailing full stop from the text of each element.

    Every element in ``result`` and each of its descendants is visited. If a
    node's text ends with ``'.'`` exactly one character is dropped, so
    ``'a..'`` becomes ``'a.'``. Elements are modified in place.

    :param tokens: Unused by this action.
    :param start: Unused by this action.
    :param result: Iterable of elements to process.
    :returns: The same ``result`` object that was passed in.
    """
    for element in result:
        for node in element.iter():
            text = node.text
            # Elements that only wrap children have no text (None); skip
            # them instead of failing on ``None.endswith``.
            if text is not None and text.endswith('.'):
                node.text = text[:-1]
    return result
def fix_whitespace(tokens, start, result):
    """Tidy the spacing left around hyphens, commas and brackets.

    Can be used to remove whitespace tokenization artefacts. Every element in
    ``result`` and each of its descendants is visited; nodes whose text is
    ``None`` or empty are left untouched. Elements are modified in place.

    :param tokens: Unused by this action.
    :param start: Unused by this action.
    :param result: Iterable of elements to process.
    :returns: The same ``result`` object that was passed in.
    """
    # Literal substitutions applied, in exactly this order, after the comma
    # and hyphen handling. The order (including the repeated " ) " entry) is
    # kept as-is because each pass works on the output of the previous one.
    spacing_fixes = (
        (" -", "-"),
        (" : ", ":"),
        (" ) ", ")"),
        (" ( ", "("),
        (" ) ", ")"),
        (" / ", "/"),
        (" [ ", "["),
        (" ] ", "]"),
        ("( ", "("),
        (" )", ")"),
        ("[ ", "["),
        (" ]", "]"),
    )
    for element in result:
        for node in element.iter():
            text = node.text
            if not text:
                # No text on this node, so there is nothing to fix.
                continue
            text = text.replace(' , ', ', ')
            for hyphen in HYPHENS:
                text = text.replace(' %s ' % hyphen, '%s' % hyphen)
            # A single character between spaced ASCII hyphens: "- x -" -> "-x-".
            text = re.sub(r'- (.) -', r'-\1-', text)
            for old, new in spacing_fixes:
                text = text.replace(old, new)
            node.text = text
    return result