Source code for chemdataextractor.relex.entity

# -*- coding: utf-8 -*-
"""
Entity object, the fundamental unit of a Relation
"""
import copy
import six
from ..parse import Group, join

class Entity(object):
    """A base entity, the fundamental unit of a Relation

    An entity stores the matched ``text``, its ``tag``, the
    ``parse_expression`` used to find it and its ``start``/``end`` token
    indices. Equality and hashing only consider ``text``, ``start`` and
    ``end``; ``tag`` and ``parse_expression`` are ignored.
    """

    def __init__(self, text, tag, parse_expression, start, end):
        """Create a new Entity

        Arguments:
            text {str} -- The text of the entity
            tag {str or list} -- name of the entity
            parse_expression -- how the entity is identified in text
            start {int} -- The index of the Entity in tokens
            end {int} -- The end index of the entity in tokens
        """
        self.text = six.text_type(text)
        self.tag = tag
        # Work on a shallow copy rather than on the caller's object.
        self.parse_expression = copy.copy(parse_expression)
        self.parse_expression.set_name(None)
        # NOTE(review): set_name(None) is called just above; if it clears
        # ``name`` this condition is always true -- confirm against parse.
        if self.parse_expression.name is None or self.parse_expression.name == 'compound':
            # NOTE(review): only tuple tags are wrapped once per sub tag; a
            # list tag (as the docstring allows) takes the else branch.
            if isinstance(self.tag, tuple):
                # Each sub tag wraps the previous expression in a new Group
                # (presumably naming it after the sub tag -- confirm).
                for sub_tag in self.tag:
                    self.parse_expression = Group(self.parse_expression)(sub_tag)
            else:
                self.parse_expression = Group(self.parse_expression)(self.tag).add_action(join)
        self.end = end
        self.start = start

    def __eq__(self, other):
        """Return True when text, start and end all match those of *other*.

        Objects lacking these attributes (e.g. ``None``) yield
        ``NotImplemented`` instead of raising ``AttributeError``, so that
        ``entity == None`` simply evaluates to ``False``.
        """
        try:
            return bool(
                self.text == other.text
                and self.end == other.end
                and self.start == other.start
            )
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        """Inverse of ``__eq__``; Python 2 does not derive it automatically."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        """Hash on the same fields ``__eq__`` compares.

        Defining ``__eq__`` alone makes instances unhashable on Python 3.
        Do not mutate text/start/end while the entity is stored in a set or
        used as a dict key.
        """
        return hash((self.text, self.start, self.end))

    def __repr__(self):
        """Return ``(text,tag,start,end)``; sequence tags are joined by '_'."""
        # Test for sequences rather than ``str`` so that Python 2 unicode tags
        # are not joined character by character.
        if isinstance(self.tag, (tuple, list)):
            tag = '_'.join(self.tag)
        else:
            tag = self.tag
        return '(' + self.text + ',' + tag + ',' + str(self.start) + ',' + str(self.end) + ')'

    def __str__(self):
        """Same as ``__repr__``."""
        return self.__repr__()

    def serialize(self):
        """Return the entity as a (possibly nested) dictionary.

        A string tag is split on '__' and every part but the last becomes a
        nesting level, e.g. tag ``'a__b'`` with text ``'x'`` gives
        ``{'a': {'b': 'x'}}``. A tuple tag is used directly as the single
        key. List tags are not hashable and raise ``TypeError``.

        Returns:
            dict -- mapping from tag (path) to the entity text
        """
        if isinstance(self.tag, (tuple, list)):
            # ``'__' in tag`` would be a membership test here, not a
            # substring test, so sequence tags are never split.
            return {self.tag: self.text}

        keys = self.tag.split('__')
        output = current = {}
        for key in keys[:-1]:
            current[key] = {}
            current = current[key]
        current[keys[-1]] = self.text
        return output