Source code for aser.extract.aser_extractor

from copy import copy, deepcopy
from itertools import chain
from .eventuality_extractor import SeedRuleEventualityExtractor, DiscourseEventualityExtractor
from .relation_extractor import SeedRuleRelationExtractor, DiscourseRelationExtractor
from .utils import parse_sentense_with_stanford, get_corenlp_client
from .utils import ANNOTATORS


class BaseASERExtractor(object):
    """ Base ASER Extractor to extract both eventualities and relations.
    It includes an instance of `BaseEventualityExtractor` and an instance of `BaseRelationExtractor`.
    """

    def __init__(self, corenlp_path="", corenlp_port=0, **kw):
        """
        :param corenlp_path: corenlp path, e.g., /home/xliucr/stanford-corenlp-3.9.2
        :type corenlp_path: str (default = "")
        :param corenlp_port: corenlp port, e.g., 9000
        :type corenlp_port: int (default = 0)
        :param kw: other parameters
        :type kw: Dict[str, object]
        """

        self.corenlp_path = corenlp_path
        self.corenlp_port = corenlp_port
        self.annotators = kw.get("annotators", list(ANNOTATORS))
        # Check whether a CoreNLP server is already running on this port; if not,
        # get_corenlp_client starts one that this extractor owns and must stop.
        _, self.is_externel_corenlp = get_corenlp_client(corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port)

        self.eventuality_extractor = None
        self.relation_extractor = None
    def close(self):
        """ Close the extractor safely
        """

        if not self.is_externel_corenlp:
            corenlp_client, _ = get_corenlp_client(corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port)
            corenlp_client.stop()
        if self.eventuality_extractor:
            self.eventuality_extractor.close()
        if self.relation_extractor:
            self.relation_extractor.close()
    def __del__(self):
        self.close()
    def parse_text(self, text, annotators=None):
        """ Parse a raw text by corenlp

        :param text: a raw text
        :type text: str
        :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
        :type annotators: Union[List, None] (default = None)
        :return: the parsed result
        :rtype: List[Dict[str, object]]

        .. highlight:: python
        .. code-block:: python

            Input:

            "My army will find your boat. In the meantime, I'm sure we could find you suitable accommodations."

            Output:

            [{'dependencies': [(1, 'nmod:poss', 0), (3, 'nsubj', 1), (3, 'aux', 2),
                               (3, 'dobj', 5), (3, 'punct', 6), (5, 'nmod:poss', 4)],
              'lemmas': ['my', 'army', 'will', 'find', 'you', 'boat', '.'],
              'mentions': [],
              'ners': ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
              'parse': '(ROOT (S (NP (PRP$ My) (NN army)) (VP (MD will) (VP (VB find) (NP '
                       '(PRP$ your) (NN boat)))) (. .)))',
              'pos_tags': ['PRP$', 'NN', 'MD', 'VB', 'PRP$', 'NN', '.'],
              'text': 'My army will find your boat.',
              'tokens': ['My', 'army', 'will', 'find', 'your', 'boat', '.']},
             {'dependencies': [(2, 'case', 0), (2, 'det', 1), (6, 'nmod:in', 2),
                               (6, 'punct', 3), (6, 'nsubj', 4), (6, 'cop', 5),
                               (6, 'ccomp', 9), (6, 'punct', 13), (9, 'nsubj', 7),
                               (9, 'aux', 8), (9, 'iobj', 10), (9, 'dobj', 12),
                               (12, 'amod', 11)],
              'lemmas': ['in', 'the', 'meantime', ',', 'I', 'be', 'sure', 'we', 'could',
                         'find', 'you', 'suitable', 'accommodation', '.'],
              'mentions': [],
              'ners': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
              'parse': '(ROOT (S (PP (IN In) (NP (DT the) (NN meantime))) (, ,) (NP (PRP '
                       "I)) (VP (VBP 'm) (ADJP (JJ sure) (SBAR (S (NP (PRP we)) (VP (MD "
                       'could) (VP (VB find) (NP (PRP you)) (NP (JJ suitable) (NNS '
                       'accommodations)))))))) (. .)))',
              'pos_tags': ['IN', 'DT', 'NN', ',', 'PRP', 'VBP', 'JJ', 'PRP', 'MD', 'VB',
                           'PRP', 'JJ', 'NNS', '.'],
              'text': "In the meantime, I'm sure we could find you suitable "
                      'accommodations.',
              'tokens': ['In', 'the', 'meantime', ',', 'I', "'m", 'sure', 'we', 'could',
                         'find', 'you', 'suitable', 'accommodations', '.']}]
        """

        if annotators is None:
            annotators = self.annotators

        corenlp_client, _ = get_corenlp_client(
            corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port, annotators=annotators
        )
        # Use the resolved annotator list so that a caller-supplied `annotators`
        # argument is honored by the parse as well.
        parsed_result = parse_sentense_with_stanford(text, corenlp_client, annotators)
        return parsed_result
    def extract_eventualities_from_parsed_result(self, parsed_result, output_format="Eventuality",
                                                 in_order=True, use_lemma=True, **kw):
        """ Extract eventualities from the parsed result

        :param parsed_result: the parsed result returned by corenlp
        :type parsed_result: List[Dict[str, object]]
        :param output_format: which format to return, "Eventuality" or "json"
        :type output_format: str (default = "Eventuality")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventualities use lemmas
        :type use_lemma: bool (default = True)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted eventualities
        :rtype: Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]]

        .. highlight:: python
        .. code-block:: python

            Input:

            [{'dependencies': [(1, 'nmod:poss', 0), (3, 'nsubj', 1), (3, 'aux', 2),
                               (3, 'dobj', 5), (3, 'punct', 6), (5, 'nmod:poss', 4)],
              'lemmas': ['my', 'army', 'will', 'find', 'you', 'boat', '.'],
              'mentions': [],
              'ners': ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
              'parse': '(ROOT (S (NP (PRP$ My) (NN army)) (VP (MD will) (VP (VB find) (NP '
                       '(PRP$ your) (NN boat)))) (. .)))',
              'pos_tags': ['PRP$', 'NN', 'MD', 'VB', 'PRP$', 'NN', '.'],
              'text': 'My army will find your boat.',
              'tokens': ['My', 'army', 'will', 'find', 'your', 'boat', '.']},
             {'dependencies': [(2, 'case', 0), (2, 'det', 1), (6, 'nmod:in', 2),
                               (6, 'punct', 3), (6, 'nsubj', 4), (6, 'cop', 5),
                               (6, 'ccomp', 9), (6, 'punct', 13), (9, 'nsubj', 7),
                               (9, 'aux', 8), (9, 'iobj', 10), (9, 'dobj', 12),
                               (12, 'amod', 11)],
              'lemmas': ['in', 'the', 'meantime', ',', 'I', 'be', 'sure', 'we', 'could',
                         'find', 'you', 'suitable', 'accommodation', '.'],
              'mentions': [],
              'ners': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
              'parse': '(ROOT (S (PP (IN In) (NP (DT the) (NN meantime))) (, ,) (NP (PRP '
                       "I)) (VP (VBP 'm) (ADJP (JJ sure) (SBAR (S (NP (PRP we)) (VP (MD "
                       'could) (VP (VB find) (NP (PRP you)) (NP (JJ suitable) (NNS '
                       'accommodations)))))))) (. .)))',
              'pos_tags': ['IN', 'DT', 'NN', ',', 'PRP', 'VBP', 'JJ', 'PRP', 'MD', 'VB',
                           'PRP', 'JJ', 'NNS', '.'],
              'text': "In the meantime, I'm sure we could find you suitable "
                      'accommodations.',
              'tokens': ['In', 'the', 'meantime', ',', 'I', "'m", 'sure', 'we', 'could',
                         'find', 'you', 'suitable', 'accommodations', '.']}]

            Output:

            [[my army will find you boat],
             [i be sure, we could find you suitable accommodation]]
        """

        if output_format not in ["Eventuality", "json"]:
            raise ValueError("Error: extract_eventualities_from_parsed_result only supports Eventuality or json.")

        return self.eventuality_extractor.extract_from_parsed_result(
            parsed_result, output_format=output_format, in_order=in_order, use_lemma=use_lemma, **kw
        )
    def extract_eventualities_from_text(self, text, output_format="Eventuality", in_order=True,
                                        use_lemma=True, annotators=None, **kw):
        """ Extract eventualities from a raw text

        :param text: a raw text
        :type text: str
        :param output_format: which format to return, "Eventuality" or "json"
        :type output_format: str (default = "Eventuality")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventualities use lemmas
        :type use_lemma: bool (default = True)
        :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
        :type annotators: Union[List, None] (default = None)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted eventualities
        :rtype: Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]]

        .. highlight:: python
        .. code-block:: python

            Input:

            "My army will find your boat. In the meantime, I'm sure we could find you suitable accommodations."

            Output:

            [[my army will find you boat],
             [i be sure, we could find you suitable accommodation]]
        """

        if output_format not in ["Eventuality", "json"]:
            raise NotImplementedError("Error: extract_eventualities_from_text only supports Eventuality or json.")

        parsed_result = self.parse_text(text, annotators=annotators)
        return self.extract_eventualities_from_parsed_result(
            parsed_result, output_format=output_format, in_order=in_order, use_lemma=use_lemma, **kw
        )
    def extract_relations_from_parsed_result(
        self, parsed_result, para_eventualities, output_format="Relation", in_order=True, **kw
    ):
        """ Extract relations from a parsed result (of a paragraph) and extracted eventualities

        :param parsed_result: the parsed result returned by corenlp
        :type parsed_result: List[Dict[str, object]]
        :param para_eventualities: eventualities in the paragraph
        :type para_eventualities: List[aser.eventuality.Eventuality]
        :param output_format: which format to return, "Relation" or "triplet"
        :type output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted relations
        :rtype: Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]

        .. highlight:: python
        .. code-block:: python

            Input:

            [{'dependencies': [(1, 'nmod:poss', 0), (3, 'nsubj', 1), (3, 'aux', 2),
                               (3, 'dobj', 5), (3, 'punct', 6), (5, 'nmod:poss', 4)],
              'lemmas': ['my', 'army', 'will', 'find', 'you', 'boat', '.'],
              'mentions': [],
              'ners': ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
              'parse': '(ROOT (S (NP (PRP$ My) (NN army)) (VP (MD will) (VP (VB find) (NP '
                       '(PRP$ your) (NN boat)))) (. .)))',
              'pos_tags': ['PRP$', 'NN', 'MD', 'VB', 'PRP$', 'NN', '.'],
              'text': 'My army will find your boat.',
              'tokens': ['My', 'army', 'will', 'find', 'your', 'boat', '.']},
             {'dependencies': [(2, 'case', 0), (2, 'det', 1), (6, 'nmod:in', 2),
                               (6, 'punct', 3), (6, 'nsubj', 4), (6, 'cop', 5),
                               (6, 'ccomp', 9), (6, 'punct', 13), (9, 'nsubj', 7),
                               (9, 'aux', 8), (9, 'iobj', 10), (9, 'dobj', 12),
                               (12, 'amod', 11)],
              'lemmas': ['in', 'the', 'meantime', ',', 'I', 'be', 'sure', 'we', 'could',
                         'find', 'you', 'suitable', 'accommodation', '.'],
              'mentions': [],
              'ners': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
              'parse': '(ROOT (S (PP (IN In) (NP (DT the) (NN meantime))) (, ,) (NP (PRP '
                       "I)) (VP (VBP 'm) (ADJP (JJ sure) (SBAR (S (NP (PRP we)) (VP (MD "
                       'could) (VP (VB find) (NP (PRP you)) (NP (JJ suitable) (NNS '
                       'accommodations)))))))) (. .)))',
              'pos_tags': ['IN', 'DT', 'NN', ',', 'PRP', 'VBP', 'JJ', 'PRP', 'MD', 'VB',
                           'PRP', 'JJ', 'NNS', '.'],
              'text': "In the meantime, I'm sure we could find you suitable "
                      'accommodations.',
              'tokens': ['In', 'the', 'meantime', ',', 'I', "'m", 'sure', 'we', 'could',
                         'find', 'you', 'suitable', 'accommodations', '.']}],

            [[my army will find you boat],
             [i be sure, we could find you suitable accommodation]]

            Output:

            [[],
             [(7d9ea9023b66a0ebc167f0dbb6ea8cd75d7b46f9, 25edad6781577dcb3ba715c8230416fb0d4c45c4, {'Co_Occurrence': 1.0})],
             [(8540897b645962964fd644242d4cc0032f024e86, 25edad6781577dcb3ba715c8230416fb0d4c45c4, {'Synchronous': 1.0})]]
        """

        if output_format not in ["Relation", "triplet"]:
            raise NotImplementedError("Error: extract_relations_from_parsed_result only supports Relation or triplet.")

        return self.relation_extractor.extract_from_parsed_result(
            parsed_result, para_eventualities, output_format=output_format, in_order=in_order, **kw
        )
    def extract_relations_from_text(self, text, output_format="Relation", in_order=True, annotators=None, **kw):
        """ Extract relations from a raw text and extracted eventualities

        :param text: a raw text
        :type text: str
        :param output_format: which format to return, "Relation" or "triplet"
        :type output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
        :type annotators: Union[List, None] (default = None)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted relations
        :rtype: Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]

        .. highlight:: python
        .. code-block:: python

            Input:

            "My army will find your boat. In the meantime, I'm sure we could find you suitable accommodations."

            Output:

            [[],
             [(7d9ea9023b66a0ebc167f0dbb6ea8cd75d7b46f9, 25edad6781577dcb3ba715c8230416fb0d4c45c4, {'Co_Occurrence': 1.0})],
             [(8540897b645962964fd644242d4cc0032f024e86, 25edad6781577dcb3ba715c8230416fb0d4c45c4, {'Synchronous': 1.0})]]
        """

        if output_format not in ["Relation", "triplet"]:
            raise NotImplementedError("Error: extract_relations_from_text only supports Relation or triplet.")

        parsed_result = self.parse_text(text, annotators=annotators)
        para_eventualities = self.extract_eventualities_from_parsed_result(parsed_result)
        return self.extract_relations_from_parsed_result(
            parsed_result, para_eventualities, output_format=output_format, in_order=in_order, **kw
        )
    def extract_from_parsed_result(
        self, parsed_result, eventuality_output_format="Eventuality", relation_output_format="Relation",
        in_order=True, use_lemma=True, **kw
    ):
        """ Extract both eventualities and relations from a parsed result

        :param parsed_result: the parsed result returned by corenlp
        :type parsed_result: List[Dict[str, object]]
        :param eventuality_output_format: which format to return eventualities, "Eventuality" or "json"
        :type eventuality_output_format: str (default = "Eventuality")
        :param relation_output_format: which format to return relations, "Relation" or "triplet"
        :type relation_output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventualities use lemmas
        :type use_lemma: bool (default = True)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted eventualities and relations
        :rtype: Tuple[Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]], Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]]

        .. highlight:: python
        .. code-block:: python

            Input:

            [{'dependencies': [(1, 'nmod:poss', 0), (3, 'nsubj', 1), (3, 'aux', 2),
                               (3, 'dobj', 5), (3, 'punct', 6), (5, 'nmod:poss', 4)],
              'lemmas': ['my', 'army', 'will', 'find', 'you', 'boat', '.'],
              'mentions': [],
              'ners': ['O', 'O', 'O', 'O', 'O', 'O', 'O'],
              'parse': '(ROOT (S (NP (PRP$ My) (NN army)) (VP (MD will) (VP (VB find) (NP '
                       '(PRP$ your) (NN boat)))) (. .)))',
              'pos_tags': ['PRP$', 'NN', 'MD', 'VB', 'PRP$', 'NN', '.'],
              'text': 'My army will find your boat.',
              'tokens': ['My', 'army', 'will', 'find', 'your', 'boat', '.']},
             {'dependencies': [(2, 'case', 0), (2, 'det', 1), (6, 'nmod:in', 2),
                               (6, 'punct', 3), (6, 'nsubj', 4), (6, 'cop', 5),
                               (6, 'ccomp', 9), (6, 'punct', 13), (9, 'nsubj', 7),
                               (9, 'aux', 8), (9, 'iobj', 10), (9, 'dobj', 12),
                               (12, 'amod', 11)],
              'lemmas': ['in', 'the', 'meantime', ',', 'I', 'be', 'sure', 'we', 'could',
                         'find', 'you', 'suitable', 'accommodation', '.'],
              'mentions': [],
              'ners': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
              'parse': '(ROOT (S (PP (IN In) (NP (DT the) (NN meantime))) (, ,) (NP (PRP '
                       "I)) (VP (VBP 'm) (ADJP (JJ sure) (SBAR (S (NP (PRP we)) (VP (MD "
                       'could) (VP (VB find) (NP (PRP you)) (NP (JJ suitable) (NNS '
                       'accommodations)))))))) (. .)))',
              'pos_tags': ['IN', 'DT', 'NN', ',', 'PRP', 'VBP', 'JJ', 'PRP', 'MD', 'VB',
                           'PRP', 'JJ', 'NNS', '.'],
              'text': "In the meantime, I'm sure we could find you suitable "
                      'accommodations.',
              'tokens': ['In', 'the', 'meantime', ',', 'I', "'m", 'sure', 'we', 'could',
                         'find', 'you', 'suitable', 'accommodations', '.']}]

            Output:

            ([[my army will find you boat],
              [i be sure, we could find you suitable accommodation]],
             [[],
              [(7d9ea9023b66a0ebc167f0dbb6ea8cd75d7b46f9, 25edad6781577dcb3ba715c8230416fb0d4c45c4, {'Co_Occurrence': 1.0})],
              [(8540897b645962964fd644242d4cc0032f024e86, 25edad6781577dcb3ba715c8230416fb0d4c45c4, {'Synchronous': 1.0})]])
        """

        if eventuality_output_format not in ["Eventuality", "json"]:
            raise NotImplementedError("Error: extract_eventualities only supports Eventuality or json.")
        if relation_output_format not in ["Relation", "triplet"]:
            raise NotImplementedError("Error: extract_relations only supports Relation or triplet.")

        if not isinstance(parsed_result, (list, tuple, dict)):
            raise NotImplementedError
        if isinstance(parsed_result, dict):
            is_single_sent = True
            parsed_result = [parsed_result]
        else:
            is_single_sent = False

        para_eventualities = self.extract_eventualities_from_parsed_result(
            parsed_result, output_format="Eventuality", in_order=True, use_lemma=use_lemma, **kw
        )
        para_relations = self.extract_relations_from_parsed_result(
            parsed_result, para_eventualities, output_format="Relation", in_order=True, **kw
        )

        if in_order:
            if eventuality_output_format == "json":
                para_eventualities = [[eventuality.encode(encoding=None) for eventuality in sent_eventualities] \
                    for sent_eventualities in para_eventualities]
            if relation_output_format == "triplet":
                para_relations = [list(chain.from_iterable([relation.to_triplets() for relation in sent_relations])) \
                    for sent_relations in para_relations]
            if is_single_sent:
                return para_eventualities[0], para_relations[0]
            else:
                return para_eventualities, para_relations
        else:
            # Merge duplicate eventualities across sentences by eid.
            eid2eventuality = dict()
            for eventuality in chain.from_iterable(para_eventualities):
                eid = eventuality.eid
                if eid not in eid2eventuality:
                    eid2eventuality[eid] = deepcopy(eventuality)
                else:
                    eid2eventuality[eid].update(eventuality)
            if eventuality_output_format == "Eventuality":
                eventualities = sorted(eid2eventuality.values(), key=lambda e: e.eid)
            elif eventuality_output_format == "json":
                eventualities = sorted(
                    [eventuality.encode(encoding=None) for eventuality in eid2eventuality.values()],
                    key=lambda e: e["eid"]
                )

            # Merge duplicate relations across sentence pairs by rid.
            rid2relation = dict()
            for relation in chain.from_iterable(para_relations):
                if relation.rid not in rid2relation:
                    rid2relation[relation.rid] = deepcopy(relation)
                else:
                    rid2relation[relation.rid].update(relation)
            if relation_output_format == "Relation":
                para_relations = sorted(rid2relation.values(), key=lambda r: r.rid)
            elif relation_output_format == "triplet":
                para_relations = sorted(chain.from_iterable([relation.to_triplets() for relation in rid2relation.values()]))
            return eventualities, para_relations
    def extract_from_text(
        self, text, eventuality_output_format="Eventuality", relation_output_format="Relation",
        in_order=True, use_lemma=True, annotators=None, **kw
    ):
        """ Extract both eventualities and relations from a raw text

        :param text: a raw text
        :type text: str
        :param eventuality_output_format: which format to return eventualities, "Eventuality" or "json"
        :type eventuality_output_format: str (default = "Eventuality")
        :param relation_output_format: which format to return relations, "Relation" or "triplet"
        :type relation_output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventualities use lemmas
        :type use_lemma: bool (default = True)
        :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
        :type annotators: Union[List, None] (default = None)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted eventualities and relations
        :rtype: Tuple[Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]], Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]]

        .. highlight:: python
        .. code-block:: python

            Input:

            "My army will find your boat. In the meantime, I'm sure we could find you suitable accommodations."

            Output:

            ([[my army will find you boat],
              [i be sure, we could find you suitable accommodation]],
             [[],
              [(7d9ea9023b66a0ebc167f0dbb6ea8cd75d7b46f9, 25edad6781577dcb3ba715c8230416fb0d4c45c4, {'Co_Occurrence': 1.0})],
              [(8540897b645962964fd644242d4cc0032f024e86, 25edad6781577dcb3ba715c8230416fb0d4c45c4, {'Synchronous': 1.0})]])
        """

        if eventuality_output_format not in ["Eventuality", "json"]:
            raise NotImplementedError("Error: extract_eventualities only supports Eventuality or json.")
        if relation_output_format not in ["Relation", "triplet"]:
            raise NotImplementedError("Error: extract_relations only supports Relation or triplet.")

        parsed_result = self.parse_text(text, annotators=annotators)
        return self.extract_from_parsed_result(
            parsed_result,
            eventuality_output_format=eventuality_output_format,
            relation_output_format=relation_output_format,
            in_order=in_order,
            use_lemma=use_lemma,
            **kw
        )
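# A minimal subclassing sketch (illustrative, not part of the module):
# BaseASERExtractor leaves self.eventuality_extractor and
# self.relation_extractor as None, and each concrete subclass is expected to
# assign a matching extractor pair in __init__, as the two classes below do.
# A custom variant would follow the same pattern:
#
# class MyASERExtractor(BaseASERExtractor):
#     def __init__(self, corenlp_path="", corenlp_port=0, **kw):
#         super().__init__(corenlp_path, corenlp_port, **kw)
#         self.eventuality_extractor = SeedRuleEventualityExtractor(
#             corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port, **kw
#         )
#         self.relation_extractor = SeedRuleRelationExtractor(**kw)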
class SeedRuleASERExtractor(BaseASERExtractor):
    """ ASER Extractor based on rules to extract both eventualities and relations (for ASER v1.0)
    """

    def __init__(self, corenlp_path="", corenlp_port=0, **kw):
        if "annotators" not in kw:
            kw["annotators"] = list(ANNOTATORS)
        # The seed-rule extractors operate on dependency parses, so require
        # "depparse" and drop the constituency parser.
        if "parse" in kw["annotators"]:
            kw["annotators"].remove("parse")
        if "depparse" not in kw["annotators"]:
            kw["annotators"].append("depparse")
        super().__init__(corenlp_path, corenlp_port, **kw)

        from .rule import CLAUSE_WORDS
        self.eventuality_extractor = SeedRuleEventualityExtractor(
            corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port, skip_words=CLAUSE_WORDS, **kw
        )
        self.relation_extractor = SeedRuleRelationExtractor(**kw)
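# A minimal usage sketch for SeedRuleASERExtractor (illustrative, not part of
# the module); the CoreNLP path and port are placeholders for a local setup.
#
# extractor = SeedRuleASERExtractor(
#     corenlp_path="/path/to/stanford-corenlp-3.9.2", corenlp_port=9000
# )
# try:
#     eventualities = extractor.extract_eventualities_from_text(
#         "My army will find your boat.", output_format="Eventuality"
#     )
#     relations = extractor.extract_relations_from_text(
#         "My army will find your boat. In the meantime, I'm sure we could "
#         "find you suitable accommodations.", output_format="Relation"
#     )
# finally:
#     extractor.close()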
class DiscourseASERExtractor(BaseASERExtractor):
    """ ASER Extractor based on discourse parsing to extract both eventualities and relations (for ASER v2.0)
    """

    def __init__(self, corenlp_path="", corenlp_port=0, **kw):
        if "annotators" not in kw:
            kw["annotators"] = list(ANNOTATORS)
        # The discourse-based extractors operate on constituency parse trees,
        # so require "parse" and drop the dependency parser.
        if "depparse" in kw["annotators"]:
            kw["annotators"].remove("depparse")
        if "parse" not in kw["annotators"]:
            kw["annotators"].append("parse")
        super().__init__(corenlp_path, corenlp_port, **kw)

        self.eventuality_extractor = DiscourseEventualityExtractor(
            corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port, **kw
        )
        self.relation_extractor = DiscourseRelationExtractor(**kw)
    def extract_from_parsed_result(
        self, parsed_result, eventuality_output_format="Eventuality", relation_output_format="Relation",
        in_order=True, use_lemma=True, **kw
    ):
        """ Extract both eventualities and relations from a parsed result

        :param parsed_result: the parsed result returned by corenlp
        :type parsed_result: List[Dict[str, object]]
        :param eventuality_output_format: which format to return eventualities, "Eventuality" or "json"
        :type eventuality_output_format: str (default = "Eventuality")
        :param relation_output_format: which format to return relations, "Relation" or "triplet"
        :type relation_output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventualities use lemmas
        :type use_lemma: bool (default = True)
        :param kw: other parameters (e.g., syntax_tree_cache)
        :type kw: Dict[str, object]
        :return: the extracted eventualities and relations
        :rtype: Tuple[Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]], Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]]
        """

        # Share one syntax-tree cache between the eventuality extractor and the
        # relation extractor so each sentence is converted to a tree only once.
        if "syntax_tree_cache" not in kw:
            kw["syntax_tree_cache"] = dict()
        return super().extract_from_parsed_result(
            parsed_result,
            eventuality_output_format=eventuality_output_format,
            relation_output_format=relation_output_format,
            in_order=in_order,
            use_lemma=use_lemma,
            **kw
        )
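# A minimal usage sketch for DiscourseASERExtractor (illustrative, not part of
# the module): parse once, then extract eventualities and relations in one
# pass; the CoreNLP path and port are placeholders for a local setup.
#
# extractor = DiscourseASERExtractor(
#     corenlp_path="/path/to/stanford-corenlp-3.9.2", corenlp_port=9000
# )
# try:
#     parsed_result = extractor.parse_text(
#         "My army will find your boat. In the meantime, I'm sure we could "
#         "find you suitable accommodations."
#     )
#     eventualities, relations = extractor.extract_from_parsed_result(
#         parsed_result,
#         eventuality_output_format="Eventuality",
#         relation_output_format="Relation",
#         in_order=True,
#     )
# finally:
#     extractor.close()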
# The following extractor can cover more eventualities, but the semantic meaning may be incomplete.

# class DiscourseASERExtractor(BaseASERExtractor):
#     def __init__(self, corenlp_path="", corenlp_port=0, **kw):
#         super().__init__(corenlp_path, corenlp_port, **kw)
#         self.eventuality_extractor = SeedRuleEventualityExtractor(**kw)
#         self.conn_extractor = ConnectiveExtractor(**kw)
#         self.argpos_classifier = ArgumentPositionClassifier(**kw)
#         self.ss_extractor = SSArgumentExtractor(**kw)
#         self.ps_extractor = PSArgumentExtractor(**kw)
#         self.explicit_classifier = ExplicitSenseClassifier(**kw)

#     def _extract_eventualities_from_clause(self, sent_parsed_result, clause, use_lemma):
#         len_clause = len(clause)
#         idx_mapping = {j: i for i, j in enumerate(clause)}
#         indices_set = set(clause)
#         clause_parsed_result = {
#             "text": "",
#             "dependencies": [(idx_mapping[dep[0]], dep[1], idx_mapping[dep[2]]) for dep in sent_parsed_result["dependencies"] \
#                 if dep[0] in indices_set and dep[2] in indices_set],
#             "tokens": [sent_parsed_result["tokens"][idx] for idx in clause],
#             "pos_tags": [sent_parsed_result["pos_tags"][idx] for idx in clause],
#             "lemmas": [sent_parsed_result["lemmas"][idx] for idx in clause]}
#         if "ners" in sent_parsed_result:
#             clause_parsed_result["ners"] = [sent_parsed_result["ners"][idx] for idx in clause]
#         if "mentions" in sent_parsed_result:
#             clause_parsed_result["mentions"] = list()
#             for mention in sent_parsed_result["mentions"]:
#                 start_idx = bisect.bisect_left(clause, mention["start"])
#                 if not (start_idx < len_clause and clause[start_idx] == mention["start"]):
#                     continue
#                 end_idx = bisect.bisect_left(clause, mention["end"]-1)
#                 if not (end_idx < len_clause and clause[end_idx] == mention["end"]-1):
#                     continue
#                 mention = copy(mention)
#                 mention["start"] = start_idx
#                 mention["end"] = end_idx+1
#                 clause_parsed_result["mentions"].append(mention)
#         eventualities = self.eventuality_extractor.extract_from_parsed_result(
#             clause_parsed_result, output_format="Eventuality", in_order=True, use_lemma=use_lemma)
#         for eventuality in eventualities:
#             for k, v in eventuality.raw_sent_mapping.items():
#                 eventuality.raw_sent_mapping[k] = clause[v]
#             eventuality.eid = Eventuality.generate_eid(eventuality)
#         return eventualities

#     def _append_new_eventuaities_to_list(self, existed_eventualities, new_eventualities):
#         len_existed_eventualities = len(existed_eventualities)
#         for new_e in new_eventualities:
#             is_existed = False
#             for old_idx in range(len_existed_eventualities):
#                 old_e = existed_eventualities[old_idx]
#                 if old_e.eid == new_e.eid and old_e.raw_sent_mapping == new_e.raw_sent_mapping:
#                     is_existed = True
#                     break
#             if not is_existed:
#                 existed_eventualities.append(new_e)

#     def extract_eventualities_from_parsed_result(self, parsed_result,
#                                                  output_format="Eventuality", in_order=True, use_lemma=True, **kw):
#         if output_format not in ["Eventuality", "json"]:
#             raise NotImplementedError("Error: extract_from_parsed_result only supports Eventuality or json.")
#         if not isinstance(parsed_result, (list, tuple, dict)):
#             raise NotImplementedError
#         if isinstance(parsed_result, dict):
#             is_single_sent = True
#             parsed_result = [parsed_result]
#         else:
#             is_single_sent = False
#         syntax_tree_cache = kw.get("syntax_tree_cache", dict())
#         para_eventualities = [list() for _ in range(len(parsed_result))]
#         para_clauses = self._extract_clauses(parsed_result, syntax_tree_cache)
#         for sent_parsed_result, sent_clauses, sent_eventualities in zip(parsed_result, para_clauses, para_eventualities):
#             for clause in sent_clauses:
#                 sent_eventualities.extend(self._extract_eventualities_from_clause(sent_parsed_result, clause, use_lemma))
#         if in_order:
#             if output_format == "json":
#                 para_eventualities = [[eventuality.encode(encoding=None) for eventuality in sent_eventualities] \
#                     for sent_eventualities in para_eventualities]
#             if is_single_sent:
#                 return para_eventualities[0]
#             else:
#                 return para_eventualities
#         else:
#             eid2eventuality = dict()
#             for eventuality in chain.from_iterable(para_eventualities):
#                 eid = eventuality.eid
#                 if eid not in eid2eventuality:
#                     eid2eventuality[eid] = deepcopy(eventuality)
#                 else:
#                     eid2eventuality[eid].update(eventuality)
#             if output_format == "Eventuality":
#                 eventualities = sorted(eid2eventuality.values(), key=lambda e: e.eid)
#             elif output_format == "json":
#                 eventualities = sorted([eventuality.encode(encoding=None) for eventuality in eid2eventuality.values()], key=lambda e: e["eid"])
#             return eventualities

#     def extract_relations_from_parsed_result(self, parsed_result, para_eventualities,
#                                              output_format="Relation", in_order=True, **kw):
#         if output_format not in ["Relation", "triplet"]:
#             raise NotImplementedError("Error: extract_relations_from_parsed_result only supports Relation or triplet.")
#         len_sentences = len(parsed_result)
#         if len_sentences == 0:
#             if in_order:
#                 return [list()]
#             else:
#                 return list()
#         similarity = kw.get("similarity", "simpson").lower()
#         threshold = kw.get("threshold", 0.8)
#         if threshold < 0.0 or threshold > 1.0:
#             raise ValueError("Error: threshold should be between 0.0 and 1.0.")
#         if similarity == "simpson":
#             similarity_func = self._match_argument_eventuality_by_Simpson
#         elif similarity == "jaccard":
#             similarity_func = self._match_argument_eventuality_by_Jaccard
#         elif similarity == "discourse":
#             similarity_func = self._match_argument_eventuality_by_dependencies
#         else:
#             raise NotImplementedError("Error: extract_from_parsed_result only supports Simpson or Jaccard.")
#         syntax_tree_cache = kw.get("syntax_tree_cache", dict())
#         para_relations = [list() for _ in range(2*len_sentences-1)]
#         # replace sentences that contain no eventuality with empty sentences
#         filtered_parsed_result = list()
#         for sent_idx, (sent_parsed_result, sent_eventualities) in enumerate(zip(parsed_result, para_eventualities)):
#             if len(sent_eventualities) > 0:
#                 relations_in_sent = para_relations[sent_idx]
#                 for e1_idx in range(len(sent_eventualities)-1):
#                     heid = sent_eventualities[e1_idx].eid
#                     for e2_idx in range(e1_idx+1, len(sent_eventualities)):
#                         teid = sent_eventualities[e2_idx].eid
#                         relations_in_sent.append(Relation(heid, teid, ["Co_Occurrence"]))
#                 filtered_parsed_result.append(sent_parsed_result)
#             else:
#                 filtered_parsed_result.append(EMPTY_SENT_PARSED_RESULT)  # empty sentence
#                 # filtered_parsed_result.append(sent_parsed_result)
#         connectives = self.conn_extractor.extract(filtered_parsed_result, syntax_tree_cache)
#         SS_connectives, PS_connectives = self.argpos_classifier.classify(filtered_parsed_result, connectives, syntax_tree_cache)
#         SS_connectives = self.ss_extractor.extract(filtered_parsed_result, SS_connectives, syntax_tree_cache)
#         PS_connectives = self.ps_extractor.extract(filtered_parsed_result, PS_connectives, syntax_tree_cache)
#         connectives = self.explicit_classifier.classify(filtered_parsed_result, SS_connectives+PS_connectives, syntax_tree_cache)
#         connectives.sort(key=lambda x: (x["sent_idx"], x["indices"][0] if len(x["indices"]) > 0 else -1))
#         for connective in connectives:
#             conn_indices = connective.get("indices", None)
#             arg1 = connective.get("arg1", None)
#             arg2 = connective.get("arg2", None)
#             sense = connective.get("sense", None)
#             if conn_indices and arg1 and arg2 and (sense and sense != "None"):
#                 arg1_sent_idx = arg1["sent_idx"]
#                 arg2_sent_idx = arg2["sent_idx"]
#                 relation_list_idx = arg1_sent_idx if arg1_sent_idx == arg2_sent_idx else arg1_sent_idx + len_sentences
#                 relations = para_relations[relation_list_idx]
#                 sent_parsed_result1, sent_eventualities1 = parsed_result[arg1_sent_idx], para_eventualities[arg1_sent_idx]
#                 sent_parsed_result2, sent_eventualities2 = parsed_result[arg2_sent_idx], para_eventualities[arg2_sent_idx]
#                 arg1_eventualities = [e for e in sent_eventualities1 if \
#                     similarity_func(sent_parsed_result1, arg1, e, threshold=threshold, conn_indices=conn_indices)]
#                 arg2_eventualities = [e for e in sent_eventualities2 if \
#                     similarity_func(sent_parsed_result2, arg2, e, threshold=threshold, conn_indices=conn_indices)]
#                 cnt = 0.0
#                 if len(arg1_eventualities) > 0 and len(arg2_eventualities) > 0:
#                     cnt = 1.0 / (len(arg1_eventualities) * len(arg2_eventualities))
#                 for e1 in arg1_eventualities:
#                     heid = e1.eid
#                     for e2 in arg2_eventualities:
#                         teid = e2.eid
#                         existed_relation = False
#                         for relation in relations:
#                             if relation.hid == heid and relation.tid == teid:
#                                 relation.update({sense: cnt})
#                                 existed_relation = True
#                                 break
#                         if not existed_relation:
#                             relations.append(Relation(heid, teid, {sense: cnt}))
#         if in_order:
#             if output_format == "Relation":
#                 return para_relations
#             elif output_format == "triplet":
#                 return [sorted(chain.from_iterable([r.to_triplets() for r in relations])) \
#                     for relations in para_relations]
#         else:
#             if output_format == "Relation":
#                 rid2relation = dict()
#                 for relation in chain(*para_relations):
#                     if relation.rid not in rid2relation:
#                         rid2relation[relation.rid] = deepcopy(relation)
#                     else:
#                         rid2relation[relation.rid].update(relation)
#                 return sorted(rid2relation.values(), key=lambda r: r.rid)
#             if output_format == "triplet":
#                 return sorted([r.to_triplets() for relations in para_relations for r in relations])

#     def extract_from_parsed_result(self, parsed_result,
#                                    eventuality_output_format="Eventuality",
#                                    relation_output_format="Relation",
#                                    in_order=True, **kw):
#         if eventuality_output_format not in ["Eventuality", "json"]:
#             raise NotImplementedError("Error: extract_eventualities only supports Eventuality or json.")
#         if relation_output_format not in ["Relation", "triplet"]:
#             raise NotImplementedError("Error: extract_relations only supports Relation or triplet.")
#         if not isinstance(parsed_result, (list, tuple, dict)):
#             raise NotImplementedError
#         if isinstance(parsed_result, dict):
#             is_single_sent = True
#             parsed_result = [parsed_result]
#         else:
#             is_single_sent = False
#         syntax_tree_cache = kw.get("syntax_tree_cache", dict())
#         len_sentences = len(parsed_result)
#         para_eventualities = [list() for _ in range(len_sentences)]
#         para_relations = [list() for _ in range(2*len_sentences-1)]
#         connectives = self.conn_extractor.extract(parsed_result, syntax_tree_cache)
#         SS_connectives, PS_connectives = self.argpos_classifier.classify(parsed_result, connectives, syntax_tree_cache)
#         SS_connectives = self.ss_extractor.extract(parsed_result, SS_connectives, syntax_tree_cache)
#         PS_connectives = self.ps_extractor.extract(parsed_result, PS_connectives, syntax_tree_cache)
#         connectives = self.explicit_classifier.classify(parsed_result, SS_connectives+PS_connectives, syntax_tree_cache)
#         connectives.sort(key=lambda x: (x["sent_idx"], x["indices"][0] if len(x["indices"]) > 0 else -1))
#         for connective in connectives:
#             conn_indices = connective.get("indices", None)
#             arg1 = connective.get("arg1", None)
#             arg2 = connective.get("arg2", None)
#             sense = connective.get("sense", None)
#             if conn_indices and arg1 and arg2:
#                 arg1_sent_idx = arg1["sent_idx"]
#                 arg2_sent_idx = arg2["sent_idx"]
#                 senses = []
#                 if arg1_sent_idx == arg2_sent_idx:
#                     senses.append("Co_Occurrence")
#                 if sense and sense != "None":
#                     senses.append(sense)
#                 if len(senses) == 0:
#                     continue
#                 relation_list_idx = arg1_sent_idx if arg1_sent_idx == arg2_sent_idx else arg1_sent_idx + len_sentences
#                 relations = para_relations[relation_list_idx]
#                 sent_parsed_result1, sent_eventualities1 = parsed_result[arg1_sent_idx], para_eventualities[arg1_sent_idx]
#                 sent_parsed_result2, sent_eventualities2 = parsed_result[arg2_sent_idx], para_eventualities[arg2_sent_idx]
#                 arg1_eventualities = self._extract_eventualities_from_clause(sent_parsed_result1, arg1["indices"])
#                 arg2_eventualities = self._extract_eventualities_from_clause(sent_parsed_result2, arg2["indices"])
#                 self._append_new_eventuaities_to_list(sent_eventualities1, arg1_eventualities)
#                 self._append_new_eventuaities_to_list(sent_eventualities2, arg2_eventualities)
#                 cnt = 0.0
#                 if len(arg1_eventualities) > 0 and len(arg2_eventualities) > 0:
#                     cnt = 1.0 / (len(arg1_eventualities) * len(arg2_eventualities))
#                 for e1 in arg1_eventualities:
#                     heid = e1.eid
#                     for e2 in arg2_eventualities:
#                         teid = e2.eid
#                         is_existed = False
#                         for relation in relations:
#                             if relation.hid == heid and relation.tid == teid:
#                                 relation.update({sense: cnt for sense in senses})
#                                 is_existed = True
#                                 break
#                         if not is_existed:
#                             relations.append(Relation(heid, teid, {sense: cnt for sense in senses}))
#         if in_order:
#             if eventuality_output_format == "json":
#                 para_eventualities = [[eventuality.encode(encoding=None) for eventuality in sent_eventualities] \
#                     for sent_eventualities in para_eventualities]
#             if relation_output_format == "triplet":
#                 relations = [list(chain.from_iterable([relation.to_triplet() for relation in sent_relations])) \
#                     for sent_relations in para_relations]
#             if is_single_sent:
#                 return para_eventualities[0], para_relations[0]
#             else:
#                 return para_eventualities, para_relations
#         else:
#             eid2eventuality = dict()
#             for eventuality in chain.from_iterable(para_eventualities):
#                 eid = eventuality.eid
#                 if eid not in eid2eventuality:
#                     eid2eventuality[eid] = deepcopy(eventuality)
#                 else:
#                     eid2eventuality[eid].update(eventuality)
#             if eventuality_output_format == "Eventuality":
#                 eventualities = sorted(eid2eventuality.values(), key=lambda e: e.eid)
#             elif eventuality_output_format == "json":
#                 eventualities = sorted([eventuality.encode(encoding=None) for eventuality in eid2eventuality.values()], key=lambda e: e["eid"])
#             rid2relation = dict()
#             for relation in chain.from_iterable(para_relations):
#                 if relation.rid not in rid2relation:
#                     rid2relation[relation.rid] = deepcopy(relation)
#                 else:
#                     rid2relation[relation.rid].update(relation)
#             if relation_output_format == "Relation":
#                 relations = sorted(rid2relation.values(), key=lambda r: r.rid)
#             elif relation_output_format == "triplet":
#                 relations = sorted(chain.from_iterable([relation.to_triplets() for relation in rid2relation.values()]))
#             return eventualities, relations

#     def _extract_clauses(self, parsed_result, syntax_tree_cache):
#         para_arguments = [set() for _ in range(len(parsed_result))]
#         connectives = self.conn_extractor.extract(parsed_result, syntax_tree_cache)
#         para_connectives = [set() for _ in range(len(parsed_result))]
#         for connective in connectives:
#             sent_idx, indices = connective["sent_idx"], tuple(connective["indices"])
#             para_connectives[sent_idx].add(indices)
#         for sent_idx, sent_parsed_result in enumerate(parsed_result):
#             sent_connectives = para_connectives[sent_idx]
#             sent_arguments = para_arguments[sent_idx]
#             if sent_idx in syntax_tree_cache:
#                 syntax_tree = syntax_tree_cache[sent_idx]
#             else:
#                 syntax_tree = syntax_tree_cache[sent_idx] = SyntaxTree(sent_parsed_result["parse"])
#             # more but slower
#             # for indices in powerset(sent_connectives):
#             #     indices = set(chain.from_iterable(indices))
#             #     sent_arguments.update(get_clauses(sent_parsed_result, syntax_tree, sep_indices=indices))
#             sent_arguments.update(get_clauses(sent_parsed_result, syntax_tree, sep_indices=set(chain.from_iterable(sent_connectives))))
#         return para_arguments