# Source code for aser.extract.aser_extractor
from copy import copy, deepcopy
from itertools import chain
from .eventuality_extractor import SeedRuleEventualityExtractor, DiscourseEventualityExtractor
from .relation_extractor import SeedRuleRelationExtractor, DiscourseRelationExtractor
from .utils import parse_sentense_with_stanford, get_corenlp_client
from .utils import ANNOTATORS
class BaseASERExtractor(object):
    """ Base ASER Extractor to extract both eventualities and relations.
    It includes an instance of `BaseEventualityExtractor` and an instance of
    `BaseRelationExtractor`; subclasses are responsible for assigning them.
    """

    def __init__(self, corenlp_path="", corenlp_port=0, **kw):
        """
        :param corenlp_path: corenlp path, e.g., /home/xliucr/stanford-corenlp-3.9.2
        :type corenlp_path: str (default = "")
        :param corenlp_port: corenlp port, e.g., 9000
        :type corenlp_port: int (default = 0)
        :param kw: other parameters (e.g., "annotators")
        :type kw: Dict[str, object]
        """
        self.corenlp_path = corenlp_path
        self.corenlp_port = corenlp_port
        self.annotators = kw.get("annotators", list(ANNOTATORS))
        # Start (or attach to) a CoreNLP server; remember whether it is an
        # externally-managed server so close() only stops servers we started.
        _, self.is_externel_corenlp = get_corenlp_client(corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port)
        self.eventuality_extractor = None
        self.relation_extractor = None

    def close(self):
        """ Close the extractor safely
        """
        if not self.is_externel_corenlp:
            # Only stop the CoreNLP server if this object launched it.
            corenlp_client, _ = get_corenlp_client(corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port)
            corenlp_client.stop()
        if self.eventuality_extractor:
            self.eventuality_extractor.close()
        if self.relation_extractor:
            self.relation_extractor.close()

    def __del__(self):
        self.close()

    def parse_text(self, text, annotators=None):
        """ Parse a raw text by corenlp

        :param text: a raw text
        :type text: str
        :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
        :type annotators: Union[List, None] (default = None)
        :return: the parsed result: one dict per sentence with keys such as
            "dependencies", "lemmas", "mentions", "ners", "parse", "pos_tags",
            "text", and "tokens"
        :rtype: List[Dict[str, object]]
        """
        if annotators is None:
            annotators = self.annotators
        corenlp_client, _ = get_corenlp_client(
            corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port, annotators=annotators
        )
        # Bug fix: pass the resolved ``annotators`` instead of always using
        # ``self.annotators``; otherwise a per-call annotator list is fetched
        # for the client but silently ignored during parsing.
        parsed_result = parse_sentense_with_stanford(text, corenlp_client, annotators)
        return parsed_result

    def extract_eventualities_from_parsed_result(
        self, parsed_result, output_format="Eventuality", in_order=True, use_lemma=True, **kw
    ):
        """ Extract eventualities from the parsed result

        :param parsed_result: the parsed result returned by corenlp
        :type parsed_result: List[Dict[str, object]]
        :param output_format: which format to return, "Eventuality" or "json"
        :type output_format: str (default = "Eventuality")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventuality uses lemma
        :type use_lemma: bool (default = True)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted eventualities
        :rtype: Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]]
        """
        # NOTE(review): sibling methods raise NotImplementedError for the same
        # check; ValueError is kept here to avoid changing what callers catch.
        if output_format not in ["Eventuality", "json"]:
            raise ValueError(
                "Error: extract_eventualities_from_parsed_result only supports Eventuality or json."
            )
        return self.eventuality_extractor.extract_from_parsed_result(
            parsed_result, output_format=output_format, in_order=in_order, use_lemma=use_lemma, **kw
        )

    def extract_eventualities_from_text(
        self, text, output_format="Eventuality", in_order=True, use_lemma=True, annotators=None, **kw
    ):
        """ Extract eventualities from a raw text

        :param text: a raw text
        :type text: str
        :param output_format: which format to return, "Eventuality" or "json"
        :type output_format: str (default = "Eventuality")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventuality uses lemma
        :type use_lemma: bool (default = True)
        :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
        :type annotators: Union[List, None] (default = None)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted eventualities
        :rtype: Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]]
        """
        if output_format not in ["Eventuality", "json"]:
            raise NotImplementedError("Error: extract_eventualities_from_text only supports Eventuality or json.")
        parsed_result = self.parse_text(text, annotators=annotators)
        return self.extract_eventualities_from_parsed_result(
            parsed_result, output_format=output_format, in_order=in_order, use_lemma=use_lemma, **kw
        )

    def extract_relations_from_parsed_result(
        self, parsed_result, para_eventualities, output_format="Relation", in_order=True, **kw
    ):
        """ Extract relations from a parsed result (of a paragraph) and extracted eventualities

        :param parsed_result: the parsed result returned by corenlp
        :type parsed_result: List[Dict[str, object]]
        :param para_eventualities: eventualities in the paragraph
        :type para_eventualities: List[aser.eventuality.Eventuality]
        :param output_format: which format to return, "Relation" or "triplet"
        :type output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted relations
        :rtype: Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]
        """
        if output_format not in ["Relation", "triplet"]:
            raise NotImplementedError("Error: extract_relations_from_parsed_result only supports Relation or triplet.")
        return self.relation_extractor.extract_from_parsed_result(
            parsed_result, para_eventualities, output_format=output_format, in_order=in_order, **kw
        )

    def extract_relations_from_text(self, text, output_format="Relation", in_order=True, annotators=None, **kw):
        """ Extract relations from a raw text and extracted eventualities

        :param text: a raw text
        :type text: str
        :param output_format: which format to return, "Relation" or "triplet"
        :type output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
        :type annotators: Union[List, None] (default = None)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted relations
        :rtype: Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]
        """
        if output_format not in ["Relation", "triplet"]:
            raise NotImplementedError("Error: extract_relations_from_text only supports Relation or triplet.")
        parsed_result = self.parse_text(text, annotators=annotators)
        para_eventualities = self.extract_eventualities_from_parsed_result(parsed_result)
        return self.extract_relations_from_parsed_result(
            parsed_result, para_eventualities, output_format=output_format, in_order=in_order, **kw
        )

    def extract_from_parsed_result(
        self,
        parsed_result,
        eventuality_output_format="Eventuality",
        relation_output_format="Relation",
        in_order=True,
        use_lemma=True,
        **kw
    ):
        """ Extract both eventualities and relations from a parsed result

        :param parsed_result: the parsed result returned by corenlp; a single
            sentence dict is also accepted and unwrapped on return
        :type parsed_result: Union[List[Dict[str, object]], Dict[str, object]]
        :param eventuality_output_format: which format to return eventualities, "Eventuality" or "json"
        :type eventuality_output_format: str (default = "Eventuality")
        :param relation_output_format: which format to return relations, "Relation" or "triplet"
        :type relation_output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventuality uses lemma
        :type use_lemma: bool (default = True)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted eventualities and relations
        :rtype: Tuple[Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]], Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]]
        """
        if eventuality_output_format not in ["Eventuality", "json"]:
            raise NotImplementedError("Error: extract_eventualities only supports Eventuality or json.")
        if relation_output_format not in ["Relation", "triplet"]:
            raise NotImplementedError("Error: extract_relations only supports Relation or triplet.")
        if not isinstance(parsed_result, (list, tuple, dict)):
            raise NotImplementedError
        if isinstance(parsed_result, dict):
            # A bare sentence dict is wrapped so the extractors always see a
            # paragraph; the single-sentence results are unwrapped below.
            is_single_sent = True
            parsed_result = [parsed_result]
        else:
            is_single_sent = False
        # Always extract in object form and in order first; converting to the
        # requested output format is done afterwards.
        para_eventualities = self.extract_eventualities_from_parsed_result(
            parsed_result, output_format="Eventuality", in_order=True, use_lemma=use_lemma, **kw
        )
        para_relations = self.extract_relations_from_parsed_result(
            parsed_result, para_eventualities, output_format="Relation", in_order=True, **kw
        )
        if in_order:
            if eventuality_output_format == "json":
                para_eventualities = [[eventuality.encode(encoding=None) for eventuality in sent_eventualities] \
                    for sent_eventualities in para_eventualities]
            if relation_output_format == "triplet":
                # Bug fix: call ``to_triplets()`` to match the unordered branch
                # below (the previous ``to_triplet`` was inconsistent).
                para_relations = [list(chain.from_iterable([relation.to_triplets() for relation in sent_relations])) \
                    for sent_relations in para_relations]
            if is_single_sent:
                return para_eventualities[0], para_relations[0]
            else:
                return para_eventualities, para_relations
        else:
            # Merge duplicated eventualities across sentences by eid.
            eid2eventuality = dict()
            for eventuality in chain.from_iterable(para_eventualities):
                eid = eventuality.eid
                if eid not in eid2eventuality:
                    eid2eventuality[eid] = deepcopy(eventuality)
                else:
                    eid2eventuality[eid].update(eventuality)
            if eventuality_output_format == "Eventuality":
                eventualities = sorted(eid2eventuality.values(), key=lambda e: e.eid)
            elif eventuality_output_format == "json":
                eventualities = sorted(
                    [eventuality.encode(encoding=None) for eventuality in eid2eventuality.values()],
                    key=lambda e: e["eid"]
                )
            # Merge duplicated relations by rid.
            rid2relation = dict()
            for relation in chain.from_iterable(para_relations):
                if relation.rid not in rid2relation:
                    rid2relation[relation.rid] = deepcopy(relation)
                else:
                    rid2relation[relation.rid].update(relation)
            if relation_output_format == "Relation":
                para_relations = sorted(rid2relation.values(), key=lambda r: r.rid)
            elif relation_output_format == "triplet":
                para_relations = sorted(chain.from_iterable([relation.to_triplets() for relation in rid2relation.values()]))
            return eventualities, para_relations

    def extract_from_text(
        self,
        text,
        eventuality_output_format="Eventuality",
        relation_output_format="Relation",
        in_order=True,
        use_lemma=True,
        annotators=None,
        **kw
    ):
        """ Extract both eventualities and relations from a raw text

        :param text: a raw text
        :type text: str
        :param eventuality_output_format: which format to return eventualities, "Eventuality" or "json"
        :type eventuality_output_format: str (default = "Eventuality")
        :param relation_output_format: which format to return relations, "Relation" or "triplet"
        :type relation_output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventuality uses lemma
        :type use_lemma: bool (default = True)
        :param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
        :type annotators: Union[List, None] (default = None)
        :param kw: other parameters
        :type kw: Dict[str, object]
        :return: the extracted eventualities and relations
        :rtype: Tuple[Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]], Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]]
        """
        if eventuality_output_format not in ["Eventuality", "json"]:
            raise NotImplementedError("Error: extract_eventualities only supports Eventuality or json.")
        if relation_output_format not in ["Relation", "triplet"]:
            raise NotImplementedError("Error: extract_relations only supports Relation or triplet.")
        parsed_result = self.parse_text(text, annotators=annotators)
        return self.extract_from_parsed_result(
            parsed_result,
            eventuality_output_format=eventuality_output_format,
            relation_output_format=relation_output_format,
            in_order=in_order,
            use_lemma=use_lemma,
            **kw
        )
class SeedRuleASERExtractor(BaseASERExtractor):
    """ ASER Extractor that applies seed rules to extract both eventualities
    and relations (used for ASER v1.0)
    """

    def __init__(self, corenlp_path="", corenlp_port=0, **kw):
        # The seed rules operate on dependency parses, so request "depparse"
        # and drop the constituency "parse" annotator.
        if "annotators" not in kw:
            kw["annotators"] = list(ANNOTATORS)
        annotators = kw["annotators"]
        if "parse" in annotators:
            annotators.remove("parse")
        if "depparse" not in annotators:
            annotators.append("depparse")
        super().__init__(corenlp_path, corenlp_port, **kw)
        from .rule import CLAUSE_WORDS
        self.eventuality_extractor = SeedRuleEventualityExtractor(
            corenlp_path=self.corenlp_path,
            corenlp_port=self.corenlp_port,
            skip_words=CLAUSE_WORDS,
            **kw
        )
        self.relation_extractor = SeedRuleRelationExtractor(**kw)
class DiscourseASERExtractor(BaseASERExtractor):
    """ ASER Extractor that relies on discourse parsing to extract both
    eventualities and relations (used for ASER v2.0)
    """

    def __init__(self, corenlp_path="", corenlp_port=0, **kw):
        # Discourse parsing needs constituency trees, so request "parse"
        # and drop the "depparse" annotator.
        if "annotators" not in kw:
            kw["annotators"] = list(ANNOTATORS)
        annotators = kw["annotators"]
        if "depparse" in annotators:
            annotators.remove("depparse")
        if "parse" not in annotators:
            annotators.append("parse")
        super().__init__(corenlp_path, corenlp_port, **kw)
        self.eventuality_extractor = DiscourseEventualityExtractor(
            corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port, **kw
        )
        self.relation_extractor = DiscourseRelationExtractor(**kw)

    def extract_from_parsed_result(
        self,
        parsed_result,
        eventuality_output_format="Eventuality",
        relation_output_format="Relation",
        in_order=True,
        use_lemma=True,
        **kw
    ):
        """ Extract both eventualities and relations from a parsed result

        :param parsed_result: the parsed result returned by corenlp
        :type parsed_result: List[Dict[str, object]]
        :param eventuality_output_format: which format to return eventualities, "Eventuality" or "json"
        :type eventuality_output_format: str (default = "Eventuality")
        :param relation_output_format: which format to return relations, "Relation" or "triplet"
        :type relation_output_format: str (default = "Relation")
        :param in_order: whether the returned order follows the input token order
        :type in_order: bool (default = True)
        :param use_lemma: whether the returned eventuality uses lemma
        :type use_lemma: bool (default = True)
        :param kw: other parameters (e.g., syntax_tree_cache)
        :type kw: Dict[str, object]
        :return: the extracted eventualities and relations
        :rtype: Tuple[Union[List[List[aser.eventuality.Eventuality]], List[List[Dict[str, object]]], List[aser.eventuality.Eventuality], List[Dict[str, object]]], Union[List[List[aser.relation.Relation]], List[List[Dict[str, object]]], List[aser.relation.Relation], List[Dict[str, object]]]]
        """
        # Share one syntax-tree cache between the eventuality and relation
        # extraction passes so each sentence is parsed into a tree only once.
        kw.setdefault("syntax_tree_cache", dict())
        return super().extract_from_parsed_result(
            parsed_result,
            eventuality_output_format=eventuality_output_format,
            relation_output_format=relation_output_format,
            in_order=in_order,
            use_lemma=use_lemma,
            **kw
        )
# The following extractor can cover more eventualities but the semantic meaning may be incomplete.
# class DiscourseASERExtractor(BaseASERExtractor):
# def __init__(self, corenlp_path="", corenlp_port=0, **kw):
# super().__init__(corenlp_path, corenlp_port, **kw)
# self.eventuality_extractor = SeedRuleEventualityExtractor(**kw)
# self.conn_extractor = ConnectiveExtractor(**kw)
# self.argpos_classifier = ArgumentPositionClassifier(**kw)
# self.ss_extractor = SSArgumentExtractor(**kw)
# self.ps_extractor = PSArgumentExtractor(**kw)
# self.explicit_classifier = ExplicitSenseClassifier(**kw)
# def _extract_eventualities_from_clause(self, sent_parsed_result, clause, use_lemma):
# len_clause = len(clause)
# idx_mapping = {j: i for i, j in enumerate(clause)}
# indices_set = set(clause)
# clause_parsed_result = {
# "text": "",
# "dependencies": [(idx_mapping[dep[0]], dep[1], idx_mapping[dep[2]]) for dep in sent_parsed_result["dependencies"] \
# if dep[0] in indices_set and dep[2] in indices_set],
# "tokens": [sent_parsed_result["tokens"][idx] for idx in clause],
# "pos_tags": [sent_parsed_result["pos_tags"][idx] for idx in clause],
# "lemmas": [sent_parsed_result["lemmas"][idx] for idx in clause]}
# if "ners" in sent_parsed_result:
# clause_parsed_result["ners"] = [sent_parsed_result["ners"][idx] for idx in clause]
# if "mentions" in sent_parsed_result:
# clause_parsed_result["mentions"] = list()
# for mention in sent_parsed_result["mentions"]:
# start_idx = bisect.bisect_left(clause, mention["start"])
# if not (start_idx < len_clause and clause[start_idx] == mention["start"]):
# continue
# end_idx = bisect.bisect_left(clause, mention["end"]-1)
# if not (end_idx < len_clause and clause[end_idx] == mention["end"]-1):
# continue
# mention = copy(mention)
# mention["start"] = start_idx
# mention["end"] = end_idx+1
# clause_parsed_result["mentions"].append(mention)
# eventualities = self.eventuality_extractor.extract_from_parsed_result(
# clause_parsed_result, output_format="Eventuality", in_order=True, use_lemma=use_lemma)
# for eventuality in eventualities:
# for k, v in eventuality.raw_sent_mapping.items():
# eventuality.raw_sent_mapping[k] = clause[v]
# eventuality.eid = Eventuality.generate_eid(eventuality)
# return eventualities
# def _append_new_eventuaities_to_list(self, existed_eventualities, new_eventualities):
# len_existed_eventualities = len(existed_eventualities)
# for new_e in new_eventualities:
# is_existed = False
# for old_idx in range(len_existed_eventualities):
# old_e = existed_eventualities[old_idx]
# if old_e.eid == new_e.eid and old_e.raw_sent_mapping == new_e.raw_sent_mapping:
# is_existed = True
# break
# if not is_existed:
# existed_eventualities.append(new_e)
# def extract_eventualities_from_parsed_result(self, parsed_result,
# output_format="Eventuality", in_order=True, use_lemma=True, **kw):
# if output_format not in ["Eventuality", "json"]:
# raise NotImplementedError("Error: extract_from_parsed_result only supports Eventuality or json.")
# if not isinstance(parsed_result, (list, tuple, dict)):
# raise NotImplementedError
# if isinstance(parsed_result, dict):
# is_single_sent = True
# parsed_result = [parsed_result]
# else:
# is_single_sent = False
# syntax_tree_cache = kw.get("syntax_tree_cache", dict())
# para_eventualities = [list() for _ in range(len(parsed_result))]
# para_clauses = self._extract_clauses(parsed_result, syntax_tree_cache)
# for sent_parsed_result, sent_clauses, sent_eventualities in zip(parsed_result, para_clauses, para_eventualities):
# for clause in sent_clauses:
# sent_eventualities.extend(self._extract_eventualities_from_clause(sent_parsed_result, clause, use_lemma))
# if in_order:
# if output_format == "json":
# para_eventualities = [[eventuality.encode(encoding=None) for eventuality in sent_eventualities] \
# for sent_eventualities in para_eventualities]
# if is_single_sent:
# return para_eventualities[0]
# else:
# return para_eventualities
# else:
# eid2eventuality = dict()
# for eventuality in chain.from_iterable(para_eventualities):
# eid = eventuality.eid
# if eid not in eid2eventuality:
# eid2eventuality[eid] = deepcopy(eventuality)
# else:
# eid2eventuality[eid].update(eventuality)
# if output_format == "Eventuality":
# eventualities = sorted(eid2eventuality.values(), key=lambda e: e.eid)
# elif output_format == "json":
# eventualities = sorted([eventuality.encode(encoding=None) for eventuality in eid2eventuality.values()], key=lambda e: e["eid"])
# return eventualities
# def extract_relations_from_parsed_result(self, parsed_result, para_eventualities,
# output_format="Relation",
# in_order=True, **kw):
# if output_format not in ["Relation", "triplet"]:
# raise NotImplementedError("Error: extract_relations_from_parsed_result only supports Relation or triplet.")
# len_sentences = len(parsed_result)
# if len_sentences == 0:
# if in_order:
# return [list()]
# else:
# return list()
# similarity = kw.get("similarity", "simpson").lower()
# threshold = kw.get("threshold", 0.8)
# if threshold < 0.0 or threshold > 1.0:
# raise ValueError("Error: threshold should be between 0.0 and 1.0.")
# if similarity == "simpson":
# similarity_func = self._match_argument_eventuality_by_Simpson
# elif similarity == "jaccard":
# similarity_func = self._match_argument_eventuality_by_Jaccard
# elif similarity == "discourse":
# similarity_func = self._match_argument_eventuality_by_dependencies
# else:
# raise NotImplementedError("Error: extract_from_parsed_result only supports Simpson or Jaccard.")
# syntax_tree_cache = kw.get("syntax_tree_cache", dict())
# para_relations = [list() for _ in range(2*len_sentences-1)]
# # replace sentences that contains no eventuality with empty sentences
# filtered_parsed_result = list()
# for sent_idx, (sent_parsed_result, sent_eventualities) in enumerate(zip(parsed_result, para_eventualities)):
# if len(sent_eventualities) > 0:
# relations_in_sent = para_relations[sent_idx]
# for e1_idx in range(len(sent_eventualities)-1):
# heid = sent_eventualities[e1_idx].eid
# for e2_idx in range(e1_idx+1, len(sent_eventualities)):
# teid = sent_eventualities[e2_idx].eid
# relations_in_sent.append(Relation(heid, teid, ["Co_Occurrence"]))
# filtered_parsed_result.append(sent_parsed_result)
# else:
# filtered_parsed_result.append(EMPTY_SENT_PARSED_RESULT) # empty sentence
# # filtered_parsed_result.append(sent_parsed_result)
# connectives = self.conn_extractor.extract(filtered_parsed_result, syntax_tree_cache)
# SS_connectives, PS_connectives = self.argpos_classifier.classify(filtered_parsed_result, connectives, syntax_tree_cache)
# SS_connectives = self.ss_extractor.extract(filtered_parsed_result, SS_connectives, syntax_tree_cache)
# PS_connectives = self.ps_extractor.extract(filtered_parsed_result, PS_connectives, syntax_tree_cache)
# connectives = self.explicit_classifier.classify(filtered_parsed_result, SS_connectives+PS_connectives, syntax_tree_cache)
# connectives.sort(key=lambda x: (x["sent_idx"], x["indices"][0] if len(x["indices"]) > 0 else -1))
# for connective in connectives:
# conn_indices = connective.get("indices", None)
# arg1 = connective.get("arg1", None)
# arg2 = connective.get("arg2", None)
# sense = connective.get("sense", None)
# if conn_indices and arg1 and arg2 and (sense and sense != "None"):
# arg1_sent_idx = arg1["sent_idx"]
# arg2_sent_idx = arg2["sent_idx"]
# relation_list_idx = arg1_sent_idx if arg1_sent_idx == arg2_sent_idx else arg1_sent_idx + len_sentences
# relations = para_relations[relation_list_idx]
# sent_parsed_result1, sent_eventualities1 = parsed_result[arg1_sent_idx], para_eventualities[arg1_sent_idx]
# sent_parsed_result2, sent_eventualities2 = parsed_result[arg2_sent_idx], para_eventualities[arg2_sent_idx]
# arg1_eventualities = [e for e in sent_eventualities1 if \
# similarity_func(sent_parsed_result1, arg1, e, threshold=threshold, conn_indices=conn_indices)]
# arg2_eventualities = [e for e in sent_eventualities2 if \
# similarity_func(sent_parsed_result2, arg2, e, threshold=threshold, conn_indices=conn_indices)]
# cnt = 0.0
# if len(arg1_eventualities) > 0 and len(arg2_eventualities) > 0:
# cnt = 1.0 / (len(arg1_eventualities) * len(arg2_eventualities))
# for e1 in arg1_eventualities:
# heid = e1.eid
# for e2 in arg2_eventualities:
# teid = e2.eid
# existed_relation = False
# for relation in relations:
# if relation.hid == heid and relation.tid == teid:
# relation.update({sense: cnt})
# existed_relation = True
# break
# if not existed_relation:
# relations.append(Relation(heid, teid, {sense: cnt}))
# if in_order:
# if output_format == "Relation":
# return para_relations
# elif output_format == "triplet":
# return [sorted(chain.from_iterable([r.to_triplets() for r in relations])) \
# for relations in para_relations]
# else:
# if output_format == "Relation":
# rid2relation = dict()
# for relation in chain(*para_relations):
# if relation.rid not in rid2relation:
# rid2relation[relation.rid] = deepcopy(relation)
# else:
# rid2relation[relation.rid].update(relation)
# return sorted(rid2relation.values(), key=lambda r: r.rid)
# if output_format == "triplet":
# return sorted([r.to_triplets() for relations in para_relations for r in relations])
# def extract_from_parsed_result(self, parsed_result,
# eventuality_output_format="Eventuality",
# relation_output_format="Relation",
# in_order=True, **kw):
# if eventuality_output_format not in ["Eventuality", "json"]:
# raise NotImplementedError("Error: extract_eventualities only supports Eventuality or json.")
# if relation_output_format not in ["Relation", "triplet"]:
# raise NotImplementedError("Error: extract_relations only supports Relation or triplet.")
# if not isinstance(parsed_result, (list, tuple, dict)):
# raise NotImplementedError
# if isinstance(parsed_result, dict):
# is_single_sent = True
# parsed_result = [parsed_result]
# else:
# is_single_sent = False
# syntax_tree_cache = kw.get("syntax_tree_cache", dict())
# len_sentences = len(parsed_result)
# para_eventualities = [list() for _ in range(len_sentences)]
# para_relations = [list() for _ in range(2*len_sentences-1)]
# connectives = self.conn_extractor.extract(parsed_result, syntax_tree_cache)
# SS_connectives, PS_connectives = self.argpos_classifier.classify(parsed_result, connectives, syntax_tree_cache)
# SS_connectives = self.ss_extractor.extract(parsed_result, SS_connectives, syntax_tree_cache)
# PS_connectives = self.ps_extractor.extract(parsed_result, PS_connectives, syntax_tree_cache)
# connectives = self.explicit_classifier.classify(parsed_result, SS_connectives+PS_connectives, syntax_tree_cache)
# connectives.sort(key=lambda x: (x["sent_idx"], x["indices"][0] if len(x["indices"]) > 0 else -1))
# for connective in connectives:
# conn_indices = connective.get("indices", None)
# arg1 = connective.get("arg1", None)
# arg2 = connective.get("arg2", None)
# sense = connective.get("sense", None)
# if conn_indices and arg1 and arg2:
# arg1_sent_idx = arg1["sent_idx"]
# arg2_sent_idx = arg2["sent_idx"]
# senses = []
# if arg1_sent_idx == arg2_sent_idx:
# senses.append("Co_Occurrence")
# if sense and sense != "None":
# senses.append(sense)
# if len(senses) == 0:
# continue
# relation_list_idx = arg1_sent_idx if arg1_sent_idx == arg2_sent_idx else arg1_sent_idx + len_sentences
# relations = para_relations[relation_list_idx]
# sent_parsed_result1, sent_eventualities1 = parsed_result[arg1_sent_idx], para_eventualities[arg1_sent_idx]
# sent_parsed_result2, sent_eventualities2 = parsed_result[arg2_sent_idx], para_eventualities[arg2_sent_idx]
# arg1_eventualities = self._extract_eventualities_from_clause(sent_parsed_result1, arg1["indices"])
# arg2_eventualities = self._extract_eventualities_from_clause(sent_parsed_result2, arg2["indices"])
# self._append_new_eventuaities_to_list(sent_eventualities1, arg1_eventualities)
# self._append_new_eventuaities_to_list(sent_eventualities2, arg2_eventualities)
# cnt = 0.0
# if len(arg1_eventualities) > 0 and len(arg2_eventualities) > 0:
# cnt = 1.0 / (len(arg1_eventualities) * len(arg2_eventualities))
# for e1 in arg1_eventualities:
# heid = e1.eid
# for e2 in arg2_eventualities:
# teid = e2.eid
# is_existed = False
# for relation in relations:
# if relation.hid == heid and relation.tid == teid:
# relation.update({sense: cnt for sense in senses})
# is_existed = True
# break
# if not is_existed:
# relations.append(Relation(heid, teid, {sense: cnt for sense in senses}))
# if in_order:
# if eventuality_output_format == "json":
# para_eventualities = [[eventuality.encode(encoding=None) for eventuality in sent_eventualities] \
# for sent_eventualities in para_eventualities]
# if relation_output_format == "triplet":
# relations = [list(chain.from_iterable([relation.to_triplet() for relation in sent_relations])) \
# for sent_relations in para_relations]
# if is_single_sent:
# return para_eventualities[0], para_relations[0]
# else:
# return para_eventualities, para_relations
# else:
# eid2eventuality = dict()
# for eventuality in chain.from_iterable(para_eventualities):
# eid = eventuality.eid
# if eid not in eid2eventuality:
# eid2eventuality[eid] = deepcopy(eventuality)
# else:
# eid2eventuality[eid].update(eventuality)
# if eventuality_output_format == "Eventuality":
# eventualities = sorted(eid2eventuality.values(), key=lambda e: e.eid)
# elif eventuality_output_format == "json":
# eventualities = sorted([eventuality.encode(encoding=None) for eventuality in eid2eventuality.values()], key=lambda e: e["eid"])
# rid2relation = dict()
# for relation in chain.from_iterable(para_relations):
# if relation.rid not in rid2relation:
# rid2relation[relation.rid] = deepcopy(relation)
# else:
# rid2relation[relation.rid].update(relation)
# if relation_output_format == "Relation":
# relations = sorted(rid2relation.values(), key=lambda r: r.rid)
# elif relation_output_format == "triplet":
# relations = sorted(chain.from_iterable([relation.to_triplets() for relation in rid2relation.values()]))
# return eventualities, relations
# def _extract_clauses(self, parsed_result, syntax_tree_cache):
# para_arguments = [set() for _ in range(len(parsed_result))]
# connectives = self.conn_extractor.extract(parsed_result, syntax_tree_cache)
# para_connectives = [set() for _ in range(len(parsed_result))]
# for connective in connectives:
# sent_idx, indices = connective["sent_idx"], tuple(connective["indices"])
# para_connectives[sent_idx].add(indices)
# for sent_idx, sent_parsed_result in enumerate(parsed_result):
# sent_connectives = para_connectives[sent_idx]
# sent_arguments = para_arguments[sent_idx]
# if sent_idx in syntax_tree_cache:
# syntax_tree = syntax_tree_cache[sent_idx]
# else:
# syntax_tree = syntax_tree_cache[sent_idx] = SyntaxTree(sent_parsed_result["parse"])
# # more but slower
# # for indices in powerset(sent_connectives):
# # indices = set(chain.from_iterable(indices))
# # sent_arguments.update(get_clauses(sent_parsed_result, syntax_tree, sep_indices=indices))
# sent_arguments.update(get_clauses(sent_parsed_result, syntax_tree, sep_indices=set(chain.from_iterable(sent_connectives))))
# return para_arguments