import json
from pprint import pprint
class ParsedReader(object):
    """ File reader that reads parsed results back from disk
    """

    def __init__(self):
        pass

    def close(self):
        # Nothing to release; defined so that __del__ can call it safely.
        pass

    def __del__(self):
        self.close()

    def generate_sid(self, sentence, file_name, line_no):
        """ Generate the sentence id for the sentence at a given line of a file

        :param sentence: the raw text (currently unused; the id is built from file_name and line_no)
        :type sentence: str
        :param file_name: the file name
        :type file_name: str
        :param line_no: the line number
        :type line_no: int
        :return: the corresponding sentence id
        :rtype: str
        """

        return file_name + "|" + str(line_no)

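    # The processed file is expected to be a JSON-lines file (illustrative layout,
    # inferred from how the methods below read it):
    #   line 0:    {"sentence_lens": [3, 5, ...]}  -- exclusive end line number of each paragraph
    #   line 1..N: one JSON object per parsed sentence, containing at least a "text" field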
    def get_parsed_paragraphs_from_file(self, processed_path):
        """ Retrieve all paragraphs from a processed file

        :param processed_path: the file path of the processed file
        :type processed_path: str
        :return: a list of paragraphs, each of which is a list of parsed sentence dicts
        :rtype: List[List[Dict[str, object]]]
        """

        with open(processed_path, "r") as f:
            # The first line records the (exclusive) end line number of each paragraph.
            sent_len = json.loads(f.readline())["sentence_lens"]
            paragraphs = list()
            line_no = 1
            para_idx = 0
            while para_idx < len(sent_len):
                paragraph = list()
                end_no = sent_len[para_idx]
                # Each subsequent line holds one parsed sentence as a JSON object.
                while line_no < end_no:
                    sent = json.loads(f.readline())
                    sent["sid"] = self.generate_sid(sent, processed_path, line_no)
                    paragraph.append(sent)
                    line_no += 1
                para_idx += 1
                paragraphs.append(paragraph)
        return paragraphs

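    # Sentence ids follow the "<file path>|<line number>" format produced by generate_sid,
    # e.g. (hypothetical) "processed/yelp.jsonl|1"; the method below splits such an id
    # back into a file path and a line number to locate the sentence on disk.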
    def get_parsed_sentence_and_context(self, sid, context_window_size=0):
        """ Retrieve the parsed results of the corresponding sentence and its context

        :param sid: the sentence id
        :type sid: str
        :param context_window_size: the context window size (default: 0)
        :type context_window_size: int
        :return: a dictionary that contains the "sentence", "left_context", and "right_context"
        :rtype: Dict[str, object]
        """

        file_name, line_no = sid.rsplit("|", 1)
        line_no = int(line_no)
        sent, lctx, rctx = None, list(), list()
        with open(file_name, "r") as f:
            sent_len = json.loads(f.readline())["sentence_lens"]
            if len(sent_len) == 0:
                print("id:{} exceeds the file limit. file:{} is empty".format(sid, file_name))
            elif line_no >= sent_len[-1]:
                print("id:{} exceeds the file limit. file:{} only has {} lines".format(
                    sid, file_name, sent_len[-1] - 1))
            else:
                # Skip the lines before the left context window.
                for _ in range(line_no - 1 - context_window_size):
                    f.readline()
                # Left context, clipped at the beginning of the file.
                lctx_num = line_no - 1 if line_no - context_window_size < 1 else context_window_size
                for l_line_no in range(line_no - lctx_num, line_no):
                    l_sent = json.loads(f.readline())
                    l_sent["sid"] = self.generate_sid(l_sent, file_name, l_line_no)
                    lctx.append(l_sent)
                # The target sentence.
                sent = json.loads(f.readline())
                sent["sid"] = self.generate_sid(sent, file_name, line_no)
                # Right context, clipped at the end of the file.
                rctx_num = sent_len[-1] - line_no - 1 \
                    if line_no + 1 + context_window_size > sent_len[-1] else context_window_size
                for r_line_no in range(line_no + 1, line_no + 1 + rctx_num):
                    r_sent = json.loads(f.readline())
                    r_sent["sid"] = self.generate_sid(r_sent, file_name, r_line_no)
                    rctx.append(r_sent)
        return {"sentence": sent, "left_context": lctx, "right_context": rctx}


if __name__ == "__main__":
    sid = "/Users/sean/OneDrive - HKUST Connect/Documents/HKUST/Research/ASER/example_data/processed/yelp.jsonl|1"
    parse_reader = ParsedReader()
    res = parse_reader.get_parsed_sentence_and_context(sid, 2)
    for k, v in res.items():
        if k == "sentence":
            if v is None:
                pprint({k: v})
            else:
                pprint({k: v["text"]})
        else:
            if v is None:
                pprint({k: v})
            else:
                pprint({k: [i["text"] for i in v]})
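
    # A minimal sketch of reading whole paragraphs back, reusing the example file above;
    # the path embedded in `sid` is only an example and must exist on disk for this to run.
    processed_path = sid.rsplit("|", 1)[0]
    paragraphs = parse_reader.get_parsed_paragraphs_from_file(processed_path)
    for para_idx, paragraph in enumerate(paragraphs):
        pprint({para_idx: [sent["text"] for sent in paragraph]})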