Source code for aser.extract.sentence_parser

import time
import json
from .utils import get_corenlp_client, parse_sentense_with_stanford
from .utils import ANNOTATORS, MAX_LEN

class SentenceParser:
    """Sentence parser to process files that contain raw texts.

    The parser obtains a CoreNLP client through ``get_corenlp_client`` and
    remembers whether that client is external, so that ``close`` only stops
    a client that is not flagged as external.
    """

    def __init__(self, corenlp_path="", corenlp_port=0, **kw):
        """
        :param corenlp_path: corenlp path, e.g., /home/xliucr/stanford-corenlp-3.9.2
        :type corenlp_path: str (default = "")
        :param corenlp_port: corenlp port, e.g., 9000
        :type corenlp_port: int (default = 0)
        :param kw: other parameters; only ``annotators`` is read here
        :type kw: Dict[str, object]
        """
        self.corenlp_path = corenlp_path
        self.corenlp_port = corenlp_port
        self.annotators = kw.get("annotators", list(ANNOTATORS))
        # Only the "is external" flag is kept; the client itself is looked up
        # again through get_corenlp_client whenever it is needed.
        _, self.is_externel_corenlp = get_corenlp_client(
            corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port
        )

    def close(self):
        """Close the parser safely.

        The client is stopped only when it is not flagged as external.
        """
        # __del__ also runs on instances whose __init__ raised before the
        # flag was assigned; in that case there is nothing to stop.
        if getattr(self, "is_externel_corenlp", True):
            return
        corenlp_client, _ = get_corenlp_client(
            corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port
        )
        corenlp_client.stop()

    def __del__(self):
        self.close()

    def generate_sid(self, sentence, file_name, sid):
        """Build the sentence id ``"<file_name>|<sid>"``.

        :param sentence: the parsed sentence (not used to build the id)
        :type sentence: object
        :param file_name: the file name
        :type file_name: str
        :param sid: the running sentence number
        :type sid: int
        :return: the corresponding sentence id
        :rtype: str
        """
        return file_name + "|" + str(sid)

    @staticmethod
    def _read_paragraphs(raw_path):
        """Read ``raw_path`` and return its non-empty paragraphs as strings.

        A line that starts with ``.START`` or that is exactly ``"\\n"`` ends
        the current paragraph; every other line is appended to it verbatim.
        """
        paragraphs = []
        current = []
        with open(raw_path, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                if line.startswith(".START") or line == "\n":
                    if current:
                        paragraphs.append("".join(current))
                        current = []
                else:
                    current.append(line)
        if current:
            paragraphs.append("".join(current))
        return paragraphs

    def parse_raw_file(self, raw_path, processed_path=None, annotators=None, max_len=MAX_LEN):
        """Parse all raw texts in the given file.

        :param raw_path: the file path that contains raw texts
        :type raw_path: str
        :param processed_path: the file path that stores the parsed result;
            nothing is written when it is None or empty
        :type processed_path: Union[str, None] (default = None)
        :param annotators: annotators for corenlp; ``self.annotators`` is used when None
        :type annotators: Union[List, None] (default = None)
        :param max_len: the max length of a paragraph (constituency parsing cannot handle super-long sentences)
        :type max_len: int (default = MAX_LEN)
        :return: the parsed result, one list of sentences per paragraph
        :rtype: List[List[Dict[str, object]]]
        """
        if annotators is None:
            annotators = self.annotators

        try:
            paragraphs = self._read_paragraphs(raw_path)
        except BaseException as e:
            # Report which file failed, then let the original error propagate.
            print(raw_path)
            print(e)
            raise

        # A single over-long paragraph is broken up line by line.
        if len(paragraphs) == 1 and len(paragraphs[0]) > max_len:
            paragraphs = paragraphs[0].split("\n")

        sid = 1
        para_lens = []
        for i, raw_para in enumerate(paragraphs):
            parsed = self.parse(raw_para, annotators=annotators, max_len=max_len)
            paragraphs[i] = parsed
            # sid has not been advanced for this paragraph yet, so this is the
            # sid that follows the paragraph's last sentence.
            para_lens.append(len(parsed) + sid)
            if processed_path:
                # File-level ids replace the per-paragraph ids set by parse().
                for sent in parsed:
                    sent["sid"] = self.generate_sid(sent, processed_path, sid)
                    sid += 1

        # Only touch the filesystem when an output path was actually given;
        # with the default None there is nothing to open.
        if processed_path:
            with open(processed_path, "w", encoding="utf-8") as f:
                f.write(json.dumps({"sentence_lens": para_lens}))
                f.write("\n")
                for para in paragraphs:
                    for sent in para:
                        f.write(json.dumps(sent))
                        f.write("\n")
        return paragraphs

    def parse(self, paragraph, annotators=None, max_len=MAX_LEN):
        """Parse one raw text and number its sentences from 1.

        :param paragraph: a raw text
        :type paragraph: str
        :param annotators: annotators for corenlp; ``self.annotators`` is used when None
        :type annotators: Union[List, None] (default = None)
        :param max_len: the max length of a paragraph (constituency parsing cannot handle super-long sentences)
        :type max_len: int (default = MAX_LEN)
        :return: the parsed result; each sentence gets ``"sid"`` set to ``"|<index>"``
        :rtype: List[Dict[str, object]]
        """
        if annotators is None:
            annotators = self.annotators

        corenlp_client, _ = get_corenlp_client(
            corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port, annotators=annotators
        )
        parsed_para = parse_sentense_with_stanford(paragraph, corenlp_client, annotators, max_len)
        for sent_idx, sent in enumerate(parsed_para, start=1):
            sent["sid"] = self.generate_sid(sent, "", sent_idx)
        return parsed_para
if __name__ == "__main__":
    # Demo run: parse one example file, write the result as JSON lines, and
    # report the token count together with the elapsed wall-clock time.
    raw_path = "/Users/sean/OneDrive - HKUST Connect/Documents/HKUST/Research/ASER/example_data/raw/yelp.txt"
    processed_path = "/Users/sean/OneDrive - HKUST Connect/Documents/HKUST/Research/ASER/example_data/processed/yelp.jsonl"
    parser = SentenceParser(corenlp_path="", corenlp_port=9000, annotators=list(ANNOTATORS))

    started_at = time.time()
    parsed_paragraphs = parser.parse_raw_file(raw_path, processed_path)
    finished_at = time.time()

    # Count tokens after the timer has stopped so counting is not timed.
    token_total = 0
    for para in parsed_paragraphs:
        for sent in para:
            token_total += len(sent["tokens"])

    print("# Tokens: %d\tTime: %.4fs" % (token_total, finished_at - started_at))