import time
import json
from .utils import get_corenlp_client, parse_sentense_with_stanford
from .utils import ANNOTATORS, MAX_LEN
class SentenceParser:
""" Sentence parser to process files that contain raw texts
"""
def __init__(self, corenlp_path="", corenlp_port=0, **kw):
"""
:param corenlp_path: corenlp path, e.g., /home/xliucr/stanford-corenlp-3.9.2
:type corenlp_path: str (default = "")
:param corenlp_port: corenlp port, e.g., 9000
:type corenlp_port: int (default = 0)
:param kw: other parameters
:type kw: Dict[str, object]
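
        For example, ``kw`` may carry an ``annotators`` list that overrides the default
        ``ANNOTATORS`` (a sketch; the CoreNLP location is a placeholder)::

            parser = SentenceParser(
                corenlp_path="/path/to/stanford-corenlp-3.9.2",
                corenlp_port=9000,
                annotators=["tokenize", "ssplit", "pos", "lemma", "parse"],
            )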
"""
self.corenlp_path = corenlp_path
self.corenlp_port = corenlp_port
self.annotators = kw.get("annotators", list(ANNOTATORS))
        # start (or attach to) a CoreNLP server and remember whether it is externally managed
        _, self.is_externel_corenlp = get_corenlp_client(corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port)
    def close(self):
""" Close the parser safely
"""
        # only stop the CoreNLP server if this parser started it itself
        if not self.is_externel_corenlp:
            corenlp_client, _ = get_corenlp_client(corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port)
            corenlp_client.stop()
def __del__(self):
self.close()
    def generate_sid(self, sentence, file_name, sid):
"""
:param sentence: the raw text
:type sentence: str
:param file_name: the file name
:type file_name: str
        :param sid: the index of the sentence in the file
        :type sid: int
:return: the corresponding sentence id
:rtype: str
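
        For example, ``generate_sid(sentence, "yelp.jsonl", 3)`` returns ``"yelp.jsonl|3"``.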
"""
return file_name + "|" + str(sid)
    def parse_raw_file(self, raw_path, processed_path=None, annotators=None, max_len=MAX_LEN):
""" Parse all raw texts in the given file
:param raw_path: the file path that contains raw texts
:type raw_path: str
        :param processed_path: the file path that stores the parsed result (if None, nothing is written to disk)
        :type processed_path: Union[str, None] (default = None)
:param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
:type annotators: Union[List, None] (default = None)
:param max_len: the max length of a paragraph (constituency parsing cannot handle super-long sentences)
:type max_len: int (default = 1024)
:return: the parsed result
:rtype: List[List[Dict[str, object]]]
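
        A usage sketch (file names are placeholders); when ``processed_path`` is given, the
        output file holds one JSON object per line, starting with the paragraph offsets::

            parser = SentenceParser(corenlp_path="/path/to/stanford-corenlp-3.9.2", corenlp_port=9000)
            paragraphs = parser.parse_raw_file("raw/yelp.txt", processed_path="processed/yelp.jsonl")
            # processed/yelp.jsonl:
            #   {"sentence_lens": [...]}
            #   {"sid": "processed/yelp.jsonl|1", ...}
            #   {"sid": "processed/yelp.jsonl|2", ...}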
"""
if annotators is None:
annotators = self.annotators
paragraphs = []
try:
with open(raw_path, "r", encoding="utf-8", errors="ignore") as f:
paragraphs.append("")
for line in f:
                    # document-start markers (e.g., ".START" in PTB/WSJ files) and blank lines delimit paragraphs
                    if line.startswith(".START") or line == "\n":
if len(paragraphs[-1]) != 0:
paragraphs.append("")
else:
paragraphs[-1] += line
if len(paragraphs[-1]) == 0:
paragraphs.pop()
        except BaseException as e:
            # report the offending file before re-raising the error
            print(raw_path)
            print(e)
            raise e
        # a single over-long paragraph (e.g., a file without blank lines) is split back into lines
        if len(paragraphs) == 1 and len(paragraphs[0]) > max_len:
            paragraphs = paragraphs[0].split("\n")
        sid = 1
        para_lens = []
        for i in range(len(paragraphs)):
            paragraphs[i] = self.parse(paragraphs[i], annotators=annotators, max_len=max_len)
            para_lens.append(len(paragraphs[i]) + sid)
            if processed_path:
                # overwrite the per-paragraph sids assigned by parse() with file-level ones
                for sent in paragraphs[i]:
                    sent["sid"] = self.generate_sid(sent, processed_path, sid)
                    sid += 1
        if processed_path:
            # first line: paragraph offsets; then one JSON object per parsed sentence
            with open(processed_path, "w", encoding="utf-8") as f:
                f.write(json.dumps({"sentence_lens": para_lens}))
                f.write("\n")
                for para in paragraphs:
                    for sent in para:
                        f.write(json.dumps(sent))
                        f.write("\n")
        return paragraphs
    def parse(self, paragraph, annotators=None, max_len=MAX_LEN):
"""
:param paragraph: a raw text
:type paragraph: str
:param annotators: annotators for corenlp, please refer to https://stanfordnlp.github.io/CoreNLP/annotators.html
:type annotators: Union[List, None] (default = None)
:param max_len: the max length of a paragraph (constituency parsing cannot handle super-long sentences)
:type max_len: int (default = 1024)
:return: the parsed result
:rtype: List[Dict[str, object]]
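
        A usage sketch (the input text is a placeholder)::

            parser = SentenceParser(corenlp_path="/path/to/stanford-corenlp-3.9.2", corenlp_port=9000)
            sentences = parser.parse("I love this cafe. The coffee is great.")
            # each dict describes one sentence; since no file name is involved here,
            # the sids are "|1", "|2", ...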
"""
if annotators is None:
annotators = self.annotators
corenlp_client, _ = get_corenlp_client(corenlp_path=self.corenlp_path, corenlp_port=self.corenlp_port, annotators=annotators)
parsed_para = parse_sentense_with_stanford(paragraph, corenlp_client, annotators, max_len)
        for sent_idx, sent in enumerate(parsed_para):
            # no file name here, so the sids are simply "|1", "|2", ...
            sent["sid"] = self.generate_sid(sent, "", sent_idx + 1)
return parsed_para
if __name__ == "__main__":
raw_path = "/Users/sean/OneDrive - HKUST Connect/Documents/HKUST/Research/ASER/example_data/raw/yelp.txt"
processed_path = "/Users/sean/OneDrive - HKUST Connect/Documents/HKUST/Research/ASER/example_data/processed/yelp.jsonl"
parser = SentenceParser(corenlp_path="", corenlp_port=9000, annotators=list(ANNOTATORS))
start_st = time.time()
    parsed_para = parser.parse_raw_file(raw_path, processed_path)
end_st = time.time()
print("# Tokens: %d\tTime: %.4fs" % (sum([sum([len(sent["tokens"]) for sent in para]) for para in pared_para]), end_st-start_st))