Last active
October 27, 2018 03:15
-
-
Save zarzen/1a0d51be397779f9154069a1ca1ac501 to your computer and use it in GitHub Desktop.
sentence splitting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import spacy | |
import string | |
import re | |
from nltk.tokenize.stanford_segmenter import StanfordSegmenter | |
from nltk.tokenize import sent_tokenize | |
def remove_nonascii(document):
    """Strip every character not found in ``string.printable``.

    Args:
        document: input text, possibly containing non-ASCII characters.

    Returns:
        The input string with all non-printable characters removed.
    """
    keep = set(string.printable)
    return ''.join(ch for ch in document if ch in keep)
def _add_whitespace(text): | |
""" Add white space for sentence splitting | |
Some reviews has follow issue: | |
'This review is tailed with another.Another review here cannot be | |
separated by sent-tokenizer.' | |
DONE add white space for a!W a?W | |
Args: | |
text: string contains reviews | |
Returns: | |
String added white space for separation. | |
""" | |
reg = r'([a-z]*[a-z])([\.\!\?])([A-Za-z][a-z]*\ )' | |
# for debugging | |
# matched = re.findall(reg, text) | |
# print("matched: ", len(matched)) | |
# print(matched[0:20]) | |
return re.sub(reg, r'\1\2 \3', text) | |
def sent_split(text):
    """Split raw text into a flat list of cleaned sentences.

    Pipeline:
        1. remove non-ASCII characters
        2. split into paragraphs on '\n'
        3. add whitespace at glued sentence boundaries
        4. replace tabs with spaces and run sent_tokenize
        5. strip each sentence and drop empty strings
    """
    cleaned = remove_nonascii(text)
    return [
        stripped
        for paragraph in cleaned.split('\n')
        for stripped in (
            s.strip()
            for s in sent_tokenize(_add_whitespace(paragraph).replace('\t', ' '))
        )
        if stripped
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment