@zarzen
Last active October 27, 2018 03:15
sentence splitting
import string
import re
from nltk.tokenize import sent_tokenize
# Requires the NLTK 'punkt' tokenizer data: nltk.download('punkt')
def remove_nonascii(document):
    """Strip all characters outside string.printable from a document."""
    printable = set(string.printable)
    return ''.join(filter(lambda x: x in printable, document))
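# Illustrative example (not in the original gist): accented characters
# fall outside string.printable and are simply dropped.
#   remove_nonascii('caf\xe9 r\xe9sum\xe9')  ->  'caf rsum'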
def _add_whitespace(text):
    """Add whitespace so the sentence tokenizer can split correctly.

    Some reviews have the following issue:
    'This review is tailed with another.Another review here cannot be
    separated by sent-tokenizer.'
    Handles '.', '!', and '?' glued directly to the next capitalized word.

    Args:
        text: string containing reviews.

    Returns:
        String with whitespace added after sentence-ending punctuation.
    """
    reg = r'([a-z]*[a-z])([\.\!\?])([A-Za-z][a-z]*\ )'
    # for debugging:
    # matched = re.findall(reg, text)
    # print("matched:", len(matched))
    # print(matched[0:20])
    return re.sub(reg, r'\1\2 \3', text)
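# Illustrative example (not in the original gist): the regex inserts a
# space between sentence-ending punctuation and a glued capitalized word:
#   _add_whitespace('tailed with another.Another review')
#   -> 'tailed with another. Another review'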
def sent_split(text):
    """Split raw review text into sentences.

    Pipeline:
      1. remove non-ASCII characters
      2. split on newlines into paragraphs
      3. add whitespace at glued sentence boundaries
      4. replace tabs with spaces
      5. run sent_tokenize on each paragraph
      6. strip each sentence and drop empty strings
    """
    text = remove_nonascii(text)
    sentences = []
    for paragraph in text.split('\n'):
        p = _add_whitespace(paragraph)
        p = p.replace('\t', ' ')
        for s in sent_tokenize(p):
            s = s.strip()
            if s:
                sentences.append(s)
    return sentences
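# Quick usage sketch (an addition, not part of the original gist). Assumes
# the NLTK 'punkt' tokenizer data is installed, e.g. via nltk.download('punkt').
if __name__ == '__main__':
    sample = ('Great product!I use it daily.\n'
              'Works as advertised.\tWould buy again.')
    for sentence in sent_split(sample):
        print(sentence)
    # Expected output, one sentence per line:
    # Great product!
    # I use it daily.
    # Works as advertised.
    # Would buy again.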