Last active
October 27, 2018 03:15
-
-
Save zarzen/1a0d51be397779f9154069a1ca1ac501 to your computer and use it in GitHub Desktop.
sentence splitting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import spacy | |
import string | |
import re | |
from nltk.tokenize.stanford_segmenter import StanfordSegmenter | |
from nltk.tokenize import sent_tokenize | |
def remove_nonascii(document):
    """Strip every character not found in ``string.printable``.

    Args:
        document: input text, possibly containing non-ASCII characters.

    Returns:
        The input string with all non-printable characters removed.
    """
    keep = set(string.printable)
    return ''.join(ch for ch in document if ch in keep)
def _add_whitespace(text): | |
""" Add white space for sentence splitting | |
Some reviews has follow issue: | |
'This review is tailed with another.Another review here cannot be | |
separated by sent-tokenizer.' | |
DONE add white space for a!W a?W | |
Args: | |
text: string contains reviews | |
Returns: | |
String added white space for separation. | |
""" | |
reg = r'([a-z]*[a-z])([\.\!\?])([A-Za-z][a-z]*\ )' | |
# for debugging | |
# matched = re.findall(reg, text) | |
# print("matched: ", len(matched)) | |
# print(matched[0:20]) | |
return re.sub(reg, r'\1\2 \3', text) | |
def sent_split(text):
    """Split raw text into a flat list of cleaned sentences.

    Pipeline:
        1. remove non-ASCII characters
        2. split into paragraphs on '\n'
        3. add whitespace at glued sentence boundaries
        4. replace tabs with spaces and run sent_tokenize
        5. strip each sentence and drop empty strings
    """
    cleaned = remove_nonascii(text)
    return [
        stripped
        for paragraph in cleaned.split('\n')
        for stripped in (
            s.strip()
            for s in sent_tokenize(_add_whitespace(paragraph).replace('\t', ' '))
        )
        if stripped
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment