# Madhivarman/text_segment.py

## text_segment.py
"""Sentence segmentation means splitting a given paragraph of text into sentences by identifying the sentence boundaries.
In many cases, a full stop is all that is required to identify the end of a sentence, but the task is not always that simple.
This is an open-ended challenge to which there are no perfect solutions. Try to break up the given paragraphs of text into
individual sentences. Even if you don't manage to segment the text perfectly, the more sentences you identify and display
correctly, the more you will score."""


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

class TextSegmentation():
	"""Split POS-tagged tokens into sentences and print the result."""

	def segmentation(self,text,pos_tagging):
		"""Group tagged tokens into sentences.

		A token whose tag is '.' closes the current sentence. Sentences are
		built directly from the tokens instead of joining everything and
		splitting on a placeholder word, so input that itself contains the
		placeholder is not split by mistake.

		Args:
			text: The raw input text. Unused; kept so existing callers keep
				working.
			pos_tagging: Iterable of (word, tag) pairs.

		Returns:
			A list of sentence strings. Words are joined with single spaces
			and the sentence-final token is appended directly to the last
			word. Trailing words with no sentence-final token form a last
			sentence. Empty input yields an empty list.
		"""
		words = []
		sentences = []
		current = []

		for word, tag in pos_tagging:
			words.append(word)
			if tag != '.':
				current.append(word)
			elif current:
				# Sentence-final token: close the sentence, keeping the token.
				# NOTE(review): with NLTK's default tagset the '.' tag
				# presumably also covers '?' and '!' -- confirm.
				sentences.append(" ".join(current) + word)
				current = []
			elif sentences:
				# Repeated terminator (e.g. "!!"): attach to the previous sentence.
				sentences[-1] += word
			else:
				# Terminator before any word: keep it rather than lose input.
				sentences.append(word)

		if current:
			# Text ended without a sentence-final token.
			sentences.append(" ".join(current))

		print("Number of sentences:{}".format(len(sentences)))
		print("Complete Sentence:{}".format(words))
		print("Splitted Sentence is:{}\n".format(sentences))

		return sentences

	def printoutput(self,result):
		"""Print every entry of ``result`` on its own line, numbered from 1."""
		print("Final result is:")
		print("-------------------------------------------------------------")
		for count, sent in enumerate(result, 1):
			# Format explicitly so Python 2 does not print a tuple.
			print("{} {}".format(count, sent))

def main(text):
	"""Tokenize ``text`` into words and return the result of POS-tagging them.

	The text is passed through ``word_tokenize`` and the resulting tokens
	are handed to ``nltk.pos_tag``; whatever that call returns is returned
	unchanged.
	"""
	return nltk.pos_tag(word_tokenize(text))

def isconditionistrue(user_text,pos,max_chars=10000,max_tokens=1000):
	"""Check that the input stays within the size limits.

	Args:
		user_text: The raw input text; its length in characters is checked.
		pos: The tagged tokens; the number of entries is checked.
		max_chars: Largest accepted ``len(user_text)``.
		max_tokens: Largest accepted ``len(pos)``.

	Returns:
		The string "1" when both limits are met, otherwise "0". Strings
		(not booleans) are kept because the caller compares against '1'.
	"""
	# Measure the whole text: the previous code measured only its last
	# character, failed on empty text, and returned None for too many tokens.
	within_limits = len(user_text) <= max_chars and len(pos) <= max_tokens
	return "1" if within_limits else "0"

if __name__ == '__main__':

	# raw_input exists only on Python 2; fall back to input on Python 3.
	try:
		read_line = raw_input
	except NameError:
		read_line = input

	text = read_line("Enter your text here:\n")
	# POS-tag the text; the tags are used to find the sentence ends.
	pos_tagging = main(text)
	# Only segment input that stays within the size limits.
	if isconditionistrue(text,pos_tagging) == '1':
		obj = TextSegmentation()
		split = obj.segmentation(text,pos_tagging)
		obj.printoutput(split)
	else:
		print("Certain constraints failed")
	"""Sentence segmentation, means, to split a given paragraph of text into sentences, by identifying the sentence boundaries.
	In many cases, a full stop is all that is required to identify the end of a sentence, but the task is not all that simple.
	This is an open-ended challenge to which there are no perfect solutions. Try to break up the given paragraphs of text into
	individual sentences. Even if you don't manage to segment the text perfectly, the more sentences you identify and display
	correctly, the more you will score."""


	import nltk
	from nltk.corpus import stopwords
	from nltk.tokenize import word_tokenize

	class TextSegmentation():
		"""Split POS-tagged tokens into sentences and print the result."""

		def segmentation(self,text,pos_tagging):
			"""Group tagged tokens into sentences.

			A token whose tag is '.' closes the current sentence. Sentences
			are built directly from the tokens instead of joining everything
			and splitting on a placeholder word, so input that itself
			contains the placeholder is not split by mistake.

			Args:
				text: The raw input text. Unused; kept so existing callers
					keep working.
				pos_tagging: Iterable of (word, tag) pairs.

			Returns:
				A list of sentence strings. Words are joined with single
				spaces and the sentence-final token is appended directly to
				the last word. Trailing words with no sentence-final token
				form a last sentence. Empty input yields an empty list.
			"""
			words = []
			sentences = []
			current = []

			for word, tag in pos_tagging:
				words.append(word)
				if tag != '.':
					current.append(word)
				elif current:
					# Sentence-final token: close the sentence, keeping the token.
					# NOTE(review): with NLTK's default tagset the '.' tag
					# presumably also covers '?' and '!' -- confirm.
					sentences.append(" ".join(current) + word)
					current = []
				elif sentences:
					# Repeated terminator (e.g. "!!"): attach to the previous sentence.
					sentences[-1] += word
				else:
					# Terminator before any word: keep it rather than lose input.
					sentences.append(word)

			if current:
				# Text ended without a sentence-final token.
				sentences.append(" ".join(current))

			print("Number of sentences:{}".format(len(sentences)))
			print("Complete Sentence:{}".format(words))
			print("Splitted Sentence is:{}\n".format(sentences))

			return sentences

		def printoutput(self,result):
			"""Print every entry of ``result`` on its own line, numbered from 1."""
			print("Final result is:")
			print("-------------------------------------------------------------")
			for count, sent in enumerate(result, 1):
				# Format explicitly so Python 2 does not print a tuple.
				print("{} {}".format(count, sent))

	def main(text):
		"""Tokenize ``text`` into words and return the result of POS-tagging them.

		The text is passed through ``word_tokenize`` and the resulting
		tokens are handed to ``nltk.pos_tag``; whatever that call returns is
		returned unchanged.
		"""
		return nltk.pos_tag(word_tokenize(text))

	def isconditionistrue(user_text,pos,max_chars=10000,max_tokens=1000):
		"""Check that the input stays within the size limits.

		Args:
			user_text: The raw input text; its length in characters is checked.
			pos: The tagged tokens; the number of entries is checked.
			max_chars: Largest accepted ``len(user_text)``.
			max_tokens: Largest accepted ``len(pos)``.

		Returns:
			The string "1" when both limits are met, otherwise "0". Strings
			(not booleans) are kept because the caller compares against '1'.
		"""
		# Measure the whole text: the previous code measured only its last
		# character, failed on empty text, and returned None for too many tokens.
		within_limits = len(user_text) <= max_chars and len(pos) <= max_tokens
		return "1" if within_limits else "0"

	if __name__ == '__main__':

		# raw_input exists only on Python 2; fall back to input on Python 3.
		try:
			read_line = raw_input
		except NameError:
			read_line = input

		text = read_line("Enter your text here:\n")
		# POS-tag the text; the tags are used to find the sentence ends.
		pos_tagging = main(text)
		# Only segment input that stays within the size limits.
		if isconditionistrue(text,pos_tagging) == '1':
			obj = TextSegmentation()
			split = obj.segmentation(text,pos_tagging)
			obj.printoutput(split)
		else:
			print("Certain constraints failed")