inishchith/HIstop_words.py

## HIstop_words.py
"""
StopList for classical_hindi corpora at CLTK.
"""

import re,os
import string
from cltk.tokenize.indian_tokenizer import indian_punctuation_tokenize_regex as i_word
from nltk.probability import FreqDist
import nltk

path = "./hindi_corpora"        # every .txt file from classical_hindi_corpora is first moved to ./hindi_corpora dir.

# Clean-up carried over unchanged from the original script.
if os.path.isfile("./stop_words.txt"):
    os.remove("./stop_words.txt")
# Earlier revisions staged tokens in a scratch file inside the corpus
# directory.  A copy left behind by an interrupted run must be removed before
# the directory is listed, otherwise it would be tokenized as corpus text.
if os.path.isfile(path+"/hindi_corpora.txt"):
    os.remove(path+"/hindi_corpora.txt")

# Characters removed from the raw text before tokenizing.  A raw string is
# used so the backslash is a literal character, not an invalid escape.
punctuation = r"""'!()-[]{};:"\,<>./?@#$%^&*_~"""
extra_punctuation = '||'
# One translation table built up front: deleting characters with
# str.translate is a single pass instead of per-character concatenation.
strip_table = str.maketrans('', '', punctuation + extra_punctuation)

total_words = 0
# Frequencies are accumulated in memory, file by file.  The original wrote
# tokens to a scratch file and read it back while the last write handle was
# still open (unflushed), and joined files without a separator so the last
# token of one file merged with the first token of the next.
fdist = FreqDist()

# sorted() makes the processing order (and therefore tie-breaking between
# equally frequent words) reproducible across runs and platforms.
for file_name in sorted(os.listdir(path)):
    full_path = os.path.join(path, file_name)
    if not os.path.isfile(full_path):
        # Sub-directories cannot be opened as text; skip them.
        continue
    # Undecodable bytes are dropped rather than aborting the whole run.
    with open(full_path, encoding="utf-8", errors='ignore') as corpus_file:
        file_content = corpus_file.read()
    print(full_path)
    texts = file_content.translate(strip_table)
    words = list(i_word(texts))
    total_words += len(words)
    fdist.update(words)

print("TOTAL WORDS : ",total_words)

# most_common() already yields distinct words ordered by descending
# frequency, so no de-duplication is needed and the ranking is preserved.
common_words = fdist.most_common(200)
commonWord_list = [word for word, _count in common_words]

# The list goes to ./stops_words.txt, the file this script has always
# written.  It is opened in 'w' mode with an explicit UTF-8 encoding: the
# original appended, and since the clean-up above targets a different file
# name, every rerun added another copy of the list.
with open('./stops_words.txt', 'w', encoding="utf-8") as out_file:
    out_file.write('\n'.join(commonWord_list))
print(commonWord_list)

"""
Note: The datasets in hindi_corpora were not properly cleaned, so some redundant/misclassified words had to be removed manually.
This manual step could be avoided by improving the punctuation regular expression.
"""
	"""
	StopList for classical_hindi corpora at CLTK.
	"""

	import re,os
	import string
	from cltk.tokenize.indian_tokenizer import indian_punctuation_tokenize_regex as i_word
	from nltk.probability import FreqDist
	import nltk

	path = "./hindi_corpora" # every .txt file from classical_hindi_corpora is first moved to ./hindi_corpora dir.

	# Clean-up carried over unchanged from the original script.
	if os.path.isfile("./stop_words.txt"):
		os.remove("./stop_words.txt")
	# Earlier revisions staged tokens in a scratch file inside the corpus
	# directory.  A copy left behind by an interrupted run must be removed before
	# the directory is listed, otherwise it would be tokenized as corpus text.
	if os.path.isfile(path+"/hindi_corpora.txt"):
		os.remove(path+"/hindi_corpora.txt")

	# Characters removed from the raw text before tokenizing.  A raw string is
	# used so the backslash is a literal character, not an invalid escape.
	punctuation = r"""'!()-[]{};:"\,<>./?@#$%^&*_~"""
	# The backslash is already covered by `punctuation`, so only the pipe is
	# needed here.
	extra_punctuation = '||'
	# One translation table built up front: deleting characters with
	# str.translate is a single pass instead of per-character concatenation.
	strip_table = str.maketrans('', '', punctuation + extra_punctuation)

	total_words = 0
	# Frequencies are accumulated in memory, file by file.  The original wrote
	# tokens to a scratch file and read it back while the last write handle was
	# still open (unflushed), and joined files without a separator so the last
	# token of one file merged with the first token of the next.
	fdist = FreqDist()

	# sorted() makes the processing order (and therefore tie-breaking between
	# equally frequent words) reproducible across runs and platforms.
	for file_name in sorted(os.listdir(path)):
		full_path = os.path.join(path, file_name)
		if not os.path.isfile(full_path):
			# Sub-directories cannot be opened as text; skip them.
			continue
		# Undecodable bytes are dropped rather than aborting the whole run.
		with open(full_path, encoding="utf-8", errors='ignore') as corpus_file:
			file_content = corpus_file.read()
		print(full_path)
		texts = file_content.translate(strip_table)
		words = list(i_word(texts))
		total_words += len(words)
		fdist.update(words)

	print("TOTAL WORDS : ",total_words)

	# most_common() already yields distinct words ordered by descending
	# frequency, so no de-duplication is needed and the ranking is preserved.
	common_words = fdist.most_common(200)
	commonWord_list = [word for word, _count in common_words]

	# The list goes to ./stops_words.txt, the file this script has always
	# written.  It is opened in 'w' mode with an explicit UTF-8 encoding: the
	# original appended, and since the clean-up above targets a different file
	# name, every rerun added another copy of the list.
	with open('./stops_words.txt', 'w', encoding="utf-8") as out_file:
		out_file.write('\n'.join(commonWord_list))
	print(commonWord_list)

	"""
	Note: The datasets in hindi_corpora were not properly cleaned, so some redundant/misclassified words had to be removed manually.
	This manual step could be avoided by improving the punctuation regular expression.
	"""