Skip to content

Instantly share code, notes, and snippets.

@inishchith
Created January 31, 2018 15:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save inishchith/ad4bc0da200110de638f5408c64bb14c to your computer and use it in GitHub Desktop.
Save inishchith/ad4bc0da200110de638f5408c64bb14c to your computer and use it in GitHub Desktop.
This list is composed of the 100 most frequently occurring words in the classical_hindi corpus <https://github.com/cltk/hindi_text_ltrc> in CLTK.
"""
StopList for classical_hindi corpora at CLTK.
"""
import re,os
import string
from cltk.tokenize.indian_tokenizer import indian_punctuation_tokenize_regex as i_word
from nltk.probability import FreqDist
import nltk
# Build a stop-word list from the most frequent tokens of the classical
# Hindi corpus.
#
# Input : every regular file found in ./hindi_corpora (each .txt file of the
#         classical_hindi corpus has to be moved there beforehand).
# Output: ./stops_words.txt, one word per line, most frequent first; the
#         same list is printed at the end.
path = "./hindi_corpora"  # every .txt file from classical_hindi_corpora is first moved to ./hindi_corpora dir.
scratch_path = os.path.join(path, "hindi_corpora.txt")
# The original cleanup looked for "./stop_words.txt" while the list was
# appended to "./stops_words.txt", so old output was never cleared. The name
# that was actually written is kept, and the file is opened in "w" mode below
# so every run starts from an empty file.
stop_words_path = "./stops_words.txt"

# Earlier versions of this script kept a scratch file inside the corpus
# directory. If a crashed run left one behind, remove it so it is not
# ingested as corpus text by the loop below.
if os.path.isfile(scratch_path):
    os.remove(scratch_path)

# Same character set as before, spelled with explicit escapes so the
# backslash is no longer an invalid "\," escape sequence.
punctuation = "!()-[]{};:'\"\\,<>./?@#$%^&*_~"
# NOTE(review): these are ASCII pipes; presumably they stand in for the
# Devanagari danda / double danda -- confirm against the corpus files.
extra_punctuation = '||'
# One translation table built once; str.translate removes every listed
# character in a single pass instead of per-character concatenation.
strip_table = str.maketrans("", "", punctuation + extra_punctuation)

total_words = 0
fdist = FreqDist()
# Sorted so the processing order (and frequency tie-breaking) is the same on
# every run; os.listdir() makes no ordering promise.
for file_name in sorted(os.listdir(path)):
    full_path = os.path.join(path, file_name)
    if not os.path.isfile(full_path):
        # Sub-directories cannot be opened as text files.
        continue
    print(full_path)
    with open(full_path, encoding="utf-8", errors="ignore") as corpus_file:
        texts = corpus_file.read().translate(strip_table)
    words = list(i_word(texts))
    total_words += len(words)
    # The original wrote the tokens to a scratch file and read them back with
    # splitlines(), which also broke up any token containing a line break.
    # splitlines() keeps that behaviour here without the round-trip. Counting
    # per file also stops the last word of one file from fusing with the
    # first word of the next, and empty lines are no longer counted as words.
    fdist.update(
        word for word in "\n".join(words).splitlines() if word
    )

print("TOTAL WORDS : ", total_words)

common_words = fdist.most_common(200)
# most_common() already yields each word once, so no de-duplication is
# needed and the frequency order is preserved in the output.
commonWord_list = [word for word, _count in common_words]
with open(stop_words_path, "w", encoding="utf-8") as stop_words_file:
    stop_words_file.write("\n".join(commonWord_list))
print(commonWord_list)
"""
Note: The datasets in hindi_corpora were not properly cleaned, hence some redundant/misclassified words had to be removed manually.
This can be avoided, as there is scope for improvement in the punctuation regular expression.
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment