Skip to content

Instantly share code, notes, and snippets.

View techykajal's full-sized avatar
🎯
Focusing

kajal yadav techykajal

🎯
Focusing
View GitHub Profile
def Pos_tagging(text):
    """
    Tag the part of speech of every token in the corpus using NLTK.

    arguments:
        text: iterable of tokenized articles (each a list of word tokens).
    return:
        list with one list of (token, POS-tag) tuples per article.
    """
    tagged_articles = []
    for article in text:
        # nltk.pos_tag expects the token list of a single article
        tagged_articles.append(nltk.pos_tag(article))
    # FIX: the original built tagged_articles but never returned it,
    # so every caller received None.
    return tagged_articles
# Pre-processing for Content:
# clean every article in 'Content_nGrams' and store the results
# in a new 'Updated_content' column.
List_Content = DF['Content_nGrams'].to_list()
# Cleaned text of the Content attribute after pre-processing.
Final_Article = [text_preprocessing(article) for article in List_Content]
# FIX: the original called Complete_Content.extend(Final_Article) INSIDE
# the loop, re-appending the whole growing list on every iteration
# (quadratic duplication, and a length mismatch with DF for >1 article).
# Copy the finished list exactly once instead.
Complete_Content = list(Final_Article)
DF['Updated_content'] = Complete_Content
# Writing main function to merge all the preprocessing steps.
# Writing main function to merge all the preprocessing steps.
# NOTE(review): only the head of this function is visible in this chunk;
# the flag parameters (punctuations, token, stop_words, apostrophe, verbs)
# are presumably consulted further down in the body — confirm against the
# full source.
def text_preprocessing(text, punctuations=True, token = True,
stop_words=True, apostrophe=False, verbs=False):
"""
This function will preprocess input text and return
the clean text.
"""
# Load the English stopword list from NLTK ...
stoplist = stopwords.words('english')
# ... and convert it to a set for O(1) membership tests.
stoplist = set(stoplist)
def tokenize_text(Updated_content):
    """
    Split a cleaned article string into a list of word tokens.

    NOTE(review): contrary to the original docstring, this function does
    NOT remove stopwords or punctuation — it only tokenizes. Those removal
    steps live in separate helpers.

    arguments:
        Updated_content: article text of type "String".
    return:
        list of word tokens.
    """
    # list(...) replaces the original no-op identity comprehension
    return list(word_tokenize(Updated_content))
def removing_special_characters(text):
"""Removing all the special characters except the one that is passed within
the regex to match, as they have imp meaning in the text provided.
arguments:
input_text: "text" of type "String".
return:
value: Text with removed special characters that don't require.
def removing_stopwords(text):
"""This function will remove stopwords which doesn't add much meaning to a sentence
& they can be remove safely without comprimising meaning of the sentence.
arguments:
input_text: "text" of type "String".
return:
value: Text after omitted all stopwords.
def add_ngrams_to_input(Processed_content, Mapping):
    """
    Replace each original n-gram occurrence in the text with its
    underscore-combined form (e.g. "new york" -> "new_york").

    arguments:
        Processed_content: list of article strings; modified IN PLACE.
        Mapping: dict mapping original n-gram -> combined n-gram.
    return:
        the same list object, with all replacements applied.
    """
    for i, article in enumerate(Processed_content):
        for original, combined in Mapping.items():
            article = article.replace(original, combined)
        # write back once per article instead of once per mapping entry
        Processed_content[i] = article
    return Processed_content
# Apply the n-gram mapping to the cleaned articles (mutates the list in place).
content_nGrams = add_ngrams_to_input(Processed_Content,Mapping)
def mapping(n_grams_to_use, Combined_nGrams):
    """
    Map each original n-gram to its underscore-combined form.

    arguments:
        n_grams_to_use: list of original n-gram strings.
        Combined_nGrams: list of combined n-grams, parallel to
            n_grams_to_use.
    return:
        dict of original n-gram -> combined n-gram.
    """
    # zip pairs the two parallel lists directly; unlike the original
    # index loop it cannot raise IndexError if the lists fall out of sync
    return dict(zip(n_grams_to_use, Combined_nGrams))
# Build the original-ngram -> combined-ngram lookup table.
Mapping = mapping(n_grams_to_use, Combined_nGrams)
# Bare expression: notebook-style echo to display the dict in the output cell.
Mapping
# Combine each n_Gram using '_'
def combined_n_Grams(n_grams_to_use):
    """
    Join the words of each n-gram with underscores.

    arguments:
        n_grams_to_use: list of n-gram strings (words separated by spaces).
    return:
        list of the same n-grams with every space replaced by '_'.
    """
    # comprehension replaces the original index loop + append
    return [gram.replace(' ', '_') for gram in n_grams_to_use]
# Underscore-join every selected n-gram (e.g. "new york" -> "new_york").
Combined_nGrams = combined_n_Grams(n_grams_to_use)
def read_nGrams():
"""
This function will read bigrams & trigrams and
return list of n_Grams.
"""
# read bigrams
original_bigram = readFile("bigram.txt")
# read trigrams
original_trigram = readFile("trigram.txt")