This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Per-word topic information for the fifth article (index 4).
# doc_lda is assigned from lda_model[...] elsewhere in this file; with
# per_word_topics=True gensim returns a triple whose second and third items
# hold one entry per bag-of-words term (word topics and phi values).
fifth_doc_bow = data_corpus[4]
# NOTE(review): the label says "corpus", but the count printed is the number
# of distinct terms in the fifth article's bag-of-words — confirm wording.
print("Total words in the corpus: ", len(fifth_doc_bow))
# Walk the two per-word lists in lockstep instead of indexing by position.
for i, (word_topics, word_phis) in enumerate(zip(doc_lda[1], doc_lda[2])):
    print("\t ", i, ": ", word_topics, word_phis)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Topic distribution for fifth article
ARTICLE_INDEX = 4  # zero-based row / document position of the fifth article

print("Category of Article 5: ", df.loc[ARTICLE_INDEX, "Category"])
print("Article 5: ", df.loc[ARTICLE_INDEX, "Text"])

# Transform only the document of interest rather than wrapping the whole
# corpus and indexing into the result; the model is applied to the same
# bag-of-words either way.
doc_lda = lda_model[data_corpus[ARTICLE_INDEX]]

# The first element of the result is the document-level topic distribution.
print("\nTopic Distribution in the fifth article: ", doc_lda[0])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pretty-print the topics reported by the trained LDA model.
pprint(lda_model.print_topics())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Construct LDA model
# per_word_topics=True makes the model also return per-word topic info when
# a document is transformed; later cells index into that extra output.
# random_state pins the seed so repeated runs produce the same topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=data_corpus,
    id2word=data_dict,
    num_topics=5,  # presumably one topic per news category — TODO confirm
    random_state=42,
    chunksize=100,
    alpha='auto',
    per_word_topics=True,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Better readable representation of corpus
# Replace each token id in the first document's bag-of-words with the token
# text looked up in data_dict, keeping the frequency alongside it.
data_corpus_word = [
    [(data_dict[token_id], freq) for token_id, freq in doc_bow]
    for doc_bow in data_corpus[:1]
]
print("Corpus: \n", data_corpus_word[:1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create Dictionary
# Built from the lemmatized token lists (data_tokens_lem, defined elsewhere).
data_dict = corpora.Dictionary(data_tokens_lem)
print("Dictionary: ", data_dict)

# Create Corpus
# One bag-of-words (doc2bow output) per lemmatized document.
data_corpus = [data_dict.doc2bow(tokens) for tokens in data_tokens_lem]
print("Corpus: \n", data_corpus[:1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document is joined into one space-separated string, passed through
    the module-level ``nlp`` callable, and reduced to the ``lemma_`` of every
    token whose ``pos_`` is in ``allowed_postags``.

    Args:
        texts: Iterable of documents, each an iterable of token strings.
        allowed_postags: Part-of-speech tags to keep. The default is an
            immutable tuple so it cannot be mutated across calls.

    Returns:
        A list containing one list of lemmas per input document, in order.
    """
    # NOTE(review): `nlp` is defined elsewhere in the file; presumably a
    # spaCy pipeline, given the `lemma_` / `pos_` attributes — confirm.
    return [
        [
            token.lemma_
            for token in nlp(" ".join(sent))
            if token.pos_ in allowed_postags
        ]
        for sent in texts
    ]
# Eliminate Stop Words
# Start from the English stop-word list provided by `stopwords` (imported
# elsewhere; presumably nltk.corpus.stopwords — confirm), then add extra
# high-frequency words that should also be filtered out.
stop_words = stopwords.words('english')
stop_words.extend([
    'say', 'may', 'also', 'get', 'go', 'know', 'need', 'like', 'make',
    'see', 'want', 'come', 'take', 'use', 'would', 'tell', 'could',
    'include', 'can', 'bbc', 'mr', 'mrs',
])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Function to clean the text and remove punctuations | |
def normalized_text(text, stem_words=True): | |
if pd.isnull(text): | |
return '' | |
if type(text) != str or text=='': | |
return '' | |
text = re.sub("\s+", " ", text) | |
text = re.sub("\'s", " ", text) | |
text = re.sub("\'ve", " have ", text) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the news dataset and drop rows with missing values.
DATA_PATH = '/content/BBC News Data.csv'

df = pd.read_csv(DATA_PATH)
# reset_index(drop=True) renumbers rows 0..n-1 after dropna() removes rows,
# so later label lookups such as df["Text"][4] refer to the fifth kept row.
df = df.dropna().reset_index(drop=True)

print("Shape: ", df.shape)
print("Unique Categories: ", df["Category"].unique())
print(df.head())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
import numpy as np | |
from string import punctuation | |
from pprint import pprint | |
import gensim | |
from gensim import corpora | |
from gensim.models import Phrases | |
from gensim.models.phrases import Phraser |