Calculate TF-IDF
import nltk, re, pprint
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # needed for the accuracy plot at the end
import heapq
import string
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from eval_conf_mat import *  # local helper module: evalsplit_conf, plot_dict

nltk.download('stopwords')
nltk.download('punkt')

data_path = '/content/drive/My Drive/IR-CA3/'
# Translation table that deletes every punctuation character
table = str.maketrans('', '', string.punctuation)
wnl = nltk.WordNetLemmatizer()  # instantiated but unused; stemming is used below instead
stop_words = stopwords.words('english')
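
# Quick illustration (not part of the original gist) of what `table` does:
#   "don't, stop!".translate(table)  ->  "dont stop"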
# Preprocess Data
def prepare_data(df, table, stop_words):
    """Tokenize, clean, and stem every news text; also count global stem frequencies."""
    data = []
    porter = nltk.PorterStemmer()
    wordfreq = {}
    for i in range(len(df)):
        text = df.loc[i, 'text']
        tokens = word_tokenize(text)
        # Strip punctuation and keep alphabetic tokens only
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        # Lowercase before the stop-word check so capitalized stop words ("The") are caught
        words = [w.lower() for w in words]
        words = [w for w in words if w not in stop_words]
        stems = [porter.stem(t) for t in words]
        for word_stem in stems:
            if word_stem not in wordfreq:
                wordfreq[word_stem] = 1
            else:
                wordfreq[word_stem] += 1
        data.append(stems)
    print("News data was prepared")
    return data, wordfreq
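
# A minimal usage sketch on hypothetical data (not part of the original run);
# note the stemming: "Dogs" -> "dog", "running" -> "run".
#   demo_df = pd.DataFrame({'text': ["Dogs are running fast", "A dog ran home"]})
#   demo_data, demo_freq = prepare_data(demo_df, table, stop_words)
#   # demo_data -> [['dog', 'run', 'fast'], ['dog', 'ran', 'home']]
#   # demo_freq -> {'dog': 2, 'run': 1, 'fast': 1, 'ran': 1, 'home': 1}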
# Calculate TF-IDF
def tfidf_model(data, most_freq):
    """Build an (n_docs, n_tokens) TF-IDF matrix over the selected tokens."""
    # IDF: log(N / (1 + document frequency)); the +1 avoids division by zero
    word_idf_values = {}
    for token in most_freq:
        doc_containing_word = 0
        for document in data:
            if token in document:
                doc_containing_word += 1
        word_idf_values[token] = np.log(len(data) / (1 + doc_containing_word))
    # TF with add-one smoothing, normalized by document length plus vocabulary size
    word_tf_values = {}
    for token in most_freq:
        doc_tf_vector = []
        for document in data:
            doc_freq = 1
            for word in document:
                if token == word:
                    doc_freq += 1
            word_tf = doc_freq / (len(document) + len(most_freq))
            doc_tf_vector.append(word_tf)
        word_tf_values[token] = doc_tf_vector
    # TF-IDF = TF * IDF, then transpose to one row per document
    tfidf_values = []
    for token in word_tf_values.keys():
        tfidf_docs = []
        for tf_doc in word_tf_values[token]:
            tf_idf_score = tf_doc * word_idf_values[token]
            tfidf_docs.append(tf_idf_score)
        tfidf_values.append(tfidf_docs)
    tf_idf_model = np.asarray(tfidf_values)
    tf_idf_model = np.transpose(tf_idf_model)
    return tf_idf_model
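
# The function above implements smoothed variants of the standard formulas:
#   idf(t)  = log(N / (1 + df(t)))                 df(t) = number of docs containing t
#   tf(t,d) = (count(t,d) + 1) / (len(d) + |V|)    add-one smoothing, |V| = len(most_freq)
# Toy sanity check (hypothetical, not part of the original run):
#   toy = [['apple', 'banana'], ['apple'], ['cherry']]
#   m = tfidf_model(toy, ['apple', 'banana'])
#   # m.shape -> (3, 2): one row per document, one column per selected token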
news_file = data_path + 'news.csv'
print("loading news dataset....")
news = pd.read_csv(news_file)
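# The CSV is assumed (not stated in the gist) to provide 'text' and 'category'
# columns; both are referenced below.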
# print(news.keys())
print("Number of records:",len(news))
print("Preparing data...")
data, wordfreq = prepare_data(news, table, stop_words)
print("Training ...")
acc_list = {}
for features_num in [100, 200, 400, 800, 1600, 3200]:
    print("Feature Nums:", features_num)
    # Use the features_num most frequent stems as the vocabulary
    most_freq = heapq.nlargest(features_num, wordfreq, key=wordfreq.get)
    X = tfidf_model(data, most_freq)
    # Clean the category labels down to letters only
    y = [re.sub('[^A-Za-z]', ' ', str(label)).strip(' ') for label in news.loc[:, 'category']]
    classifier = GaussianNB()
    classes = ["sport", "tech", "business", "entertainment", "politics"]
    acc, cr = evalsplit_conf(classifier, X, y, 0.2, True, classes)
    print("accuracy:", acc.round(4))
    acc_list[features_num] = acc.round(4)
    print(cr)
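
# For comparison (a sketch, not part of the original experiment): scikit-learn's
# TfidfVectorizer builds a similar matrix directly from the raw text, though with a
# slightly different idf, log((1 + N) / (1 + df)) + 1 when smooth_idf=True (the default):
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   vectorizer = TfidfVectorizer(max_features=3200, stop_words='english')
#   X_sk = vectorizer.fit_transform(news['text']).toarray()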
# Plot accuracy versus number of features
fig, ax = plt.subplots(figsize=(10, 10))
plot_dict(acc_list, plt)
print(acc_list)