Calculate TF-IDF
import re
import string
import heapq

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB

from eval_conf_mat import *  # local helper module (not in this gist): provides evalsplit_conf and plot_dict

nltk.download('stopwords')
nltk.download('punkt')

data_path = '/content/drive/My Drive/IR-CA3/'
table = str.maketrans('', '', string.punctuation)  # translation table that strips punctuation
stop_words = stopwords.words('english')
# Preprocess data: tokenise, strip punctuation, drop stopwords, stem,
# and count stem frequencies across the whole corpus.
def prepare_data(df, table, stop_words):
    data = []
    porter = nltk.PorterStemmer()
    wordfreq = {}
    for i in range(len(df)):
        text = df.loc[i, 'text']
        tokens = word_tokenize(text)
        stripped = [w.translate(table) for w in tokens]
        words = [word.lower() for word in stripped if word.isalpha()]
        # Lowercase before the stopword test: NLTK's stopword list is all
        # lowercase, so filtering unlowered tokens would miss e.g. "The".
        words = [w for w in words if w not in stop_words]
        stems = [porter.stem(t) for t in words]
        for word_stem in stems:
            wordfreq[word_stem] = wordfreq.get(word_stem, 0) + 1
        data.append(stems)
    print("News data was prepared")
    return data, wordfreq
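
# --- Hedged example (not in the original gist): a tiny smoke test for
# prepare_data on a hypothetical two-row frame. Exact stems depend on the
# Porter stemmer (e.g. "loudly" -> "loudli").
toy = pd.DataFrame({'text': ["Dogs bark loudly.", "Cats sleep all day."]})
toy_data, toy_freq = prepare_data(toy, table, stop_words)
print(toy_data)   # e.g. [['dog', 'bark', 'loudli'], ['cat', 'sleep', 'day']]
print(toy_freq)   # e.g. {'dog': 1, 'bark': 1, 'loudli': 1, ...}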
# Calculate TF-IDF for the most frequent stems.
def tfidf_model(data, most_freq):
    # IDF: log of corpus size over (1 + document frequency). The +1 avoids
    # division by zero; len(data) replaces the global `news` frame the
    # original code reached for, so the function is self-contained.
    word_idf_values = {}
    for token in most_freq:
        doc_containing_word = 0
        for document in data:
            if token in document:
                doc_containing_word += 1
        word_idf_values[token] = np.log(len(data) / (1 + doc_containing_word))

    # TF with add-one (Laplace-style) smoothing: counts start at 1 and the
    # document length is inflated by the vocabulary size.
    word_tf_values = {}
    for token in most_freq:
        doc_tf_vector = []
        for document in data:
            doc_freq = 1
            for word in document:
                if token == word:
                    doc_freq += 1
            word_tf = doc_freq / (len(document) + len(most_freq))
            doc_tf_vector.append(word_tf)
        word_tf_values[token] = doc_tf_vector

    # TF-IDF = TF * IDF, assembled as a (terms x docs) matrix and then
    # transposed to (docs x terms) for scikit-learn.
    tfidf_values = []
    for token in word_tf_values.keys():
        tfidf_docs = []
        for tf_doc in word_tf_values[token]:
            tf_idf_score = tf_doc * word_idf_values[token]
            tfidf_docs.append(tf_idf_score)
        tfidf_values.append(tfidf_docs)
    tf_idf_model = np.asarray(tfidf_values)
    tf_idf_model = np.transpose(tf_idf_model)
    return tf_idf_model
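
# --- Hedged sketch (not in the original gist): a shape/sanity cross-check
# against scikit-learn's TfidfVectorizer. Values will not match the manual
# model exactly, since sklearn uses idf = ln((1+n)/(1+df)) + 1 and
# L2-normalises each row by default.
from sklearn.feature_extraction.text import TfidfVectorizer

def sklearn_tfidf(data, most_freq):
    docs = [' '.join(tokens) for tokens in data]  # rejoin stems into strings
    vec = TfidfVectorizer(vocabulary=most_freq)   # restrict to the same vocabulary
    return vec.fit_transform(docs).toarray()      # shape: (n_docs, len(most_freq))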
news_file = data_path + 'news.csv'
print("loading news dataset....")
news = pd.read_csv(news_file)
print("Number of records:", len(news))

print("Preparing data...")
data, wordfreq = prepare_data(news, table, stop_words)

print("Training ...")
acc_list = {}
# Sweep the vocabulary size: keep the N most frequent stems as features.
for features_num in [100, 200, 400, 800, 1600, 3200]:
    print("Feature Nums:", features_num)
    most_freq = heapq.nlargest(features_num, wordfreq, key=wordfreq.get)
    X = tfidf_model(data, most_freq)
    # Clean the category labels down to letters only.
    y = [re.sub('[^A-Za-z]', ' ', str(y)).strip(' ') for y in news.loc[:, 'category']]
    classifier = GaussianNB()
    classes = ["sport", "tech", "business", "entertainment", "politics"]
    acc, cr = evalsplit_conf(classifier, X, y, 0.2, True, classes)
    print("accuracy:", acc.round(4))
    acc_list[features_num] = acc.round(4)
    print(cr)

fig, ax = plt.subplots(figsize=(10, 10))
plot_dict(acc_list, plt)
print(acc_list)
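
# --- Hedged addition (not in the original gist): report the vocabulary
# size that scored highest in the sweep above.
best_n = max(acc_list, key=acc_list.get)
print("Best feature count:", best_n, "accuracy:", acc_list[best_n])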