Skip to content

Instantly share code, notes, and snippets.

@andrea-dagostino
Last active April 9, 2022 23:08
Show Gist options
  • Save andrea-dagostino/4d876dc236865b71544aee8f6f716594 to your computer and use it in GitHub Desktop.
Save andrea-dagostino/4d876dc236865b71544aee8f6f716594 to your computer and use it in GitHub Desktop.
posts/raggruppamento-testuale-con-tf-idf
# import required sklearn libs
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# import other required libs
import pandas as pd
import numpy as np
# string manipulation libs
import re
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
# viz libs
import matplotlib.pyplot as plt
import seaborn as sns
# Newsgroups to download. Judging by their names and the cluster_map below,
# they fall into three broad themes (technology, sport, religion), which is
# why KMeans is asked for three clusters.
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'alt.atheism',
    'soc.religion.christian',
]
# headers, footers and quoted replies are removed so only the message body is clustered
dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame(dataset.data, columns=["corpus"])
# NOTE: preprocess_text must already be defined when this line runs; in this
# listing its definition appears further down.
df['cleaned'] = df['corpus'].apply(lambda x: preprocess_text(x, remove_stopwords=True))
# initialize vectorizer: sublinear tf scaling, ignore terms found in fewer than
# 5 documents or in more than 95% of the documents
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# fit_transform applies TF-IDF to clean texts - we save the array of vectors in X
X = vectorizer.fit_transform(df['cleaned'])
# initialize KMeans with 3 clusters.
# n_init is pinned to 10 (the historical default): scikit-learn 1.4 changed the
# default to 'auto', and the hard-coded cluster_map below depends on which
# integer label each cluster ends up with.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X)
clusters = kmeans.labels_
# initialize PCA with 2 components (used only to draw the documents in 2D)
pca = PCA(n_components=2, random_state=42)
# pass X to the pca - it is converted to a dense array first
pca_vecs = pca.fit_transform(X.toarray())
# save the two dimensions in x0 and x1
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]
# assign clusters and PCA vectors to columns in the original dataframe
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1
# mapping found through get_top_keywords; it is tied to this exact KMeans run,
# so re-check it whenever the data or the KMeans parameters change
cluster_map = {0: "sport", 1: "technology", 2: "religion"}
df['cluster'] = df['cluster'].map(cluster_map)
# set image size
plt.figure(figsize=(12, 7))
# set title
plt.title("Raggruppamento TF-IDF + KMeans 20newsgroup", fontdict={"fontsize": 18})
# set axes names
plt.xlabel("X0", fontdict={"fontsize": 16})
plt.ylabel("X1", fontdict={"fontsize": 16})
# create scatter plot with seaborn, where hue is the class used to group the data
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis")
plt.show()
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Clean the input text.

    The steps are applied in this order:
    - remove links (any run of non-whitespace characters starting with "http")
    - replace every run of characters other than A-Z / a-z (numbers,
      punctuation, whitespace, ...) with a single space
    - optionally remove English stopwords
    - transform to lower case and strip leading/trailing whitespace

    Arguments:
        text (str): text to clean
        remove_stopwords (bool): remove stopwords or not
    Returns:
        str: cleaned text
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special chars
    text = re.sub(r"[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # Load the stopword list once per call and keep it in a set, instead of
        # re-reading the corpus and scanning a list for every single token.
        english_stopwords = set(stopwords.words("english"))
        # 1. creates tokens
        tokens = nltk.word_tokenize(text)
        # 2. keeps only the tokens that are not stopwords (case-insensitive)
        tokens = [w for w in tokens if w.lower() not in english_stopwords]
        # 3. joins all tokens again
        text = " ".join(tokens)
    # returns cleaned text
    return text.lower().strip()
def get_top_keywords(n_terms):
    """Print the top TF-IDF keywords of each KMeans cluster.

    Reads the module-level ``X`` (TF-IDF matrix), ``clusters`` (one KMeans
    label per row of ``X``) and ``vectorizer`` (the fitted TfidfVectorizer).

    Arguments:
        n_terms (int): number of terms to print per cluster. Values larger
            than the vocabulary print every term; zero or negative values
            print no terms.
    Returns:
        None: for every cluster a "Cluster <label>" header is printed,
        followed by the terms joined with commas, ordered by ascending mean
        TF-IDF score (the strongest term comes last).
    """
    # mean TF-IDF vector of the documents assigned to each cluster
    cluster_means = pd.DataFrame(X.toarray()).groupby(clusters).mean()
    terms = vectorizer.get_feature_names_out()  # access to tf idf terms
    for label, row in cluster_means.iterrows():
        ranked = np.argsort(row.to_numpy())  # term indices, lowest score first
        # Clamp the count: a plain [-n_terms:] slice would return *every*
        # term for n_terms == 0, because -0 is 0.
        keep = max(0, min(n_terms, ranked.size))
        top = ranked[ranked.size - keep:]
        print('\nCluster {}'.format(label))
        print(','.join([terms[t] for t in top]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment