# import required sklearn libs
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# import other required libs
import pandas as pd
import numpy as np
# string manipulation libs
import re
import string
import nltk
# Fetch the NLTK resources used later in this file: the 'punkt' tokenizer
# models (for nltk.word_tokenize) and the 'stopwords' corpus.
# NOTE(review): this line was garbled in the file; the two download calls are
# reconstructed from the surviving "'punkt')" / "'stopwords')" fragments — confirm.
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
# viz libs
import matplotlib.pyplot as plt
import seaborn as sns
# Newsgroups to load.
# NOTE(review): the original list literal was truncated in this file. The six
# names below are a reconstruction (two groups per topic) chosen to agree with
# the sport / technology / religion labels in cluster_map further down —
# confirm against the intended experiment.
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'alt.atheism',
    'soc.religion.christian',
]
dataset = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, remove=('headers', 'footers', 'quotes'))
# dataset.data holds the raw documents; one row per document
df = pd.DataFrame(dataset.data, columns=["corpus"])
# NOTE(review): preprocess_text is defined further down in this file; its
# definition has to be executed before this line runs (notebook cell order).
df['cleaned'] = df['corpus'].apply(lambda x: preprocess_text(x, remove_stopwords=True))
# initialize vectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, max_df=0.95)
# fit_transform applies TF-IDF to clean texts - we save the array of vectors in X
X = vectorizer.fit_transform(df['cleaned'])
# initialize KMeans with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=42)
# fit the model: labels_ only exists on a fitted estimator
kmeans.fit(X)
# store the cluster label assigned to each document
clusters = kmeans.labels_
# initialize PCA with 2 components
pca = PCA(n_components=2, random_state=42)
# pass X to the pca (PCA needs a dense array, hence toarray())
pca_vecs = pca.fit_transform(X.toarray())
# save the two dimensions in x0 and x1
x0 = pca_vecs[:, 0]
x1 = pca_vecs[:, 1]
# assign clusters and PCA vectors to columns in the original dataframe
df['cluster'] = clusters
df['x0'] = x0
df['x1'] = x1
cluster_map = {0: "sport", 1: "technology", 2: "religion"}  # mapping found through get_top_keywords
df['cluster'] = df['cluster'].map(cluster_map)
# set image size
plt.figure(figsize=(12, 7))
# set title
plt.title("Raggruppamento TF-IDF + KMeans 20newsgroup", fontdict={"fontsize": 18})
# set axes names
plt.xlabel("X0", fontdict={"fontsize": 16})
plt.ylabel("X1", fontdict={"fontsize": 16})
# create scatter plot with seaborn, where hue is the class used to group the data
sns.scatterplot(data=df, x='x0', y='x1', hue='cluster', palette="viridis")
def preprocess_text(text: str, remove_stopwords: bool) -> str:
    """Clean a raw document before vectorization.

    The cleaning steps, in order:
    - remove links (any run of non-whitespace characters starting with "http")
    - replace every run of non-letter characters (numbers, special chars,
      whitespace) with a single space
    - optionally remove English stopwords
    - transform to lower case and strip leading/trailing whitespace

    Args:
        text (str): text to clean
        remove_stopwords (bool): remove stopwords or not

    Returns:
        str: cleaned text
    """
    # remove links
    text = re.sub(r"http\S+", "", text)
    # remove numbers and special chars (also collapses whitespace runs)
    text = re.sub(r"[^A-Za-z]+", " ", text)
    # remove stopwords
    if remove_stopwords:
        # Build the stopword set once per call instead of re-reading the
        # stopword list for every token; a set also makes each lookup O(1).
        english_stopwords = set(stopwords.words("english"))
        # 1. creates tokens
        tokens = nltk.word_tokenize(text)
        # 2. keeps only the tokens that are not stopwords
        tokens = [w for w in tokens if w.lower() not in english_stopwords]
        # 3. joins all tokens again
        text = " ".join(tokens)
    # returns cleaned text
    return text.lower().strip()
def get_top_keywords(n_terms):
    """Print the top TF-IDF keywords of each KMeans cluster.

    Reads the module-level ``X`` (TF-IDF matrix), ``clusters`` (one cluster
    label per document) and ``vectorizer``. The TF-IDF rows are averaged per
    cluster, and for every cluster the ``n_terms`` terms with the highest mean
    score are printed as one comma-separated line, lowest score first.

    Args:
        n_terms (int): number of terms to print per cluster; values <= 0
            print an empty line of terms.
    """
    # groups tf idf vectors per cluster and averages them; toarray() gives a
    # plain ndarray (same call the PCA step uses) rather than an np.matrix
    cluster_means = pd.DataFrame(X.toarray()).groupby(clusters).mean()
    terms = vectorizer.get_feature_names_out()  # access to tf idf terms
    for label, scores in cluster_means.iterrows():
        # column positions sorted by ascending mean tf idf score
        order = np.argsort(scores.to_numpy())
        # a bare [-n_terms:] slice would select every term when n_terms is 0
        top = order[-n_terms:] if n_terms > 0 else order[:0]
        print(f'\nCluster {label}')
        print(','.join(terms[t] for t in top))
