@andrea-dagostino
Created October 3, 2022 17:59
text_sim_tfidf
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
# If the NLTK resources are missing, run once:
# nltk.download('punkt'); nltk.download('stopwords')
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
# Remember to create the dataset with the scraping script available here:
# https://www.diariodiunanalista.it/posts/come-scraperare-un-blog-e-raccogliere-i-suoi-articoli
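# Assumption (not part of the original gist): the scraping script above is expected
# to produce a table with at least a 'url' and an 'article' column. The file name
# below is only a placeholder for whatever that script actually saves.
df = pd.read_csv('articles.csv')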
# Keep only the rows whose URL points to a blog post
posts = df[df.url.str.contains('post')].reset_index(drop=True)
# Translation table used to strip punctuation, plus the Italian stop word list
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
ita_stopwords = stopwords.words('italian')
def preprocess(text):
    # Lowercase, strip punctuation and tokenize the text
    return nltk.word_tokenize(text.lower().translate(remove_punctuation_map))
vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words=ita_stopwords)
def compute_similarity(a, b):
    # Vectorize the two texts and return the off-diagonal entry of the
    # similarity matrix, i.e. the similarity between text a and text b
    tfidf = vectorizer.fit_transform([a, b])
    return (tfidf * tfidf.T).toarray()[0, 1]
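# Sketch (assumption, not in the original gist): because TfidfVectorizer
# L2-normalizes rows by default, the dot product above already equals the
# cosine similarity. The same value can be computed explicitly with scikit-learn:
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity_cosine(a, b):
    # Hypothetical alternative to compute_similarity, shown for clarity only
    tfidf = vectorizer.fit_transform([a, b])
    return cosine_similarity(tfidf[0], tfidf[1])[0, 0]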
# Pairwise similarity matrix between all articles (O(n^2) comparisons)
M = np.zeros((posts.shape[0], posts.shape[0]))
for i, row in tqdm(posts.iterrows(), total=posts.shape[0], desc='1st level'):
    for j, next_row in posts.iterrows():
        M[i, j] = compute_similarity(row.article, next_row.article)
# Derive a short label (the post slug) from each URL
labels = posts.url.str.split('/').str[3:].str[1]
similarity_df = pd.DataFrame(M, columns=labels, index=labels)
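# Sketch (assumption, not in the original gist): with the labelled matrix in place,
# the post most similar to the first article can be looked up like this:
most_similar_to_first = similarity_df.iloc[0].drop(similarity_df.index[0]).idxmax()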
# Mask for the upper triangle so each pair is shown only once
mask = np.triu(np.ones_like(similarity_df, dtype=bool))
plt.figure(figsize=(12, 12))
sns.heatmap(
    similarity_df,
    square=True,
    annot=True,
    robust=True,
    fmt='.2f',
    annot_kws={'size': 7, 'fontweight': 'bold'},
    yticklabels=similarity_df.columns,
    xticklabels=similarity_df.columns,
    cmap="YlGnBu",
    mask=mask
)
plt.title('Heatmap of similarities between texts', fontdict={'fontsize': 24})
plt.show()