Skip to content

Instantly share code, notes, and snippets.

View andrea-dagostino's full-sized avatar

Andrea D'Agostino andrea-dagostino

View GitHub Profile
def fuzzy_tagging(tags, articles):
"""
Questa funzione riceve in input una lista di tag predefiniti e la lista di contenuto testuale da taggare.
Restituisce un dataframe Pandas con gli articoli taggati
"""
results = []
# ciclo nei tag
for i, tag in enumerate(tags):
d = {}
ranking = process.extract(tag, articles, limit=4)
# Load the dataset and keep only the rows whose URL contains 'post'.
df = pd.read_csv('dataset.csv')
# Filter and renumber in one chained expression instead of calling
# reset_index(inplace=True) on the filtered frame: the result is a fresh
# DataFrame with a 0..n-1 index and no in-place mutation of a selection of `df`.
posts = df[df.url.str.contains('post')].reset_index(drop=True)
# Article texts as a plain Python list, in the same order as the rows of `posts`.
articles = posts.article.tolist()
# These are the tags we want to apply to our documents.
# Change this list as you see fit.
tags = [
"machine learning",
"clustering",
"carriera",
"progetto",
"consigli",
"analytics",
"deep learning",
from thefuzz import process
import pandas as pd
# Keep only the similarity scores above the cutoff; every other cell becomes NaN.
# The 0.4 threshold is the value to change.
top = similarity_df.where(similarity_df > 0.4)
# Matrix of ones shaped like `top`, reduced to its upper triangle (diagonal included).
mask = np.triu(np.ones_like(top))
# let's create the viz
plt.figure(figsize=(12, 12))
sns.heatmap(
top,
square=True,
annot=True,
robust=True,
# Use the fifth '/'-separated segment of each URL as the article's title/label.
labels = posts.url.str.split('/').str[4]
# Wrap the similarity matrix in a DataFrame labelled by article on both axes.
similarity_df = pd.DataFrame(data=M, index=labels, columns=labels)
# Upper-triangle mask (diagonal included), used to hide the top of the heatmap.
mask = np.triu(np.ones_like(similarity_df))
# let's create the viz
plt.figure(figsize=(12, 12))
sns.heatmap(
similarity_df,
square=True,
annot=True,
# Pairwise similarity matrix: M[i, j] holds the result of comparing
# article i with article j, so it is square with one row/column per post.
article_texts = posts.article.tolist()
M = np.zeros((len(article_texts), len(article_texts)))
# Matrix positions come from enumerate rather than from the DataFrame index
# labels, so the fill stays correct even if `posts` does not carry a
# 0..n-1 index.
for i, text_i in enumerate(tqdm(article_texts, desc='1st level')):
    for j, text_j in enumerate(article_texts):
        M[i, j] = compute_similarity(text_i, text_j)
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
# Keep only the similarity scores above the cutoff; every other cell becomes NaN.
# The 0.4 threshold is the value to change.
top = similarity_df.where(similarity_df > 0.4)
# Matrix of ones shaped like `top`, reduced to its upper triangle (diagonal included).
mask = np.triu(np.ones_like(top))
sns.heatmap(
top,
square=True,
annot=True,
robust=True,
fmt='.2f',
# Use the fifth '/'-separated segment of each URL as the article's title/label.
labels = posts.url.str.split('/').str[4]
# Wrap the similarity matrix in a DataFrame labelled by article on both axes.
similarity_df = pd.DataFrame(data=M, index=labels, columns=labels)
# Upper-triangle mask (diagonal included), used to hide the top of the heatmap.
mask = np.triu(np.ones_like(similarity_df))
# creiamo la visualizzazione
plt.figure(figsize=(12, 12))
sns.heatmap(
similarity_df,
square=True,
annot=True,