Skip to content

Instantly share code, notes, and snippets.

@andrea-dagostino
Last active November 23, 2021 17:48
Show Gist options
  • Save andrea-dagostino/00849c831e5c1f4b59232fcec2665c46 to your computer and use it in GitHub Desktop.
Save andrea-dagostino/00849c831e5c1f4b59232fcec2665c46 to your computer and use it in GitHub Desktop.
posts/raggruppamento-testuale-con-tf-idf
# scikit-learn: dataset loader, TF-IDF vectorizer, k-means clustering and PCA
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
# import other required libs
import pandas as pd
import numpy as np
# string manipulation libs
import re
import string
import nltk
from nltk.corpus import stopwords
# viz libs
import matplotlib.pyplot as plt
import seaborn as sns
# Newsgroup names to request from the 20 newsgroups loader.
# The order of this list is kept exactly as originally written.
categories = [
    # computing
    "comp.graphics",
    "comp.os.ms-windows.misc",
    # sports
    "rec.sport.baseball",
    "rec.sport.hockey",
    # religion-related
    "alt.atheism",
    "soc.religion.christian",
]
# Load the 'train' subset, restricted to the newsgroups listed in
# `categories`. The loader is asked to shuffle the samples and to remove
# headers, footers and quotes from each post.
dataset = fetch_20newsgroups(
    categories=categories,
    subset="train",
    shuffle=True,
    remove=("headers", "footers", "quotes"),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment