# MaartenGr/basic_ctfidf.py

## basic_ctfidf.py
# Build a c-TF-IDF matrix for the 20 Newsgroups training set: all documents of
# a class are joined into one string, counted per class, and then weighted.
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): CTFIDFVectorizer is neither defined nor imported in this file;
# it must be brought into scope from the module that defines it before this
# script can run -- confirm the import path.

# Get data
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Create documents per label: one row per class, holding every document of
# that class joined with single spaces.
docs = pd.DataFrame({'Document': newsgroups.data, 'Class': newsgroups.target})
docs_per_class = docs.groupby(['Class'], as_index=False).agg({'Document': ' '.join})

# Create c-TF-IDF
# Term counts are computed on the joined per-class documents, while n_samples
# is the number of original (un-joined) documents.
count = CountVectorizer().fit_transform(docs_per_class.Document)
ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(docs))