Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Classify the 20 Newsgroups test set by comparing each test document with
# one c-TF-IDF vector per class built from the training set.
# NOTE(review): `CTFIDFVectorizer` is not defined in this file; it must be
# defined or imported before this script runs.

# Get train data
train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
docs = pd.DataFrame({'Document': train.data, 'Class': train.target})
# Join all documents of a class into a single string, giving one row per
# class. groupby sorts by key, so row i holds class label i as long as every
# label 0..n-1 occurs in the training data; the argmax below relies on that.
docs_per_class = docs.groupby(['Class'], as_index=False).agg({'Document': ' '.join})

# Create c-TF-IDF based on the train data
count_vectorizer = CountVectorizer().fit(docs_per_class.Document)
count = count_vectorizer.transform(docs_per_class.Document)
# n_samples is the number of individual training documents, not the number
# of per-class rows in `count`.
ctfidf_vectorizer = CTFIDFVectorizer().fit(count, n_samples=len(docs))
ctfidf = ctfidf_vectorizer.transform(count)

# Predict test data
test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
# Reuse the vocabulary fitted on the training data; `count` is rebound to the
# test-document counts from here on.
count = count_vectorizer.transform(test.data)
vector = ctfidf_vectorizer.transform(count)
# Despite the name, these are cosine similarities (higher means closer):
# one row per test document, one column per class.
distances = cosine_similarity(vector, ctfidf)
# The predicted class is the column with the highest similarity.
prediction = np.argmax(distances, axis=1)

print(metrics.classification_report(test.target, prediction, target_names=test.target_names))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment