Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Classify the 20 newsgroups test split with a class-based TF-IDF (c-TF-IDF)
# model: every class is represented by one vector built from all of its
# training documents, and each test document is assigned the class whose
# vector it is most cosine-similar to.
#
# NOTE: CTFIDFVectorizer is neither defined nor imported in this snippet; it
# must already be in scope (defined or imported elsewhere) before this runs.

# Parts of each post that are stripped before vectorizing. Shared by the train
# and test loads so both splits are preprocessed identically.
REMOVED_PARTS = ('headers', 'footers', 'quotes')

# Get train data and join all documents of a class into one long document,
# giving one row per class.
train = fetch_20newsgroups(subset='train', remove=REMOVED_PARTS)
docs = pd.DataFrame({'Document': train.data, 'Class': train.target})
docs_per_class = docs.groupby(['Class'], as_index=False).agg({'Document': ' '.join})

# Create c-TF-IDF based on the train data. The vocabulary is learned from the
# per-class documents; n_samples is the number of individual (un-joined)
# training documents.
count_vectorizer = CountVectorizer().fit(docs_per_class.Document)
train_counts = count_vectorizer.transform(docs_per_class.Document)
ctfidf_vectorizer = CTFIDFVectorizer().fit(train_counts, n_samples=len(docs))
ctfidf = ctfidf_vectorizer.transform(train_counts)

# Predict test data: vectorize each test document with the fitted vectorizers
# and compare it against every class vector.
test = fetch_20newsgroups(subset='test', remove=REMOVED_PARTS)
test_counts = count_vectorizer.transform(test.data)
test_ctfidf = ctfidf_vectorizer.transform(test_counts)

# These are similarities (higher means closer), hence argmax rather than argmin.
similarities = cosine_similarity(test_ctfidf, ctfidf)
best_class_row = np.argmax(similarities, axis=1)

# argmax yields a row position in docs_per_class, not a class label. Map it
# through the Class column so the prediction stays correct even when labels
# are not exactly 0..k-1 in row order.
prediction = docs_per_class['Class'].to_numpy()[best_class_row]
print(metrics.classification_report(test.target, prediction, target_names=test.target_names))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment