# Example script: build c-TF-IDF features for the 20 Newsgroups training set.
#
# NOTE(review): CTFIDFVectorizer is neither defined nor imported in the visible
# part of this file; it must already be in scope before this script runs.
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

# Get data (training split, with headers, footers and quotes removed)
newsgroups = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Create documents per label: all documents that share a class are joined
# with single spaces into one long string, giving one row per class.
docs = pd.DataFrame({'Document': newsgroups.data, 'Class': newsgroups.target})
docs_per_class = docs.groupby(['Class'], as_index=False).agg({'Document': ' '.join})

# Create c-TF-IDF: count terms per class-level document, then transform the
# counts. n_samples is the number of original (un-joined) documents.
count = CountVectorizer().fit_transform(docs_per_class.Document)
ctfidf = CTFIDFVectorizer().fit_transform(count, n_samples=len(docs))
# Scraped GitHub page footer (not part of the script):
# "Sign up for free to join this conversation on GitHub.
#  Already have an account? Sign in to comment"