Skip to content

Instantly share code, notes, and snippets.

@aneesha
aneesha / NNDEIG.m
Created June 7, 2014 12:36
Initialization for symmetric NMF using Eigendecomposition based on NNDSVD technique by Boutsidis & Gallopoulos (still experimental as the math may not be correct)
function [W] = NNDEIG(A,k,flag);
%
% This function implements the NNDSVD algorithm described in [1] for
% initialization of Nonnegative Matrix Factorization Algorithms
% for symmetric NMF so uses Eigendecomposition
%
% [W] = nndeig(A,k,flag);
%
% INPUT
% ------------
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
no_features = 1000
# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
from sklearn.decomposition import NMF, LatentDirichletAllocation
no_topics = 20
# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
def display_topics(model, feature_names, no_top_words):
for topic_idx, topic in enumerate(model.components_):
print "Topic %d:" % (topic_idx)
print " ".join([feature_names[i]
for i in topic.argsort()[:-no_top_words - 1:-1]])
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
def display_topics(model, feature_names, no_top_words):
for topic_idx, topic in enumerate(model.components_):
print "Topic %d:" % (topic_idx)
print " ".join([feature_names[i]
for i in topic.argsort()[:-no_top_words - 1:-1]])
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
for topic_idx, topic in enumerate(H):
print "Topic %d:" % (topic_idx)
print " ".join([feature_names[i]
for i in topic.argsort()[:-no_top_words - 1:-1]])
top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
for doc_index in top_doc_indices:
print documents[doc_index]
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
for topic_idx, topic in enumerate(H):
print "Topic %d:" % (topic_idx)
print " ".join([feature_names[i]
for i in topic.argsort()[:-no_top_words - 1:-1]])
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
for topic_idx, topic in enumerate(H):
print "Topic %d:" % (topic_idx)
print " ".join([feature_names[i]
for i in topic.argsort()[:-no_top_words - 1:-1]])
top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
@aneesha
aneesha / display_closestwords_tsnescatterplot.ipynb
Last active January 31, 2021 20:11
Use TSNE to only plot similar words using Word2Vec
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.