@duhaime
Created November 8, 2017 11:43
Non-Negative Matrix Factorization for Topic Modeling
from __future__ import division
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import decomposition
import codecs
import json
import os
import numpy as np

def read_file(path):
    '''Return the contents of a UTF-8 encoded text file.'''
    with codecs.open(path, 'r', 'utf8') as f:
        return f.read()

def build_corpus(infiles):
    '''Yield the text of each input file, one document at a time.'''
    for i in infiles:
        yield read_file(i)

def write_json(filename, obj):
    '''Serialize obj to disk as JSON.'''
    with open(filename, 'w') as out:
        json.dump(obj, out)

root_dir = '../text_files/'  # directory of plain-text input files
n_files = None               # cap on the number of files to read (None = no cap)
n_topics = 250               # number of NMF components (topics) to learn
n_iter = 100                 # maximum NMF iterations
n_top_words = 10             # number of words to keep per topic
n_features = n_topics * n_top_words * 1000  # vocabulary cap for the vectorizer
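# With the values above, n_features = 250 * 10 * 1000 = 2,500,000 candidate terms,
# so the cap will rarely truncate the vocabulary for modest corpora.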
# Find the list of files to process
infiles = []
keep_matching = True
for root, dirnames, filenames in os.walk(root_dir):
    if keep_matching:
        for filename in filenames:
            infiles.append(os.path.join(root, filename))
            if n_files and len(infiles) >= n_files:
                keep_matching = False
                break
# The algorithmic complexity of NMF is polynomial in the size of the term-document
# matrix, so we limit that size with the vectorizer's max_features argument
corpus = build_corpus(infiles)
vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.95, min_df=2, max_features=n_features)
tdm = vectorizer.fit_transform(corpus)
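# Sanity check (illustrative, not in the original gist): tdm is a sparse matrix
# with one row per document and at most n_features columns, e.g.:
# print(tdm.shape)  # -> (len(infiles), number_of_kept_terms)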
feature_names = vectorizer.get_feature_names()  # on scikit-learn >= 1.0, use get_feature_names_out()
# Shapes: tdm is (docs x vocab), nmf.components_ is (topics x vocab), doc_topics is (docs x topics)
nmf = decomposition.NMF(n_components=n_topics, random_state=1, max_iter=n_iter)
doc_topics = nmf.fit_transform(tdm)
# Build a row-normalized ('centered') copy of doc_topics in which each document's
# topic weights sum to one
centered_doc_topics = doc_topics / np.sum(doc_topics, axis=1, keepdims=True)
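# Caveat (illustrative guard, not in the original gist): a document whose doc_topics
# row is all zeros makes the division above produce NaNs; one way to avoid that is
# to clip zero row sums to one before dividing:
# row_sums = np.sum(doc_topics, axis=1, keepdims=True)
# centered_doc_topics = doc_topics / np.where(row_sums == 0, 1, row_sums)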
# c = document id, doc_row = that document's distribution over topics
doc_to_topics = defaultdict(dict)
for c, doc_row in enumerate(centered_doc_topics):
    for topic_id, topic_presence_in_doc in enumerate(doc_row):
        doc_to_topics[c][topic_id] = topic_presence_in_doc
# Get the top n words in each topic; argsort()[:-n_top_words - 1:-1] yields the
# indices of the n_top_words largest weights in descending order
topic_to_words = defaultdict(list)
for topic_id, topic in enumerate(nmf.components_):
    topic_to_words[topic_id].append([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
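# Spot check (illustrative, not in the original gist): print each topic's top words
# for topic_id in sorted(topic_to_words):
#     print(topic_id, ' '.join(topic_to_words[topic_id][0]))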
write_json('documents.json', infiles)
write_json('doc_to_topics.json', doc_to_topics)
write_json('topic_to_words.json', topic_to_words)
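# Downstream note: json.dump writes the integer keys above as strings. A consumer
# could restore them like this (illustrative sketch, not part of the original gist):
# with open('doc_to_topics.json') as f:
#     doc_to_topics = {int(doc): {int(t): w for t, w in row.items()}
#                      for doc, row in json.load(f).items()}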