Non-Negative Matrix Factorization for Topic Modeling
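This gist reads a directory of plain-text files, builds a TF-IDF term-document matrix with scikit-learn, factorizes that matrix with Non-Negative Matrix Factorization, and writes the resulting document-to-topic and topic-to-word mappings to JSON.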
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from sklearn import decomposition
import json, os
import numpy as np
# Return the contents of the utf8-encoded file at path
def read_file(path):
  with open(path, 'r', encoding='utf8') as f:
    return f.read()

# Lazily yield the contents of each input file
def build_corpus(infiles):
  for i in infiles:
    yield read_file(i)

# Serialize obj to filename as JSON
def write_json(filename, obj):
  with open(filename, 'w') as out:
    json.dump(obj, out)
root_dir = '../text_files/'
n_files = None    # cap on the number of files to read; None reads all files
n_topics = 250    # number of NMF components (topics) to learn
n_iter = 100      # maximum number of NMF iterations
n_top_words = 10  # number of words to store per topic
n_features = n_topics * n_top_words * 1000  # vocabulary cap for the vectorizer
# Collect the list of files to process, stopping early once n_files is reached (if set)
infiles = []
for root, dirnames, filenames in os.walk(root_dir):
  for filename in filenames:
    infiles.append(os.path.join(root, filename))
    if n_files and len(infiles) >= n_files:
      break
  if n_files and len(infiles) >= n_files:
    break
# NMF runs in polynomial time in the size of the input matrix, so cap the
# vocabulary with max_features in the vectorizer call to keep it tractable
corpus = build_corpus(infiles)
vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.95, min_df=2, max_features=n_features)
tdm = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
feature_names = vectorizer.get_feature_names_out()

# tdm: (docs x vocab); nmf.components_: (topics x vocab); doc_topics: (docs x topics)
nmf = decomposition.NMF(n_components=n_topics, random_state=1, max_iter=n_iter)
doc_topics = nmf.fit_transform(tdm)
# Normalize doc_topics so that each document's topic weights sum to one
normed_doc_topics = doc_topics / np.sum(doc_topics, axis=1, keepdims=True)

# doc_id = document index; doc_row = that document's distribution over topics
doc_to_topics = defaultdict(dict)
for doc_id, doc_row in enumerate(normed_doc_topics):
  for topic_id, topic_presence_in_doc in enumerate(doc_row):
    doc_to_topics[doc_id][topic_id] = topic_presence_in_doc
# Get the n_top_words highest-weighted words in each topic; the argsort slice
# walks the sorted term indices from largest weight to smallest
topic_to_words = {}
for topic_id, topic in enumerate(nmf.components_):
  topic_to_words[topic_id] = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
write_json('documents.json', infiles)
write_json('doc_to_topics.json', doc_to_topics)
write_json('topic_to_words.json', topic_to_words)
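
A natural extension is to compare documents in the learned topic space with scikit-learn's cosine_similarity. Below is a minimal sketch of that idea, assuming the script above has just run so that doc_topics, infiles, and np are still in scope; the most_similar helper is an illustration, not part of the gist.

from sklearn.metrics.pairwise import cosine_similarity

# sims[i][j] holds the cosine similarity between the topic vectors of docs i and j
sims = cosine_similarity(doc_topics)

def most_similar(doc_id, n=5):
  # sort similarities in descending order, then drop the document itself
  nearest = np.argsort(sims[doc_id])[::-1][1:n + 1]
  return [(infiles[i], float(sims[doc_id][i])) for i in nearest]

# print the five documents closest in topic space to the first document
for path, score in most_similar(0):
  print(path, round(score, 3))

Because similarity is computed over topic weights rather than raw term counts, two documents can rank as close neighbors even when they share few exact words.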
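
A companion sketch for reading the JSON artifacts back in a later session; the filenames match those written above, and the choice of document 0 is just an arbitrary example.

import json

with open('documents.json') as f:
  documents = json.load(f)
with open('doc_to_topics.json') as f:
  doc_to_topics = json.load(f)
with open('topic_to_words.json') as f:
  topic_to_words = json.load(f)

# JSON object keys are strings, so integer ids must be stringified for lookup
doc_id = 0
weights = doc_to_topics[str(doc_id)]
top = sorted(weights.items(), key=lambda kv: kv[1], reverse=True)[:3]
print(documents[doc_id])
for topic_id, weight in top:
  print(topic_to_words[topic_id], round(weight, 3))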