# Load the 20 newsgroups corpus, stripping headers, footers and quoted replies
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
no_features = 1000
# NMF is able to use tf-idf weighted features
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names_out()
from sklearn.decomposition import NMF, LatentDirichletAllocation
no_topics = 20
# Run NMF
# On scikit-learn >= 1.2 the regularization argument is alpha_W (the old alpha was removed)
nmf = NMF(n_components=no_topics, random_state=1, alpha_W=0.1, l1_ratio=0.5, init='nndsvd').fit(tfidf)
# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf)
def display_topics(model, feature_names, no_top_words):
    # model.components_ is an (n_topics, n_features) matrix; for each topic,
    # print the words with the largest weights
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)
import numpy as np

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    # H is the topic-word matrix and W the document-topic matrix; print the
    # top words for each topic followed by the documents that load most
    # heavily on that topic
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])
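This variant of display_topics takes the factor matrices explicitly. A minimal usage sketch with the models fitted above, where W (document-topic weights) comes from transform() and H (topic-word weights) from components_; the counts of top words and documents shown here are arbitrary choices:

# Derive the document-topic (W) and topic-word (H) matrices from the fitted models
nmf_W = nmf.transform(tfidf)
nmf_H = nmf.components_
lda_W = lda.transform(tf)
lda_H = lda.components_
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words=10, no_top_documents=3)
display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words=10, no_top_documents=3)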
# Uncomment to install gensim if it is not already installed
#!pip install gensim
import gensim
# Need the interactive Tools for Matplotlib
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
# git clone https://github.com/mfaruqui/retrofitting.git
# Run retrofit.py with arguments specifying the input word vectors file, the lexicon file,
# the number of iterations, and the output word vectors file. The word vectors must be in text format.
# E.g.:
# python retrofit.py -i word_vec_file -l lexicon_file -n num_iter -o out_vec_file
# python retrofit.py -i /data/glove.6B.50d.txt -l /retrofitting/lexicons/ppdb-xl.txt -n 10 -o retrofittedglove.txt
# Convert text-based GloVe word vectors to word2vec format so gensim can load them
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="/data/glove.6B.50d.txt", word2vec_output_file="glove.6B.50d.word2vec.txt")
# The retrofitted vectors produced above are also in GloVe text format and need the same conversion
glove2word2vec(glove_input_file="retrofittedglove.txt", word2vec_output_file="retrofittedglove.word2vec.txt")
# Load the original word vectors and the retrofitted word vectors as separate gensim models
original_glove_model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.50d.word2vec.txt', binary=False)
retrofitted_glove_model = gensim.models.KeyedVectors.load_word2vec_format('retrofittedglove.word2vec.txt', binary=False)
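The calls below rely on a display_closestwords_tsnescatterplot helper that is not defined in this snippet. A minimal sketch, assuming its arguments are the model, the query word, the vector dimensionality, the number of neighbours to plot, and a plot title:

def display_closestwords_tsnescatterplot(model, word, vec_size, num_words, title):
    # Gather the query word and its nearest neighbours by cosine similarity
    close_words = model.most_similar(word, topn=num_words)
    word_labels = [word]
    arr = np.array([model[word]])
    for similar_word, _ in close_words:
        word_labels.append(similar_word)
        arr = np.append(arr, [model[similar_word]], axis=0)
    # Project the vec_size-dimensional vectors down to 2-D with t-SNE
    # (perplexity must be smaller than the number of points plotted)
    tsne = TSNE(n_components=2, random_state=0, perplexity=5, init='random')
    coords = tsne.fit_transform(arr)
    plt.figure()
    plt.scatter(coords[:, 0], coords[:, 1])
    for label, x, y in zip(word_labels, coords[:, 0], coords[:, 1]):
        plt.annotate(label, xy=(x, y), xytext=(2, 2), textcoords='offset points')
    plt.title(title)
    plt.show()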
# Display the words closest to 'happy' using the original GloVe vectors
display_closestwords_tsnescatterplot(original_glove_model, 'happy', 50, 10, "Original GloVe Word Vectors - 'Happy'")
# Display the words closest to 'happy' using the GloVe vectors retrofitted with the paraphrase (PPDB) lexicon
display_closestwords_tsnescatterplot(retrofitted_glove_model, 'happy', 50, 10, "Retrofitted GloVe Word Vectors - 'Happy'")
import dask.bag as db
import json

# Read all 2018 JSON files into a bag, one parsed record per line
records = db.read_text('data/2018-*-*.json').map(json.loads)
# Bags are lazy, so call compute() to actually run the pipeline
records.filter(lambda d: d['username'] == 'Aneesha').pluck('id').frequencies().compute()
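Because every step above returns another lazy bag, the pipeline can be extended before computing. A hypothetical follow-up, keeping only the ten most frequent ids (frequencies() yields (value, count) pairs, so topk selects by the count field):

top_ids = (records.filter(lambda d: d['username'] == 'Aneesha')
                  .pluck('id')
                  .frequencies()
                  .topk(10, key=lambda kv: kv[1])
                  .compute())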