
@aneesha
aneesha / SiameseBERT_SemanticSearch.ipynb
Last active August 9, 2023 00:48
Semantic Search with Sentence-BERT
(The notebook preview could not be rendered.)
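Since the notebook itself is not rendered above, here is a minimal sketch of semantic search with Sentence-BERT embeddings via the sentence-transformers package; the model name, corpus, and query below are illustrative assumptions, not the notebook's actual code.

# Minimal sketch of semantic search with Sentence-BERT embeddings.
# Model name, corpus, and query are illustrative assumptions.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

corpus = ['A man is eating food.',
          'A monkey is playing drums.',
          'Someone is riding a horse.']
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

query_embedding = model.encode('What is the horse doing?', convert_to_tensor=True)

# Rank corpus sentences by cosine similarity to the query
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=3)[0]
for hit in hits:
    print(corpus[hit['corpus_id']], hit['score'])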
@aneesha
aneesha / display_closestwords_tsnescatterplot.ipynb
Last active January 31, 2021 20:11
Use t-SNE to plot only the words most similar to a given word in a Word2Vec model
(The notebook preview could not be rendered.)
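This notebook is not rendered either, so here is a rough sketch of the idea under stated assumptions: given a trained gensim Word2Vec model and a query word, reduce the word and its nearest neighbours to 2-D with t-SNE and scatter-plot them.

# Rough sketch (not the notebook's code): plot a word and its most similar
# words in 2-D via t-SNE.  `model` is assumed to be a trained gensim Word2Vec.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def display_closestwords_tsnescatterplot(model, word, topn=10):
    # The query word plus its nearest neighbours in the embedding space
    close_words = [word] + [w for w, _ in model.wv.most_similar(word, topn=topn)]
    vectors = np.array([model.wv[w] for w in close_words])

    # Perplexity must be smaller than the number of points being embedded
    coords = TSNE(n_components=2, perplexity=5, random_state=0).fit_transform(vectors)

    plt.scatter(coords[:, 0], coords[:, 1])
    for (x, y), label in zip(coords, close_words):
        plt.annotate(label, xy=(x, y), xytext=(2, 2), textcoords='offset points')
    plt.show()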
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
def display_topics(model, feature_names, no_top_words):
    # Print the highest-weighted words for each topic of a fitted NMF/LDA model
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
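A short usage sketch for the function above (the corpus, feature count, and topic count are illustrative assumptions): vectorize the 20 newsgroups corpus, fit LDA and/or NMF, and pass the fitted model and vocabulary to display_topics.

# Illustrative usage -- feature and topic counts are assumptions
documents = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')).data

tf_vectorizer = CountVectorizer(max_features=1000, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
lda = LatentDirichletAllocation(n_components=10, random_state=0).fit(tf)
# Use get_feature_names() on scikit-learn versions older than 1.0
display_topics(lda, tf_vectorizer.get_feature_names_out(), 10)

tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
nmf = NMF(n_components=10, random_state=1).fit(tfidf)
display_topics(nmf, tfidf_vectorizer.get_feature_names_out(), 10)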
'''
Merge/combine courses in the Open edX OLX format.
'''
import sys
import os
from distutils.dir_util import copy_tree
import json
# Example:
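The preview cuts off before the example. Purely as a hypothetical illustration (the directory layout and merge strategy below are assumptions, not the gist's actual logic), combining two exported OLX courses could copy each content directory of one course into the other:

# Hypothetical illustration -- paths and subdirectory list are assumptions;
# uses the os and copy_tree imports from the snippet above
source_course = 'course_b'
target_course = 'merged_course'

for subdir in ('chapter', 'sequential', 'vertical', 'html', 'problem'):
    src = os.path.join(source_course, subdir)
    dst = os.path.join(target_course, subdir)
    if os.path.isdir(src):
        # copy_tree merges into an existing directory (assumes no name collisions)
        copy_tree(src, dst)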
@aneesha
aneesha / NNDEIG.m
Created June 7, 2014 12:36
Initialization for symmetric NMF using eigendecomposition, based on the NNDSVD technique of Boutsidis & Gallopoulos (still experimental; the math may not be correct)
function [W] = NNDEIG(A,k,flag);
%
% This function adapts the NNDSVD algorithm described in [1] for the
% initialization of Nonnegative Matrix Factorization algorithms to the
% symmetric NMF case, and therefore uses an eigendecomposition
%
% [W] = nndeig(A,k,flag);
%
% INPUT
% ------------
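The MATLAB listing is cut off above. As a rough Python sketch of the same idea only (a heavily hedged reading of NNDSVD specialized to symmetric A ~ W @ W.T, not the gist's verified math), one can take the top-k eigenpairs and keep the dominant nonnegative part of each eigenvector:

# Rough Python sketch of an NNDSVD-style initialization for symmetric NMF
# (A ~ W @ W.T).  Assumptions, not verified math: A is symmetric and its
# top-k eigenvalues are nonnegative.
import numpy as np

def nndeig_init(A, k):
    eigvals, eigvecs = np.linalg.eigh(A)      # eigenvalues in ascending order
    top = np.argsort(eigvals)[::-1][:k]       # indices of the top-k eigenpairs
    W = np.zeros((A.shape[0], k))
    for j, idx in enumerate(top):
        v = eigvecs[:, idx]
        pos, neg = np.maximum(v, 0), np.maximum(-v, 0)
        # keep whichever sign gives the larger nonnegative part of the eigenvector
        part = pos if np.linalg.norm(pos) >= np.linalg.norm(neg) else neg
        W[:, j] = np.sqrt(max(eigvals[idx], 0.0)) * part
    return W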
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    # Print the top words for each topic (each row of H, the topic-term matrix)
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    # H: topic-term matrix, W: document-topic matrix
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        # Show the documents that load most heavily on this topic
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])
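A usage sketch for this variant (corpus and parameter values are illustrative assumptions): with scikit-learn, H is the fitted model's components_ matrix and W is obtained by calling transform() on the document-term matrix.

# Illustrative usage -- H is the topic-term matrix, W the document-topic matrix
from sklearn.datasets import fetch_20newsgroups

documents = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')).data
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)

nmf = NMF(n_components=10, random_state=1).fit(tfidf)
nmf_H = nmf.components_           # topic-term weights
nmf_W = nmf.transform(tfidf)      # document-topic weights

# Use get_feature_names() on scikit-learn versions older than 1.0
display_topics(nmf_H, nmf_W, tfidf_vectorizer.get_feature_names_out(),
               documents, no_top_words=10, no_top_documents=3)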
import dask.dataframe as dd

# Lazily read every matching CSV in one go, parsing the timestamp column
df = dd.read_csv('logs/2018-*.*.csv', parse_dates=['timestamp'])
# Mean of `value` per hour of the day; compute() triggers the actual work
df.groupby(df.timestamp.dt.hour).value.mean().compute()
import h5py
# Open the HDF5 file read-only and point at a dataset inside it
f = h5py.File('myhdf5file.hdf5', 'r')
dset = f['/data/path']

import dask.array as da
# Wrap the on-disk dataset in a chunked dask array for out-of-core computation
x = da.from_array(dset, chunks=(5000, 5000))
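Operations on x stay lazy until compute() is called; for example (an illustrative reduction, not part of the gist):

# Illustrative: nothing is read from the HDF5 file until .compute()
column_means = x.mean(axis=0)
print(column_means.compute())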