aneesha

## edx_olxformat_merge_courses.py
'''
Merge/combine courses in the Open edX OLX format.
'''

import sys
import os
from distutils.dir_util import copy_tree
import json

# Example:

## SiameseBERT_SemanticSearch.ipynb

      
              1 file
            
          
              2 forks
            
          
              0 comments
            
          
              1 star
            
          
                aneesha
                / SiameseBERT_SemanticSearch.ipynb
            
            
              Last active
              August 9, 2023 00:48
            
              
                Semantic Search with Sentence-BERT
              
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## dask_delayed_demo.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              1 star
            
          
                aneesha
                / dask_delayed_demo.ipynb
            
            
              Created
              June 6, 2018 09:49
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## dask_array.py
import h5py

# Open the HDF5 file explicitly read-only: the default mode differs between
# h5py releases, and this snippet only ever reads from the file.
f = h5py.File('myhdf5file.hdf5', mode='r')
dset = f['/data/path']

import dask.array as da

# Wrap the on-disk dataset as a dask array split into 5000 x 5000 chunks.
# The file handle is deliberately left open: dask reads from `dset` lazily,
# when the array is actually computed.
x = da.from_array(dset, chunks=(5000, 5000))

## dask_bag_process_json.py
import dask.bag as db
import json

# Load the text files matching the glob into a dask bag and parse each
# element with json.loads.
records = db.read_text('data/2018-*-*.json').map(json.loads)
# Keep the records whose 'username' is 'Aneesha', take their 'id' field and
# count how often each id occurs.
# NOTE(review): the result is neither assigned nor passed to .compute(), so
# as a standalone script this presumably only builds a lazy bag and discards
# it — confirm whether a .compute() call is intended.
records.filter(lambda d: d['username'] == 'Aneesha').pluck('id').frequencies()

## dask_loadfiles_pandas.py
import dask.dataframe as dd

# Load every CSV file matching the glob into a single dask dataframe,
# parsing the 'timestamp' column as datetimes.
df = dd.read_csv('logs/2018-*.*.csv', parse_dates=['timestamp'])
# Mean of the 'value' column grouped by the hour of the timestamp;
# .compute() triggers the evaluation.
df.groupby(df.timestamp.dt.hour).value.mean().compute()

## wordvector_retrofitting.ipynb

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                aneesha
                / wordvector_retrofitting.ipynb
            
            
              Last active
              November 29, 2017 12:00
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## retrofitting_blog_4.py
# Load the original word vectors and the retrofitted word vectors as separate
# gensim models, then plot the nearest neighbours of 'happy' for each.
# The snippet refers to `gensim.models...`, so the package name itself must be
# bound here.
import gensim

original_glove_model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.50d.word2vec.txt', binary=False)
retrofitted_glove_model = gensim.models.KeyedVectors.load_word2vec_format('retrofittedglove.word2vec.txt', binary=False)

# Display the words closest to 'happy' using the original GloVe vectors.
# Arguments: model, query word, vector dimension (50), number of similar
# words (10), plot title.
display_closestwords_tsnescatterplot(original_glove_model, 'happy', 50, 10, "Original Glove Word Vectors - 'Happy'")
# Display the words closest to 'happy' using the GloVe vectors retrofitted
# with the paraphrase lexicons.
display_closestwords_tsnescatterplot(retrofitted_glove_model, 'happy', 50, 10, "Retrofitted Glove Word Vectors - 'Happy'")

## retrofitting_blog_3.py
# Get the retrofitting tool:
#   git clone https://github.com/mfaruqui/retrofitting.git
# Run retrofit.py with arguments that set the word vectors file, the lexicon
# file, the number of iterations and the output word vectors file.
# The word vectors must be in text format.
# E.g.:
#   python retrofit.py -i word_vec_file -l lexicon_file -n num_iter -o out_vec_file
#   python retrofit.py -i /data/glove.6B.50d.txt -l /retrofitting/lexicons/ppdb-xl.txt -n 10 -o retrofittedglove.txt

# Convert the text-based GloVe word vectors to word2vec format.
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file="/data/glove.6B.50d.txt", word2vec_output_file="glove.6B.50d.word2vec.txt")

## retrofitting_blog_2.py
# Method to plot the top no_similar_words in 2D using TSNE
def display_closestwords_tsnescatterplot(model, word, word_vector_dimension, no_similar_words, plot_title):

    arr = np.empty((0,word_vector_dimension), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.similar_by_word(word, topn=no_similar_words)

    # add the vector for each of the closest words to the array
	'''
	Merge/combine courses in the OpenedX OLX format.
	'''

	import sys
	import os
	from distutils.dir_util import copy_tree
	import json

	# Example:
	import h5py
	f = h5py.File('myhdf5file.hdf5')
	dset = f['/data/path']

	import dask.array as da
	x = da.from_array(dset, chunks=(5000, 5000))
	import dask.bag as db
	import json

	records = db.read_text('data/2018--.json').map(json.loads)
	records.filter(lambda d: d['username'] == 'Aneesha').pluck('id').frequencies()
	import dask.dataframe as dd

	df = dd.read_csv('logs/2018-..csv', parse_dates=['timestamp'])
	df.groupby(df.timestamp.dt.hour).value.mean().compute()
	# load the original word vectors and the retrofitted word vectors as separate gensim models

	original_glove_model = gensim.models.KeyedVectors.load_word2vec_format('glove.6B.50d.word2vec.txt', binary=False)
	retrofitted_glove_model = gensim.models.KeyedVectors.load_word2vec_format('retrofittedglove.word2vec.txt', binary=False)

	# display the words closest to 'happy' using the original GLOVE vectors
	display_closestwords_tsnescatterplot(original_glove_model, 'happy', 50, 10, "Original Glove Word Vectors - 'Happy'")
	# display the words closest to 'happy' using the GLOVE vectors retrofitted with the Paraphrase lexicons
	display_closestwords_tsnescatterplot(retrofitted_glove_model, 'happy', 50, 10, "Retroffited Glove Word Vectors - 'Happy'")
	# git clone https://github.com/mfaruqui/retrofitting.git
	# Run retrofit.py with arguments to set the word vectors file, the lexicon file, the number of iterations
	# and the output word vectors. The word vectors must be in text format
	# Eg:
	# python retrofit.py -i word_vec_file -l lexicon_file -n num_iter -o out_vec_file
	# python retrofit.py -i /data/glove.6B.50d.txt -l /retrofitting/lexicons/ppdb-xl.txt -n 10 -o retrofittedglove.txt

	# Convert txt based GLOVE word vectors to Word2Vec format
	from gensim.scripts.glove2word2vec import glove2word2vec
	glove2word2vec(glove_input_file="/data/glove.6B.50d.txt", word2vec_output_file="glove.6B.50d.word2vec.txt")
	# Method to plot the top no_similar_words in 2D using TSNE
	def display_closestwords_tsnescatterplot(model, word, word_vector_dimension, no_similar_words, plot_title):

	arr = np.empty((0,word_vector_dimension), dtype='f')
	word_labels = [word]

	# get close words
	close_words = model.similar_by_word(word, topn=no_similar_words)

	# add the vector for each of the closest words to the array