Skip to content

Instantly share code, notes, and snippets.

@roopalgarg
roopalgarg / gensim_bigram_word_emb.txt
Last active November 20, 2017 23:50
Word Embeddings for bigrams using gensim
from gensim.models import word2vec
bigram_to_search = "hello_there"
def bigram2vec(unigrams, bigram_to_search):
bigrams = Phrases(unigrams)
model = word2vec.Word2Vec(bigrams[unigrams])
if bigram_to_search in model.vocab.keys():
return model[bigram_to_search]
else:
@roopalgarg
roopalgarg / nltk_stanford_nlp.py
Last active May 25, 2017 00:40
using stanford parsers with nltk
import os
from nltk.parse.stanford import StanfordParser, StanfordDependencyParser
# Directories that hold the Stanford tool jars. Every entry must be an
# absolute path: the NER entry previously lacked its leading "/", which made it
# relative to the current working directory. Entries are joined with
# os.pathsep, the platform's separator for path lists (":" on POSIX).
os.environ["CLASSPATH"] = os.pathsep.join([
    "/usr/local/stanford-models/stanford-postagger-full-2016-10-31/",
    "/usr/local/stanford-models/stanford-ner-2016-10-31/",
    "/usr/local/stanford-models/stanford-parser-full-2016-10-31/",
])
# Directories that hold the tagger models and the NER classifiers.
os.environ["STANFORD_MODELS"] = os.pathsep.join([
    "/usr/local/stanford-models/stanford-postagger-full-2016-10-31/models",
    "/usr/local/stanford-models/stanford-ner-2016-10-31/classifiers",
])
# Instantiate the constituency parser and the dependency parser with their
# default arguments.
# NOTE(review): presumably both locate their jars and models through the
# CLASSPATH / STANFORD_MODELS environment variables set above — confirm
# against the NLTK documentation.
stan_parser = StanfordParser()
stan_dep_parser = StanfordDependencyParser()
# Example sentences (not used within the visible part of this snippet).
sents = ["The Mavericks won against the Jets", "Golden State Warriors thrashed LA Lakers"]
@roopalgarg
roopalgarg / nltk twitter corpus
Created May 24, 2017 21:19
access nltk's twitter corpus
from nltk.corpus import twitter_samples
from nltk.twitter.util import json2csv
# List the files that make up the corpus. The return value is discarded here,
# so it is only visible when this line is evaluated interactively.
twitter_samples.fileids()

# Load the tweet texts from the negative-tweets file and print the first 15.
strings = twitter_samples.strings('negative_tweets.json')
for string in strings[:15]:
    print(string)
@roopalgarg
roopalgarg / strip_accent.py
Created May 23, 2017 23:18
stripping unicode or ascii accent
"""
these are functions from within the sklearn module
"""
def strip_accents_unicode(s):
"""Transform accentuated unicode symbols into their simple counterpart
Warning: the python-level loop and join operations make this
implementation 20 times slower than the strip_accents_ascii basic
normalization.
@roopalgarg
roopalgarg / pandas_df_to_excel
Created May 17, 2017 00:31
Pandas: write dataframe to excel sheet
# Write df_test to 'filename.xlsx' using the XlsxWriter engine. The context
# manager saves and closes the workbook on exit, even if to_excel() raises;
# the explicit ExcelWriter.save() call it replaces was deprecated in
# pandas 1.5 and removed in pandas 2.0.
with pd.ExcelWriter('filename.xlsx', engine='xlsxwriter') as writer:
    # Convert the dataframe to an XlsxWriter Excel object.
    df_test.to_excel(writer, sheet_name='Sheet1')
@roopalgarg
roopalgarg / deaccent
Created April 21, 2017 01:09
deaccent in python
def deaccent(text):
"""
Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.
Return input string with accents removed, as unicode.
>>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
u'Sef chomutovskych komunistu dostal postou bily prasek'
"""
@roopalgarg
roopalgarg / glove_load_vectors.py
Last active March 15, 2017 23:52
Loading glove vectors
#load glove vectors
#download them from http://nlp.stanford.edu/data/glove.6B.zip
embeddings_index = {}
GLOVE_DIR = 'glove.6B'
import os
f = open(os.path.join(GLOVE_DIR, 'glove.6B.%id.txt' %EMBEDDING_DIM))
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
@roopalgarg
roopalgarg / tensorflow-debug-gradients.py
Last active July 23, 2019 10:39
tensorflow: debug gradients
with tf.name_scope("train_op"):
    # Every variable currently registered as trainable in the graph.
    trainables = tf.trainable_variables()
    print("trainables", trainables)

    # Per-variable diagnostics: name, static shape, and the gradient of the
    # loss with respect to that single variable, so a variable with a missing
    # or unexpected gradient can be spotted individually.
    for train_obj in trainables:
        print(train_obj.name, train_obj.get_shape())
        print("grad", tf.gradients(self.loss, [train_obj]))

    # Gradients of the loss with respect to all trainable variables at once.
    grads = tf.gradients(self.loss, trainables)
@roopalgarg
roopalgarg / tensorflow-add-noise.py
Created March 11, 2017 08:04
tensorflow: add noise
import tensorflow as tf
import numpy as np
def gaussian_noise_layer(input_layer, std):
    """Return ``input_layer`` with zero-mean Gaussian noise added element-wise.

    Args:
        input_layer: Tensor to perturb. The noise is sampled with the same
            shape, obtained at run time through ``tf.shape``. The noise is
            always float32, so the input is presumably expected to be float32
            as well.
        std: Standard deviation of the noise.

    Returns:
        The sum ``input_layer + noise``.
    """
    noise = tf.random_normal(
        shape=tf.shape(input_layer),
        mean=0.0,
        stddev=std,
        dtype=tf.float32,
    )
    return input_layer + noise
# float32 placeholder named 'input': the second dimension is fixed at 8, the
# first is left unspecified (None).
inp = tf.placeholder(tf.float32, shape=[None, 8], name='input')