arshjat / import1.py
Last active January 1, 2019 10:15
Sentiment Analysis using Deep RNN, GloVe twitter word embeddings and Keras.
import os
import gc
import re
import csv
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Re-delimit the raw file (f is assumed to be its path, defined elsewhere):
# existing semicolons become spaces, and the comma before the trailing 0/1 label
# becomes a semicolon so the file can later be read with sep=';'.
with open(f, "r+") as myfile:
    s = myfile.read()
    ret = re.sub(';', ' ', s)
    ret = re.sub(',1\n', ';1\n', ret)
    ret = re.sub(',0\n', ';0\n', ret)
    myfile.seek(0)
    myfile.write(ret)
    myfile.truncate()
# Load the re-delimited data; drop the spurious column that appears in test.
train = pd.read_csv("../input/question2/train.csv", sep=';')
test = pd.read_csv("../input/question2/test.csv", sep=';', quoting=csv.QUOTE_NONE)
del test['Unnamed: 1']

# Remove @mentions from the tweets.
train['tweet'] = train['tweet'].apply(lambda x: ' '.join(w for w in x.split() if not w.startswith('@')))
test['tweet'] = test['tweet'].apply(lambda x: ' '.join(w for w in x.split() if not w.startswith('@')))
# Targets and tokenisation. full_text is built in 1.5.py below.
y = train['sentiment']

tk = Tokenizer(lower=True, filters='')
tk.fit_on_texts(full_text)
train_tokenized = tk.texts_to_sequences(train['tweet'])
test_tokenized = tk.texts_to_sequences(test['tweet'])

# Pad/truncate every tweet to a fixed length of 50 tokens.
max_len = 50
X_train = pad_sequences(train_tokenized, maxlen=max_len)
X_test = pad_sequences(test_tokenized, maxlen=max_len)
arshjat / 1.5.py
Last active January 1, 2019 10:41
# Corpus used to fit the tokenizer: all train + test tweets, lower-cased, with
# English stop words and punctuation/emoticon noise dropped from each tweet.
noise = {'.', ',', '/', '@', '"', '&amp', '<br />', '+/-', 'zzzzzzzzzzzzzzzzz', ':-D', ':D', ':P', ':)', '!', ';'}
full_text = list(train['tweet'].values) + list(test['tweet'].values)
full_text = [' '.join(w for w in t.lower().split() if w not in stopwords.words('english') and w not in noise) for t in full_text]
embedding_path = "path-to-word-embedding"  # e.g. a GloVe twitter .txt file
embed_size = 200  # dimension of the embedding vectors; must match the GloVe file used (e.g. 200 for glove.twitter.27B.200d.txt)
max_features = 30000

# Each line of a GloVe file is "<word> <v1> <v2> ... <vN>"; build a word -> vector lookup.
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))
# Map each word index in the tokenizer vocabulary to its pretrained GloVe vector;
# words without a pretrained vector stay as all-zero rows.
word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# build_model1 is not defined in the snippets shown above; a hedged sketch follows below.
model = build_model1(lr=1e-3, lr_d=1e-10, units=128, spatial_dr=0.5, kernel_size1=4, kernel_size2=4, dense_units=64, dr=0.2, conv_size=32)
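build_model1 is not included in this excerpt. The sketch below is only a guess at its shape, inferred from the argument names in the call: a frozen GloVe Embedding layer, SpatialDropout1D, a bidirectional GRU, two Conv1D branches that are average- and max-pooled, and a dense head with a sigmoid output for the binary sentiment label. It reuses max_len, nb_words, embed_size, and embedding_matrix from the snippets above; the layer choices are assumptions, not the author's actual architecture.

from keras.layers import (Input, Embedding, SpatialDropout1D, Bidirectional, GRU,
                          Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D,
                          concatenate, Dense, Dropout)
from keras.models import Model
from keras.optimizers import Adam

def build_model1(lr=1e-3, lr_d=0.0, units=128, spatial_dr=0.3, kernel_size1=3,
                 kernel_size2=2, dense_units=64, dr=0.1, conv_size=32):
    # Assumed architecture: frozen GloVe embeddings -> BiGRU -> two Conv1D branches -> pooling -> dense head.
    inp = Input(shape=(max_len,))
    x = Embedding(nb_words + 1, embed_size, weights=[embedding_matrix],
                  input_length=max_len, trainable=False)(inp)
    x = SpatialDropout1D(spatial_dr)(x)
    x = Bidirectional(GRU(units, return_sequences=True))(x)
    c1 = Conv1D(conv_size, kernel_size1, padding='valid', activation='relu')(x)
    c2 = Conv1D(conv_size, kernel_size2, padding='valid', activation='relu')(x)
    x = concatenate([GlobalAveragePooling1D()(c1), GlobalMaxPooling1D()(c1),
                     GlobalAveragePooling1D()(c2), GlobalMaxPooling1D()(c2)])
    x = Dense(dense_units, activation='relu')(x)
    x = Dropout(dr)(x)
    out = Dense(1, activation='sigmoid')(x)  # binary sentiment (0/1)
    model = Model(inputs=inp, outputs=out)
    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=lr, decay=lr_d),
                  metrics=['accuracy'])
    return model

# Example usage (batch size and epoch count are illustrative, not from the gist):
# model.fit(X_train, y, batch_size=128, epochs=3, validation_split=0.1)
# predictions = model.predict(X_test)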