import os
import gc
import csv
import re
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, SpatialDropout1D
from keras.callbacks import ModelCheckpoint, EarlyStopping
# Clean up the raw CSV: replace stray semicolons with spaces, then turn the comma
# before the trailing 0/1 label into a semicolon so the file can be parsed with sep=';'.
with open(f, "r+") as myfile:
    s = myfile.read()
    ret = re.sub(';', ' ', s)
    ret = re.sub(',1\n', ';1\n', ret)   # apply to ret, not s, so the fixes accumulate
    ret = re.sub(',0\n', ';0\n', ret)
    myfile.seek(0)                      # rewrite the file from the start
    myfile.write(ret)
    myfile.truncate()
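# A minimal sketch (not in the original gist) of how the cleanup above could be
# applied to both raw files; the helper name is an assumption and the paths
# simply mirror the read_csv calls below.
def fix_separators(path):
    with open(path, "r+") as fh:
        text = fh.read()
        text = re.sub(';', ' ', text)
        text = re.sub(',1\n', ';1\n', text)
        text = re.sub(',0\n', ';0\n', text)
        fh.seek(0)
        fh.write(text)
        fh.truncate()
# for p in ["../input/question2/train.csv", "../input/question2/test.csv"]:
#     fix_separators(p)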
# Load the cleaned files; the test file gains a spurious 'Unnamed: 1' column, which is dropped.
train = pd.read_csv("../input/question2/train.csv", sep=';')
test = pd.read_csv("../input/question2/test.csv", sep=';', quoting=csv.QUOTE_NONE)
del test['Unnamed: 1']
# Drop @mention tokens from every tweet in both splits.
train['tweet'] = train['tweet'].apply(lambda x: ' '.join(w for w in x.split() if not w.startswith('@')))
test['tweet'] = test['tweet'].apply(lambda x: ' '.join(w for w in x.split() if not w.startswith('@')))
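# Hedged illustration (an addition, not in the original): the same mention filter on a toy tweet.
example = '@user thanks for the follow!'
print(' '.join(w for w in example.split() if not w.startswith('@')))  # -> thanks for the follow!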
# Lowercase each tweet and drop stopword / punctuation / emoticon tokens before fitting the tokenizer.
stop_tokens = set(stopwords.words('english')) | {'.', ',', '/', '@', '"', '&', '<br />', '+/-', 'zzzzzzzzzzzzzzzzz', ':-D', ':D', ':P', ':)', '!', ';'}
full_text = list(train['tweet'].values) + list(test['tweet'].values)
full_text = [' '.join(w for w in tweet.lower().split() if w not in stop_tokens) for tweet in full_text]
y = train['sentiment']
# Fit a Keras Tokenizer on the cleaned corpus, map each tweet to a sequence of
# token ids, and pad/truncate every sequence to max_len tokens.
tk = Tokenizer(lower=True, filters='')
tk.fit_on_texts(full_text)
train_tokenized = tk.texts_to_sequences(train['tweet'])
test_tokenized = tk.texts_to_sequences(test['tweet'])
max_len = 50
X_train = pad_sequences(train_tokenized, maxlen=max_len)
X_test = pad_sequences(test_tokenized, maxlen=max_len)
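# Hedged sanity check (an addition): pad_sequences pre-pads with zeros and
# truncates longer tweets, so every row has exactly max_len token ids.
assert X_train.shape[1] == max_len and X_test.shape[1] == max_len
print(X_train.shape, X_test.shape)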
# Load pretrained word vectors into a dict mapping word -> embedding vector.
embedding_path = "path-to-word-embedding"      # placeholder: path to the embedding file
embed_size = "dimension of embedding vectors"  # placeholder: set to an int matching the file (e.g. 300)
max_features = 30000
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')
embedding_index = dict(get_coefs(*o.strip().split(" ")) for o in open(embedding_path))
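# Hedged alternative loader (an addition, not the author's code): some embedding
# files (e.g. fastText .vec) begin with a "count dim" header line; this variant
# simply skips any row that does not parse as word + float vector.
embedding_index = {}
with open(embedding_path, encoding='utf-8', errors='ignore') as emb_file:
    for line in emb_file:
        parts = line.rstrip().split(' ')
        if len(parts) <= 2:
            continue  # header or empty line
        try:
            embedding_index[parts[0]] = np.asarray(parts[1:], dtype='float32')
        except ValueError:
            continue  # skip rows whose values do not parse as floats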
# Build the embedding matrix: row i holds the pretrained vector for the word with
# tokenizer index i; rows stay zero for words without a pretrained vector.
word_index = tk.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
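# Hedged sanity check (an addition, not in the original): report how much of the
# tokenizer vocabulary is covered by the pretrained vectors.
covered = sum(1 for w, i in word_index.items() if i < max_features and w in embedding_index)
print('embedding coverage: %d / %d words' % (covered, nb_words))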
def build_model1(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    # Checkpoint the best weights on validation loss and stop early after 3 stalled epochs.
    file_path = "best_model.hdf5"
    check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                                  save_best_only=True, mode="min")
    early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)
    inp = Input(shape=(max_len,))
    # Frozen embedding layer sized to match embedding_matrix built above.
    x = Embedding(nb_words + 1, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x1 = SpatialDropout1D(spatial_dr)(x)