import pandas as pd
import numpy as np
import re
from keras.layers import MaxPooling1D, Conv1D, BatchNormalization
from keras.layers import Flatten, Dense, Embedding, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from keras.regularizers import l2
from keras import backend as K
from keras.preprocessing import sequence
# Load the four training videos' comments into one dataframe;
# hold out the Shakira video as a validation set.
train_filenames = ['Youtube01-Psy.csv', 'Youtube02-KatyPerry.csv', 'Youtube03-LMFAO.csv', 'Youtube04-Eminem.csv']
valid_filename = 'Youtube05-Shakira.csv'
train_df = pd.concat([pd.read_csv('data/' + filename, encoding='utf-8-sig') for filename in train_filenames])
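# Each row has at least a CONTENT column (the raw comment text) and a
# CLASS column (binary spam label; here 1 is taken to be spam, 0 ham).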
# Helpers: tokenize comments and map words to integer indexes
def format_phrase(phrase):
    # Replace every non-word character with a space, then split on whitespace.
    words = re.sub(r"[^\w]", " ", phrase).split()
    return [w.lower() for w in words]
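# e.g. format_phrase("Check out my channel!") -> ['check', 'out', 'my', 'channel']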
def get_unique_words(phrases):
    # Summing a Series of lists concatenates them into one flat word list.
    words_list = phrases.sum()
    return np.unique(np.array(words_list))
def words2idxs(phrase):
    # Words never seen in training all map to one shared OOV index.
    oov_idx = len(word2idx) + 1
    return [word2idx.get(word, oov_idx) for word in phrase]
train_df = train_df.assign(CONTENT_WORDS=train_df.CONTENT.apply(format_phrase))
unique_words = get_unique_words(train_df.CONTENT_WORDS)
# Index words from 1 so that 0 can be reserved for padding.
word2idx = {word: idx + 1 for idx, word in enumerate(unique_words)}
# Convert each comment to a list of word indexes and pad every sequence
# to the length of the longest training comment. Pad with 0, not -1:
# an Embedding layer only accepts non-negative indices.
train_df = train_df.assign(CONTENT_IDX=train_df.CONTENT_WORDS.apply(words2idxs))
maxlen = train_df.CONTENT_IDX.map(len).max()
train_content_idx = sequence.pad_sequences(train_df.CONTENT_IDX, maxlen=maxlen, value=0)
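# train_content_idx now has shape (num_train_comments, maxlen), dtype int.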
# Apply the same preprocessing to the validation set.
valid_df = pd.read_csv('data/' + valid_filename, encoding='utf-8-sig')
valid_df = valid_df.assign(CONTENT_WORDS=valid_df.CONTENT.apply(format_phrase))
valid_df = valid_df.assign(CONTENT_IDX=valid_df.CONTENT_WORDS.apply(words2idxs))
valid_content_idx = sequence.pad_sequences(valid_df.CONTENT_IDX, maxlen=maxlen, value=0)
# Create the CNN. Vocabulary size counts the real words plus the
# padding index (0) and the OOV index (len(word2idx) + 1).
vocab_size = len(word2idx) + 2
vgg_model = Sequential([
    Embedding(vocab_size, 32, input_length=maxlen, embeddings_regularizer=l2(1e-4)),
    # Conv block 1
    Conv1D(64, 5, padding='same', activation='relu'),
    Conv1D(64, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.4),
    # Conv block 2
    Conv1D(128, 3, padding='same', activation='relu'),
    Conv1D(128, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Dropout(0.4),
    # FC layers with BatchNorm
    Flatten(),
    Dense(100, activation='relu'),
    Dropout(0.7),
    Dense(100, activation='relu'),
    BatchNormalization(),
    Dropout(0.7),
    Dense(1, activation='sigmoid')])
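# Optional sanity check: print layer output shapes and parameter counts.
vgg_model.summary()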
vgg_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# Manual learning-rate schedule. Assigning to optimizer.lr directly only
# rebinds the Python attribute, so set the underlying variable instead.
# (1e-2, 1e-3, 1e-4 are the original 10e-3, 10e-4, 10e-5 written plainly.)
K.set_value(vgg_model.optimizer.lr, 1e-2)
vgg_model.fit(train_content_idx, train_df.CLASS, validation_data=(valid_content_idx, valid_df.CLASS),
              epochs=10, batch_size=64)
K.set_value(vgg_model.optimizer.lr, 1e-3)
vgg_model.fit(train_content_idx, train_df.CLASS, validation_data=(valid_content_idx, valid_df.CLASS),
              epochs=40, batch_size=64)
K.set_value(vgg_model.optimizer.lr, 1e-4)
vgg_model.fit(train_content_idx, train_df.CLASS, validation_data=(valid_content_idx, valid_df.CLASS),
              epochs=40, batch_size=64)
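# A minimal sketch of scoring the held-out video once training finishes;
# evaluate returns [loss, accuracy] for the metrics compiled above.
val_loss, val_acc = vgg_model.evaluate(valid_content_idx, valid_df.CLASS, batch_size=64)
print('validation loss %.4f, accuracy %.4f' % (val_loss, val_acc))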