import pandas as pd
import numpy as np
from collections import Counter
import itertools
import re
from keras.layers import MaxPooling1D, Conv1D, BatchNormalization
from keras.layers import Flatten, Dense, Embedding, Dropout, Dense, SpatialDropout1D
from keras.models import Sequential
from keras.optimizers import Adam, Adamax, RMSprop, SGD
from keras.regularizers import l2
from keras import backend as K
from keras.preprocessing import sequence
# Load the training comments into a single dataframe.
# Every training CSV is read from the data/ directory and the frames are stacked.
train_filenames = [
    'Youtube01-Psy.csv',
    'Youtube02-KatyPerry.csv',
    'Youtube03-LMFAO.csv',
    'Youtube04-Eminem.csv',
]
# Validation CSV; it is loaded further down, once the training vocabulary exists.
valid_filename = 'Youtube05-Shakira.csv'
train_df = pd.concat(
    [
        pd.read_csv('data/' + csv_name, encoding='utf-8-sig')
        for csv_name in train_filenames
    ]
)
# Helpers: tokenise comments, collect the vocabulary, map words to indexes
def format_phrase(phrase):
    """Split a comment into lowercase word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscore); every other character, including whitespace and
    punctuation, acts as a separator and is discarded.

    Args:
        phrase: the raw comment text (a ``str``).

    Returns:
        A list of lowercase tokens in their order of appearance; an empty
        list when the phrase contains no word characters.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return [word.lower() for word in re.findall(r"\w+", phrase)]
def get_unique_words(phrases):
    """Return the distinct words that occur in a collection of phrases.

    Args:
        phrases: an iterable (e.g. a pandas Series) whose items are lists
            of word tokens.

    Returns:
        A NumPy array holding each distinct word once, in the sorted order
        produced by ``np.unique``. An empty input yields an empty array.
    """
    # Flatten in a single linear pass. Summing a Series of lists copies the
    # growing list once per row (quadratic) and returns 0 for an empty Series.
    words_list = list(itertools.chain.from_iterable(phrases))
    return np.unique(np.array(words_list))
def words2idxs(phrase, vocab=None):
    """Map each word of a tokenised phrase to its integer index.

    Words that are missing from the vocabulary all share one
    out-of-vocabulary index equal to ``len(vocab)``.

    Args:
        phrase: an iterable of word tokens.
        vocab: optional word -> index mapping. When omitted, the
            module-level ``word2idx`` dictionary is used, so the function
            can still be passed directly to ``Series.apply``.

    Returns:
        A list of integer indexes, one per word, in the same order.
    """
    if vocab is None:
        vocab = word2idx
    unknown_idx = len(vocab)
    # dict.get performs a single lookup instead of a membership test
    # followed by an indexing operation.
    return [vocab.get(word, unknown_idx) for word in phrase]
# Tokenise the training comments and build the vocabulary from them only.
train_df = train_df.assign(CONTENT_WORDS=train_df.CONTENT.apply(format_phrase))
unique_words = get_unique_words(train_df.CONTENT_WORDS)
word2idx = {word: idx for idx, word in enumerate(unique_words)}
# convert each comment to a list of word indexes
train_df = train_df.assign(CONTENT_IDX=train_df.CONTENT_WORDS.apply(words2idxs))
# NOTE(review): the right-hand side of this assignment was missing from the
# file. The length of the longest training comment is used as a stand-in so
# that no training comment gets truncated -- confirm or replace with the
# intended fixed length.
maxlen = int(train_df.CONTENT_IDX.apply(len).max())
# Pad with the out-of-vocabulary index (len(word2idx)) rather than -1:
# the padded sequences are later used as embedding indexes, and a negative
# index is not a valid embedding row. The same value is used for the
# training and validation sets so both are encoded identically.
pad_idx = len(word2idx)
train_content_idx = sequence.pad_sequences(train_df.CONTENT_IDX, maxlen=maxlen, value=pad_idx)
# apply the same preprocessing to the validation set (vocabulary is NOT rebuilt)
valid_df = pd.read_csv('data/' + valid_filename, encoding='utf-8-sig')
valid_df = valid_df.assign(CONTENT_WORDS=valid_df.CONTENT.apply(format_phrase))
valid_df = valid_df.assign(CONTENT_IDX=valid_df.CONTENT_WORDS.apply(words2idxs))
valid_content_idx = sequence.pad_sequences(valid_df.CONTENT_IDX, maxlen=maxlen, value=pad_idx)
# Create the CNN.
# One embedding row per known word plus one extra row: words2idxs maps every
# unknown word to index len(word2idx).
vocab_size = len(word2idx) + 1
vgg_model = Sequential([
    Embedding(vocab_size, 32, input_length=maxlen, embeddings_regularizer=l2(1e-4)),
    # Conv Block 1
    Conv1D(64, 5, padding='same', activation='relu'),
    Conv1D(64, 3, padding='same', activation='relu'),
    # Conv Block 2
    Conv1D(128, 3, padding='same', activation='relu'),
    Conv1D(128, 3, padding='same', activation='relu'),
    # Collapse the (timesteps, channels) output of the conv stack into one
    # vector per comment. Without this the Dense layers act on every
    # timestep separately and the model emits one prediction per timestep
    # instead of one per comment.
    Flatten(),
    # FC layers
    Dense(100, activation='relu'),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])
vgg_model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])
# Train in three stages with a decreasing learning rate.
# NOTE(review): these statements were fused/truncated in the file; only the
# values (10e-3, 10e-4, 10e-5), the epoch counts and the remaining fit
# arguments survived. The learning-rate target and the first fit argument
# (train_content_idx) are reconstructed -- confirm against the original.
# Reminder: 10e-3 == 0.01, 10e-4 == 0.001, 10e-5 == 0.0001.
for learning_rate, n_epochs in [(10e-3, 10), (10e-4, 40), (10e-5, 40)]:
    # K.set_value updates the optimizer's existing learning-rate variable
    # in place, so the change applies to the already-compiled model.
    K.set_value(vgg_model.optimizer.lr, learning_rate)
    vgg_model.fit(train_content_idx, train_df.CLASS,
                  validation_data=(valid_content_idx, valid_df.CLASS),
                  epochs=n_epochs, batch_size=64)
