Skip to content

Instantly share code, notes, and snippets.

@hhl60492
Created July 8, 2017 00:27
Show Gist options
  • Save hhl60492/176359706b5534604f7215e6f63a0a10 to your computer and use it in GitHub Desktop.
Save hhl60492/176359706b5534604f7215e6f63a0a10 to your computer and use it in GitHub Desktop.
from process import *
import pandas as pd
import glob
import numpy as np
from keras.models import Sequential
from keras.layers import Conv2D, Flatten, MaxPool2D, Dense, Dropout
from random import shuffle
# Sequence length passed to convert_to_sequence and used as the first
# dimension of the model's input shape.
MAX_CHAR_LEN = 140
# Fraction of the loaded rows assigned to the training set; the rest is
# held out for testing.
TRAIN_RATIO = 0.9
# Number of units in the fully connected layer that follows the conv stack.
DENSE_NUM = 256
# Number of passes the training loop makes over the training set.
EPOCHS = 5
### read every CSV in the data directory and concatenate them into one df
path = r'C:\spam'  # directory containing the csv files
# glob returns matches in arbitrary, filesystem-dependent order. The rows are
# later split into train/test by position, so sort to make runs reproducible.
allFiles = sorted(glob.glob(path + "/*.csv"))
if not allFiles:
    # Without this check pd.concat fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files found in " + path)
list_ = [pd.read_csv(file_, index_col=None, header=0) for file_ in allFiles]
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
frame = pd.concat(list_, ignore_index=True)
### split into training and test sets (hold-out validation)
# Rows are split by position: the first TRAIN_RATIO share of the frame is used
# for training and the remaining rows are held out for testing.
stop = int(TRAIN_RATIO * len(frame))
# Each sample is a [content, class] pair; select both columns once instead of
# doing a per-row iloc lookup.
samples = frame[['CONTENT', 'CLASS']].values.tolist()
train_set = samples[:stop]
# The held-out rows start at `stop`. Indexing from 0 here would re-use rows
# that are already in the training set and make the test accuracy meaningless.
test_set = samples[stop:]
# Normalise the text of both sets with the helpers from `process`:
# first lower-case every string ...
train_set, test_set = to_lower(train_set), to_lower(test_set)
# ... then strip punctuation from the lower-cased strings.
train_set, test_set = remove_punc(train_set), remove_punc(test_set)
### build the nnet model
# Two convolutional stages (each: two 3x3 conv layers with 32 filters, 2x2 max
# pooling, 25% dropout), followed by a dense layer of DENSE_NUM sigmoid units,
# 50% dropout and a single sigmoid output unit.
# NOTE(review): the 32 in the input shape is presumably the per-character
# encoding width produced by convert_to_sequence -- confirm against `process`.
model = Sequential([
    # first convolutional stage
    Conv2D(32, (3, 3), activation='relu', input_shape=(MAX_CHAR_LEN, 32, 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
    # second convolutional stage
    Conv2D(32, (3, 3), activation='relu'),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),
    # classifier head
    Flatten(),
    Dense(DENSE_NUM, activation='sigmoid'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='mean_squared_error',
    optimizer='rmsprop',
    metrics=['accuracy'],
)
### training and validation loop
# NOTE(review): the indentation of this section was lost in the source. The
# validation pass is assumed to run once at the end of every epoch -- confirm.


def _encode(text):
    """Return *text* encoded as a single-sample input batch for the model."""
    encoded = convert_to_sequence(text, MAX_CHAR_LEN)
    return np.reshape(encoded, (1, MAX_CHAR_LEN, 32, 1))


for i in range(EPOCHS):
    # Present the training samples in a fresh random order each epoch.
    shuffle(train_set)
    for j, sample in enumerate(train_set):
        seq_in = _encode(sample[0])
        resp_in = np.reshape(np.asarray(sample[1]), (1, 1))
        # The model is fitted on one sample at a time (batch of size 1).
        model.fit(seq_in, resp_in, batch_size=1, epochs=1, shuffle=False)
        print("Epoch " + str(i) + " Iteration " + str(j))

    # do validation
    correct_class = 0
    for sample in test_set:
        resp_in = np.asarray(sample[1])
        pred = model.predict(_encode(sample[0]), batch_size=1)[0][0]
        print(str(resp_in) + "|" + str(pred) + " " + str(sample[0]))
        # Threshold the sigmoid output at 0.5 to obtain the predicted class.
        pred = 0 if pred < 0.5 else 1
        if pred == resp_in:
            correct_class += 1

    num_test = len(test_set)
    if num_test:
        # Convert the numerator before dividing so the ratio is not truncated
        # by integer division on Python 2.
        print("Test classification accuracy: " + str(float(correct_class) / num_test))
    else:
        # Avoid a ZeroDivisionError when no rows were held out.
        print("Test classification accuracy: n/a (test set is empty)")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment