Honeypot classifier
"""Honeypot classifier. https://www.kaggle.com/mrklees/applying-keras-scikit-learn-to-titanic"""
import pandas as pd
import numpy as np
import seaborn as sns
from keras import backend as K
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from random import randint
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
FILENAME = '../data/honeypot_dataset.csv'
def encode_one_hot(df, column, axis=1):
    """One-hot encode a categorical column and drop the original.

    :param df: source DataFrame
    :param column: name of the column to encode (also used as the dummy prefix)
    :param axis: axis along which the original column is dropped
    :return: DataFrame with `column` replaced by its dummy columns
    """
    return df.join(pd.get_dummies(df[column], column)).drop(column, axis=axis)
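
# Example (hypothetical frame, for illustration only): encode_one_hot turns a
# categorical column into prefixed dummy columns and drops the original.
#
#   df = pd.DataFrame({'proto': ['udp', 'tcp', 'udp']})
#   encode_one_hot(df, 'proto')
#   # -> columns 'proto_tcp' and 'proto_udp' replace 'proto'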
class HoneypotData(object):
    """Honeypot data pipeline.

    This class contains the entire pipeline from raw data to prepared numpy
    arrays. It is eventually inherited by the model class, but is kept
    distinct for readability and logical organization.
    """
    filepath = '../data/'
    train_fn = 'honeypot_dataset.csv'
    test_fn = 'honeypot_dataset_test.csv'

    def __init__(self):
        """Initialize and run the full preprocessing pipeline."""
        self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc()
    def import_and_split_data(self):
        """Import the data and split it into stratified train/validation sets.

        `stratify=y` makes the split preserve class proportions: if y is a
        binary variable with 25% zeros and 75% ones, each split will likewise
        contain 25% zeros and 75% ones.
        """
        X, y = self.select_features(pd.read_csv(self.filepath + self.train_fn))
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.25, random_state=606, stratify=y)
        return X_train, y_train, X_valid, y_valid
    def select_features(self, data):
        """Select the features we'll use in the model; drop unused features."""
        features = ['ruri',
                    'ruri_user',
                    'ruri_domain',
                    'from_user',
                    'from_domain',
                    'from_tag',
                    'to_user',
                    'contact_user',
                    'callid',
                    'content_type',
                    'user_agent',
                    'source_ip',
                    'source_port',
                    'destination_port',
                    'contact_ip',
                    'contact_port']
        dropped_features = ['ruri', 'ruri_domain', 'callid', 'from_tag', 'content_type', 'from_user']
        target = ['toll_fraud']
        X = data[features].drop(dropped_features, axis=1)
        y = data[target]
        return X, y
    def fix_na(self, data):
        """Fill NAs: 'test' for contact_user, 'application/sdp' for content_type."""
        na_vars = {"contact_user": "test", "content_type": "application/sdp"}
        return data.fillna(na_vars)

    def encode_one_hot(self, df, column, axis=1):
        """One-hot encode `column`; mirrors the module-level helper above."""
        return df.join(pd.get_dummies(df[column], column)).drop(column, axis=axis)
    def create_dummies(self, data, cat_vars, cat_types):
        """Process the categorical columns into binary dummy variables."""
        cat_data = data[cat_vars].values
        for i in range(len(cat_vars)):
            # Binarize the current front column, delete it, and append the
            # resulting binary columns at the back; after len(cat_vars)
            # passes every original column has been replaced.
            bins = LabelBinarizer().fit_transform(cat_data[:, 0].astype(cat_types[i]))
            cat_data = np.delete(cat_data, 0, axis=1)
            cat_data = np.column_stack((cat_data, bins))
        # Caveat: each call refits the binarizers on whatever frame it gets,
        # so train and validation frames can come back with different column
        # counts -- the shape prints in preproc() make this visible.
        return cat_data
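
    # Leakage-safe sketch (our addition, not part of the original gist): fit
    # one LabelBinarizer per column on the training frame only, then reuse
    # the fitted binarizers on validation/test frames so every split yields
    # identical columns. The method names here are ours.
    def fit_binarizers(self, data, cat_vars):
        """Fit one LabelBinarizer per categorical column, keyed by name."""
        return {var: LabelBinarizer().fit(data[var].astype(str)) for var in cat_vars}

    def apply_binarizers(self, data, cat_vars, binarizers):
        """Stack the encodings produced by already-fitted binarizers."""
        return np.column_stack([binarizers[var].transform(data[var].astype(str))
                                for var in cat_vars])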
    def preproc(self):
        """Execute the full preprocessing pipeline."""
        # Import data & split.
        X_train_, y_train, X_valid_, y_valid = self.import_and_split_data()
        # Fill NAs.
        X_train, X_valid = self.fix_na(X_train_), self.fix_na(X_valid_)
        # Preprocess categorical vars.
        cat_vars = ['ruri_user',
                    'from_domain',
                    'to_user',
                    'contact_user',
                    'user_agent',
                    'source_ip',
                    'contact_ip']
        cat_types = ['str'] * len(cat_vars)
        print('Before create_dummies')
        print(X_train.shape[0], X_train.shape[1])
        print(X_valid.shape[0], X_valid.shape[1])
        X_train_cat = self.create_dummies(X_train, cat_vars, cat_types)
        X_valid_cat = self.create_dummies(X_valid, cat_vars, cat_types)
        print('After create_dummies')
        print(X_train_cat.shape[0], X_train_cat.shape[1])
        print(X_valid_cat.shape[0], X_valid_cat.shape[1])
        X_train, X_valid = X_train_cat, X_valid_cat
        print('After assignment')
        print(X_train.shape[0], X_train.shape[1])
        print(X_valid.shape[0], X_valid.shape[1])
        return X_train.astype('float32'), y_train.values, X_valid.astype('float32'), y_valid.values
    def preproc_test(self):
        """Preprocess the held-out test file; return (labels, features)."""
        test = pd.read_csv(self.filepath + self.test_fn)
        labels = test.toll_fraud.values
        test = self.fix_na(test)
        # Preprocess categorical vars. Note: unlike preproc(), this list also
        # includes 'from_user', so the test columns will not line up with the
        # training columns.
        cat_vars = ['ruri_user',
                    'from_user',
                    'from_domain',
                    'to_user',
                    'contact_user',
                    'user_agent',
                    'source_ip',
                    'contact_ip']
        cat_types = ['str'] * len(cat_vars)
        test_cat = self.create_dummies(test, cat_vars, cat_types)
        # Recombine.
        test = test_cat
        return labels, test
class HoneypotKeras(HoneypotData):
    def __init__(self):
        self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc()
        self.y_train, self.y_valid = to_categorical(self.y_train), to_categorical(self.y_valid)
        print('To categorical')
        print(self.X_train.shape[0], self.X_train.shape[1])
        print(self.y_train.shape[0], self.y_train.shape[1])
        print(self.X_valid.shape[0], self.X_valid.shape[1])
        print(self.y_valid.shape[0], self.y_valid.shape[1])
        self.feature_count = self.X_train.shape[1]
        self.history = []
    def build_model(self):
        """Build a funnel-shaped dense network with progressively heavier dropout."""
        model = Sequential()
        model.add(Dense(2056, input_shape=(self.feature_count,), activation='relu'))
        model.add(Dropout(0.1))
        model.add(Dense(1028, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(1028, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(512, activation='relu'))
        model.add(Dropout(0.4))
        # Two sigmoid outputs paired with binary_crossentropy, matching the
        # to_categorical labels built in __init__.
        model.add(Dense(2, activation='sigmoid'))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        self.model = model
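
    # Alternative head (a sketch, our suggestion rather than the gist's): for
    # two mutually exclusive classes encoded with to_categorical, the more
    # conventional pairing is softmax with categorical_crossentropy:
    #
    #   model.add(Dense(2, activation='softmax'))
    #   model.compile(optimizer='adam',
    #                 loss='categorical_crossentropy',
    #                 metrics=['accuracy'])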
    def fit(self, lr=0.001, epochs=1):
        # Assigning a float to model.optimizer.lr does not update the compiled
        # learning rate; K.set_value changes the underlying variable.
        K.set_value(self.model.optimizer.lr, lr)
        print('Training data:')
        print(self.X_train.shape[0], self.X_train.shape[1])
        print(self.y_train.shape[0], self.y_train.shape[1])
        print(self.X_valid.shape[0], self.X_valid.shape[1])
        print(self.y_valid.shape[0], self.y_valid.shape[1])
        hist = self.model.fit(self.X_train, self.y_train,
                              batch_size=32, epochs=epochs,
                              verbose=1, validation_data=(self.X_valid, self.y_valid))
        self.history.append(hist)
    def prepare_submission(self, name):
        """Predict on the test set and write '<name>_submission.csv'."""
        labels, test_data = self.preproc_test()
        predictions = self.model.predict(test_data)
        print(predictions)
        # Filename convention assumed; the original gist only printed the
        # predictions and left `name` unused.
        pd.DataFrame(predictions).to_csv('{}_submission.csv'.format(name), index=False)
model = HoneypotKeras()
# model.build_model()
# model.fit(lr=0.01, epochs=5)
# model.fit(lr=0.001, epochs=10)
# model.prepare_submission('keras')
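
# Evaluation sketch (our addition): preproc_test() returns the held-out labels
# that prepare_submission() ignores, so accuracy can be checked directly. This
# assumes the test columns line up with training (see the create_dummies caveat).
# model.build_model()
# model.fit(lr=0.001, epochs=10)
# labels, test_data = model.preproc_test()
# preds = model.model.predict(test_data).argmax(axis=1)
# print('Held-out accuracy:', (preds == labels).mean())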