/honeypot_predictor.py

## honeypot_predictor.py
"""Honeypot classifier. https://www.kaggle.com/mrklees/applying-keras-scikit-learn-to-titanic"""

import pandas as pd
import numpy as np
import seaborn as sns

from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout

from sklearn.model_selection import train_test_split

from random import randint
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler

# Path of the training CSV.
# NOTE(review): nothing in the visible source reads this constant;
# HoneypotData builds the same path from its ``filepath`` and ``train_fn``
# attributes. Confirm whether it is still needed.
FILENAME = '../data/honeypot_dataset.csv'


def encode_one_hot(df, column, axis=1):
    """Replace a categorical column with its one-hot indicator columns.

    :param df: source DataFrame; it is not modified.
    :param column: name of the column to encode. The indicator columns are
        named ``<column>_<value>``.
    :param axis: axis handed to ``DataFrame.drop`` when the original column
        is removed (the default ``1`` drops it as a column).
    :return: new DataFrame with the indicator columns joined on the index
        and the original column dropped.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    widened = df.join(indicators)
    return widened.drop(column, axis=axis)


class HoneypotData(object):
    """Honeypot data pipeline.

    Holds every step from the raw CSV files to the prepared NumPy arrays.
    It is eventually inherited by the model class, but is kept distinct for
    readability and logical organization.
    """
    filepath = '../data/'
    train_fn = 'honeypot_dataset.csv'
    test_fn = 'honeypot_dataset_test.csv'

    # Categorical columns that get encoded, and the dtype each one is cast to
    # before encoding. preproc() and preproc_test() share them so that the
    # training, validation and test matrices are built from the same columns.
    cat_vars = ('ruri_user',
                'from_domain',
                'to_user',
                'contact_user',
                'user_agent',
                'source_ip',
                'contact_ip')
    cat_types = ('str',) * len(cat_vars)

    def __init__(self):
        """Run the whole preprocessing pipeline and keep the resulting arrays."""
        self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc()

    def import_and_split_data(self):
        """Load the training CSV and split it into training and validation sets.

        The split is stratified on the target: ``stratify=y`` keeps the class
        proportions of ``y`` in both parts. For example, if 25% of the rows
        are 0 and 75% are 1, each part gets 25% zeros and 75% ones.

        :return: ``(X_train, y_train, X_valid, y_valid)``; 25% of the rows go
            to the validation part.
        """
        X, y = self.select_features(pd.read_csv(self.filepath + self.train_fn))
        X_train, X_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.25, random_state=606, stratify=y)
        return X_train, y_train, X_valid, y_valid

    def select_features(self, data):
        """Split ``data`` into the retained feature columns and the target.

        :param data: DataFrame holding every column listed in ``features``
            below plus ``toll_fraud``.
        :return: ``(X, y)`` where ``X`` holds the features that are not in
            ``dropped_features`` and ``y`` is a one-column DataFrame with
            ``toll_fraud``.
        """
        features = ['ruri',
                    'ruri_user',
                    'ruri_domain',
                    'from_user',
                    'from_domain',
                    'from_tag',
                    'to_user',
                    'contact_user',
                    'callid',
                    'content_type',
                    'user_agent',
                    'source_ip',
                    'source_port',
                    'destination_port',
                    'contact_ip',
                    'contact_port']
        dropped_features = ['ruri', 'ruri_domain', 'callid', 'from_tag', 'content_type', 'from_user']
        target = ['toll_fraud']
        X = data[features].drop(dropped_features, axis=1)
        y = data[target]
        return X, y

    def fix_na(self, data):
        """Fill missing values in ``contact_user`` and ``content_type``.

        ``contact_user`` gets ``"test"`` and ``content_type`` gets
        ``"application/sdp"``. A new DataFrame is returned.
        """
        na_vars = {"contact_user": "test", "content_type": "application/sdp"}
        return data.fillna(na_vars)

    def encode_one_hot(self, df, column, axis=1):
        """Return ``df`` with ``column`` replaced by its one-hot indicator columns.

        The indicator columns are named ``<column>_<value>``; ``axis`` is
        passed to ``DataFrame.drop`` when the original column is removed.
        """
        return df.join(pd.get_dummies(df[column], column)).drop(column, axis=axis)

    @staticmethod
    def _typed_columns(data, cat_vars, cat_types):
        """Return each categorical column as a 1-D array cast to its dtype."""
        raw = data[list(cat_vars)].values
        return [column.astype(cat_types[i]) for i, column in enumerate(raw.T)]

    @staticmethod
    def _indicator_block(values, known):
        """Encode ``values`` as 0/1 indicator columns against ``known``.

        Values that are not in ``known`` produce an all-zero row, so the
        width depends only on ``known``. The width follows the convention of
        ``LabelBinarizer``, which this pipeline used before: one column per
        category for three or more categories, a single column flagging the
        second category for two, and a single all-zero column for one.
        """
        known = np.asarray(known)
        hits = (values[:, np.newaxis] == known[np.newaxis, :]).astype(int)
        if len(known) == 1:
            return np.zeros((len(values), 1), dtype=int)
        if len(known) == 2:
            return hits[:, 1:]
        return hits

    @staticmethod
    def _report_shapes(title, *tables):
        """Print ``title`` and then the row and column count of each table."""
        # Single-argument print calls behave the same on Python 2 and 3.
        print(title)
        for table in tables:
            print('%d %d' % (table.shape[0], table.shape[1]))

    def learn_categories(self, data, cat_vars, cat_types):
        """Collect the sorted distinct values of every categorical column.

        :param data: DataFrame holding the columns named in ``cat_vars``.
        :param cat_vars: names of the categorical columns.
        :param cat_types: dtype each column is cast to, one per name.
        :return: list with one sorted array of distinct values per column,
            suitable for the ``categories`` argument of create_dummies().
        """
        return [np.unique(column)
                for column in self._typed_columns(data, cat_vars, cat_types)]

    def create_dummies(self, data, cat_vars, cat_types, categories=None):
        """Process categorical columns into dummy (indicator) variables.

        :param data: DataFrame holding the columns named in ``cat_vars``.
        :param cat_vars: names of the categorical columns, in output order.
        :param cat_types: dtype each column is cast to, one per name.
        :param categories: optional list with one array of known values per
            column, as returned by learn_categories(). When omitted, the
            values are learned from ``data`` itself. Pass the categories
            learned on the training data when encoding any other data set,
            otherwise the resulting matrices do not share a column layout.
        :return: 2-D integer array with the indicator blocks of the columns
            side by side, in the order of ``cat_vars``.
        """
        columns = self._typed_columns(data, cat_vars, cat_types)
        if categories is None:
            categories = [np.unique(column) for column in columns]
        blocks = [self._indicator_block(column, known)
                  for column, known in zip(columns, categories)]
        if not blocks:
            # No categorical columns requested: keep the row count, no columns.
            return np.empty((len(data), 0), dtype=int)
        return np.column_stack(blocks)

    def preproc(self):
        """Execute the full preprocessing pipeline on the training CSV.

        Only the categorical columns are encoded and returned; the port
        columns kept by select_features() are not part of the result.

        :return: ``(X_train, y_train, X_valid, y_valid)`` with the feature
            matrices cast to ``float32``.
        """
        # Import Data & Split.
        X_train_raw, y_train, X_valid_raw, y_valid = self.import_and_split_data()
        # Fill NAs.
        X_train, X_valid = self.fix_na(X_train_raw), self.fix_na(X_valid_raw)
        cat_vars, cat_types = list(self.cat_vars), list(self.cat_types)
        self._report_shapes('Before create_dummies', X_train, X_valid)

        # Learn the category values on the training part only and reuse them,
        # so both matrices end up with identical columns.
        self.categories_ = self.learn_categories(X_train, cat_vars, cat_types)
        X_train_cat = self.create_dummies(X_train, cat_vars, cat_types, self.categories_)
        X_valid_cat = self.create_dummies(X_valid, cat_vars, cat_types, self.categories_)
        self._report_shapes('After create_dummies', X_train_cat, X_valid_cat)

        X_train, X_valid = X_train_cat, X_valid_cat
        self._report_shapes('After assignment', X_train, X_valid)

        return X_train.astype('float32'), y_train.values, X_valid.astype('float32'), y_valid.values

    def preproc_test(self):
        """Load the test CSV and encode it like the training data.

        Uses the same categorical columns as preproc() and, when preproc()
        has already run on this instance, the categories learned there.

        :return: ``(labels, test)`` where ``labels`` holds the ``toll_fraud``
            values and ``test`` is the ``float32`` feature matrix.
        """
        test = pd.read_csv(self.filepath + self.test_fn)
        labels = test.toll_fraud.values
        test = self.fix_na(test)
        # categories_ only exists once preproc() has run; without it the
        # categories are learned from the test data itself.
        categories = getattr(self, 'categories_', None)
        test_cat = self.create_dummies(test, list(self.cat_vars), list(self.cat_types), categories)
        return labels, test_cat.astype('float32')


class HoneypotKeras(HoneypotData):
    """Keras feed-forward classifier trained on the HoneypotData arrays."""

    def __init__(self):
        """Run the data pipeline and one-hot encode the target arrays."""
        # The parent initializer runs preproc() and stores the four arrays.
        super(HoneypotKeras, self).__init__()
        self.y_train, self.y_valid = to_categorical(self.y_train), to_categorical(self.y_valid)
        print('To categorical')
        self._print_shapes()
        self.feature_count = self.X_train.shape[1]
        self.history = []
        # Built by build_model(); fit() builds it on demand when still unset.
        self.model = None

    def _print_shapes(self):
        """Print rows and columns of the training and validation arrays."""
        # '%d %d' formatting prints the same text on Python 2 and Python 3.
        for array in (self.X_train, self.y_train, self.X_valid, self.y_valid):
            print('%d %d' % (array.shape[0], array.shape[1]))

    def build_model(self):
        """Build and compile the network and store it in ``self.model``.

        The network is a stack of ReLU dense layers, each followed by
        dropout, ending in a two-unit sigmoid layer. It is compiled with the
        Adam optimizer, binary cross-entropy loss and the accuracy metric.
        """
        model = Sequential()
        model.add(Dense(2056, input_shape=(self.feature_count,), activation='relu'))
        model.add(Dropout(0.1))
        for units, rate in ((1028, 0.2), (1028, 0.3), (512, 0.4)):
            model.add(Dense(units, activation='relu'))
            model.add(Dropout(rate))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        self.model = model

    def fit(self, lr=0.001, epochs=1):
        """Train the model and append the Keras history object to ``self.history``.

        :param lr: learning rate assigned to the optimizer before training.
        :param epochs: number of epochs for this call.
        """
        if getattr(self, 'model', None) is None:
            # Calling fit() before build_model() used to raise AttributeError.
            self.build_model()
        # NOTE(review): whether assigning ``optimizer.lr`` takes effect after
        # the first training run depends on the Keras version - confirm.
        self.model.optimizer.lr = lr
        print('Training data:')
        self._print_shapes()

        hist = self.model.fit(self.X_train, self.y_train,
                              batch_size=32, epochs=epochs,
                              verbose=1, validation_data=(self.X_valid, self.y_valid),
                              )
        self.history.append(hist)

    def prepare_submission(self, name):
        """Predict on the test CSV, print the predictions and return them.

        :param name: submission label; not used by the current implementation.
        :return: the array returned by ``self.model.predict``.
        """
        _, test_data = self.preproc_test()
        predictions = self.model.predict(test_data)
        print(predictions)
        return predictions


# Instantiating the model runs the full preprocessing pipeline right away,
# because HoneypotKeras.__init__ calls preproc(). This happens whenever the
# module is imported: there is no ``if __name__ == '__main__'`` guard.
model = HoneypotKeras()
# Model building, training and prediction steps, currently disabled:
# model.build_model()
# model.fit(lr=0.01, epochs=5)
# model.fit(lr=0.001, epochs=10)
# model.prepare_submission('keras')
	"""Honeypot classifier. https://www.kaggle.com/mrklees/applying-keras-scikit-learn-to-titanic"""

	import pandas as pd
	import numpy as np
	import seaborn as sns

	from keras.utils import to_categorical
	from keras.models import Sequential
	from keras.layers import Dense, Dropout

	from sklearn.model_selection import train_test_split

	from random import randint
	from sklearn.neighbors import KNeighborsRegressor
	from sklearn.neural_network import MLPClassifier
	from sklearn.ensemble import VotingClassifier, BaggingClassifier
	from sklearn.model_selection import GridSearchCV
	from sklearn.preprocessing import LabelBinarizer, StandardScaler

	# Path of the training CSV.
	# NOTE(review): nothing in the visible source reads this constant;
	# HoneypotData builds the same path from its ``filepath`` and ``train_fn``
	# attributes. Confirm whether it is still needed.
	FILENAME = '../data/honeypot_dataset.csv'


	def encode_one_hot(df, column, axis=1):
	    """Replace a categorical column with its one-hot indicator columns.

	    :param df: source DataFrame; it is not modified.
	    :param column: name of the column to encode. The indicator columns are
	        named ``<column>_<value>``.
	    :param axis: axis handed to ``DataFrame.drop`` when the original column
	        is removed (the default ``1`` drops it as a column).
	    :return: new DataFrame with the indicator columns joined on the index
	        and the original column dropped.
	    """
	    indicators = pd.get_dummies(df[column], prefix=column)
	    widened = df.join(indicators)
	    return widened.drop(column, axis=axis)


	class HoneypotData(object):
	    """Honeypot data pipeline.

	    Holds every step from the raw CSV files to the prepared NumPy arrays.
	    It is eventually inherited by the model class, but is kept distinct for
	    readability and logical organization.
	    """
	    filepath = '../data/'
	    train_fn = 'honeypot_dataset.csv'
	    test_fn = 'honeypot_dataset_test.csv'

	    # Categorical columns that get encoded, and the dtype each one is cast to
	    # before encoding. preproc() and preproc_test() share them so that the
	    # training, validation and test matrices are built from the same columns.
	    cat_vars = ('ruri_user',
	                'from_domain',
	                'to_user',
	                'contact_user',
	                'user_agent',
	                'source_ip',
	                'contact_ip')
	    cat_types = ('str',) * len(cat_vars)

	    def __init__(self):
	        """Run the whole preprocessing pipeline and keep the resulting arrays."""
	        self.X_train, self.y_train, self.X_valid, self.y_valid = self.preproc()

	    def import_and_split_data(self):
	        """Load the training CSV and split it into training and validation sets.

	        The split is stratified on the target: ``stratify=y`` keeps the class
	        proportions of ``y`` in both parts. For example, if 25% of the rows
	        are 0 and 75% are 1, each part gets 25% zeros and 75% ones.

	        :return: ``(X_train, y_train, X_valid, y_valid)``; 25% of the rows go
	            to the validation part.
	        """
	        X, y = self.select_features(pd.read_csv(self.filepath + self.train_fn))
	        X_train, X_valid, y_train, y_valid = train_test_split(
	            X, y, test_size=0.25, random_state=606, stratify=y)
	        return X_train, y_train, X_valid, y_valid

	    def select_features(self, data):
	        """Split ``data`` into the retained feature columns and the target.

	        :param data: DataFrame holding every column listed in ``features``
	            below plus ``toll_fraud``.
	        :return: ``(X, y)`` where ``X`` holds the features that are not in
	            ``dropped_features`` and ``y`` is a one-column DataFrame with
	            ``toll_fraud``.
	        """
	        features = ['ruri',
	                    'ruri_user',
	                    'ruri_domain',
	                    'from_user',
	                    'from_domain',
	                    'from_tag',
	                    'to_user',
	                    'contact_user',
	                    'callid',
	                    'content_type',
	                    'user_agent',
	                    'source_ip',
	                    'source_port',
	                    'destination_port',
	                    'contact_ip',
	                    'contact_port']
	        dropped_features = ['ruri', 'ruri_domain', 'callid', 'from_tag', 'content_type', 'from_user']
	        target = ['toll_fraud']
	        X = data[features].drop(dropped_features, axis=1)
	        y = data[target]
	        return X, y

	    def fix_na(self, data):
	        """Fill missing values in ``contact_user`` and ``content_type``.

	        ``contact_user`` gets ``"test"`` and ``content_type`` gets
	        ``"application/sdp"``. A new DataFrame is returned.
	        """
	        na_vars = {"contact_user": "test", "content_type": "application/sdp"}
	        return data.fillna(na_vars)

	    def encode_one_hot(self, df, column, axis=1):
	        """Return ``df`` with ``column`` replaced by its one-hot indicator columns.

	        The indicator columns are named ``<column>_<value>``; ``axis`` is
	        passed to ``DataFrame.drop`` when the original column is removed.
	        """
	        return df.join(pd.get_dummies(df[column], column)).drop(column, axis=axis)

	    @staticmethod
	    def _typed_columns(data, cat_vars, cat_types):
	        """Return each categorical column as a 1-D array cast to its dtype."""
	        raw = data[list(cat_vars)].values
	        return [column.astype(cat_types[i]) for i, column in enumerate(raw.T)]

	    @staticmethod
	    def _indicator_block(values, known):
	        """Encode ``values`` as 0/1 indicator columns against ``known``.

	        Values that are not in ``known`` produce an all-zero row, so the
	        width depends only on ``known``. The width follows the convention of
	        ``LabelBinarizer``, which this pipeline used before: one column per
	        category for three or more categories, a single column flagging the
	        second category for two, and a single all-zero column for one.
	        """
	        known = np.asarray(known)
	        hits = (values[:, np.newaxis] == known[np.newaxis, :]).astype(int)
	        if len(known) == 1:
	            return np.zeros((len(values), 1), dtype=int)
	        if len(known) == 2:
	            return hits[:, 1:]
	        return hits

	    @staticmethod
	    def _report_shapes(title, *tables):
	        """Print ``title`` and then the row and column count of each table."""
	        # Single-argument print calls behave the same on Python 2 and 3.
	        print(title)
	        for table in tables:
	            print('%d %d' % (table.shape[0], table.shape[1]))

	    def learn_categories(self, data, cat_vars, cat_types):
	        """Collect the sorted distinct values of every categorical column.

	        :param data: DataFrame holding the columns named in ``cat_vars``.
	        :param cat_vars: names of the categorical columns.
	        :param cat_types: dtype each column is cast to, one per name.
	        :return: list with one sorted array of distinct values per column,
	            suitable for the ``categories`` argument of create_dummies().
	        """
	        return [np.unique(column)
	                for column in self._typed_columns(data, cat_vars, cat_types)]

	    def create_dummies(self, data, cat_vars, cat_types, categories=None):
	        """Process categorical columns into dummy (indicator) variables.

	        :param data: DataFrame holding the columns named in ``cat_vars``.
	        :param cat_vars: names of the categorical columns, in output order.
	        :param cat_types: dtype each column is cast to, one per name.
	        :param categories: optional list with one array of known values per
	            column, as returned by learn_categories(). When omitted, the
	            values are learned from ``data`` itself. Pass the categories
	            learned on the training data when encoding any other data set,
	            otherwise the resulting matrices do not share a column layout.
	        :return: 2-D integer array with the indicator blocks of the columns
	            side by side, in the order of ``cat_vars``.
	        """
	        columns = self._typed_columns(data, cat_vars, cat_types)
	        if categories is None:
	            categories = [np.unique(column) for column in columns]
	        blocks = [self._indicator_block(column, known)
	                  for column, known in zip(columns, categories)]
	        if not blocks:
	            # No categorical columns requested: keep the row count, no columns.
	            return np.empty((len(data), 0), dtype=int)
	        return np.column_stack(blocks)

	    def preproc(self):
	        """Execute the full preprocessing pipeline on the training CSV.

	        Only the categorical columns are encoded and returned; the port
	        columns kept by select_features() are not part of the result.

	        :return: ``(X_train, y_train, X_valid, y_valid)`` with the feature
	            matrices cast to ``float32``.
	        """
	        # Import Data & Split.
	        X_train_raw, y_train, X_valid_raw, y_valid = self.import_and_split_data()
	        # Fill NAs.
	        X_train, X_valid = self.fix_na(X_train_raw), self.fix_na(X_valid_raw)
	        cat_vars, cat_types = list(self.cat_vars), list(self.cat_types)
	        self._report_shapes('Before create_dummies', X_train, X_valid)

	        # Learn the category values on the training part only and reuse them,
	        # so both matrices end up with identical columns.
	        self.categories_ = self.learn_categories(X_train, cat_vars, cat_types)
	        X_train_cat = self.create_dummies(X_train, cat_vars, cat_types, self.categories_)
	        X_valid_cat = self.create_dummies(X_valid, cat_vars, cat_types, self.categories_)
	        self._report_shapes('After create_dummies', X_train_cat, X_valid_cat)

	        X_train, X_valid = X_train_cat, X_valid_cat
	        self._report_shapes('After assignment', X_train, X_valid)

	        return X_train.astype('float32'), y_train.values, X_valid.astype('float32'), y_valid.values

	    def preproc_test(self):
	        """Load the test CSV and encode it like the training data.

	        Uses the same categorical columns as preproc() and, when preproc()
	        has already run on this instance, the categories learned there.

	        :return: ``(labels, test)`` where ``labels`` holds the ``toll_fraud``
	            values and ``test`` is the ``float32`` feature matrix.
	        """
	        test = pd.read_csv(self.filepath + self.test_fn)
	        labels = test.toll_fraud.values
	        test = self.fix_na(test)
	        # categories_ only exists once preproc() has run; without it the
	        # categories are learned from the test data itself.
	        categories = getattr(self, 'categories_', None)
	        test_cat = self.create_dummies(test, list(self.cat_vars), list(self.cat_types), categories)
	        return labels, test_cat.astype('float32')


	class HoneypotKeras(HoneypotData):
	    """Keras feed-forward classifier trained on the HoneypotData arrays."""

	    def __init__(self):
	        """Run the data pipeline and one-hot encode the target arrays."""
	        # The parent initializer runs preproc() and stores the four arrays.
	        super(HoneypotKeras, self).__init__()
	        self.y_train, self.y_valid = to_categorical(self.y_train), to_categorical(self.y_valid)
	        print('To categorical')
	        self._print_shapes()
	        self.feature_count = self.X_train.shape[1]
	        self.history = []
	        # Built by build_model(); fit() builds it on demand when still unset.
	        self.model = None

	    def _print_shapes(self):
	        """Print rows and columns of the training and validation arrays."""
	        # '%d %d' formatting prints the same text on Python 2 and Python 3.
	        for array in (self.X_train, self.y_train, self.X_valid, self.y_valid):
	            print('%d %d' % (array.shape[0], array.shape[1]))

	    def build_model(self):
	        """Build and compile the network and store it in ``self.model``.

	        The network is a stack of ReLU dense layers, each followed by
	        dropout, ending in a two-unit sigmoid layer. It is compiled with the
	        Adam optimizer, binary cross-entropy loss and the accuracy metric.
	        """
	        model = Sequential()
	        model.add(Dense(2056, input_shape=(self.feature_count,), activation='relu'))
	        model.add(Dropout(0.1))
	        for units, rate in ((1028, 0.2), (1028, 0.3), (512, 0.4)):
	            model.add(Dense(units, activation='relu'))
	            model.add(Dropout(rate))
	        model.add(Dense(2, activation='sigmoid'))
	        model.compile(optimizer='adam',
	                      loss='binary_crossentropy',
	                      metrics=['accuracy'])
	        self.model = model

	    def fit(self, lr=0.001, epochs=1):
	        """Train the model and append the Keras history object to ``self.history``.

	        :param lr: learning rate assigned to the optimizer before training.
	        :param epochs: number of epochs for this call.
	        """
	        if getattr(self, 'model', None) is None:
	            # Calling fit() before build_model() used to raise AttributeError.
	            self.build_model()
	        # NOTE(review): whether assigning ``optimizer.lr`` takes effect after
	        # the first training run depends on the Keras version - confirm.
	        self.model.optimizer.lr = lr
	        print('Training data:')
	        self._print_shapes()

	        hist = self.model.fit(self.X_train, self.y_train,
	                              batch_size=32, epochs=epochs,
	                              verbose=1, validation_data=(self.X_valid, self.y_valid),
	                              )
	        self.history.append(hist)

	    def prepare_submission(self, name):
	        """Predict on the test CSV, print the predictions and return them.

	        :param name: submission label; not used by the current implementation.
	        :return: the array returned by ``self.model.predict``.
	        """
	        _, test_data = self.preproc_test()
	        predictions = self.model.predict(test_data)
	        print(predictions)
	        return predictions


	# Instantiating the model runs the full preprocessing pipeline right away,
	# because HoneypotKeras.__init__ calls preproc().
	model = HoneypotKeras()
	# Model building, training and prediction steps, currently disabled:
	# model.build_model()
	# model.fit(lr=0.01, epochs=5)
	# model.fit(lr=0.001, epochs=10)
	# model.prepare_submission('keras')