# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# Any results you write to the current directory are saved as output.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, fbeta_score, classification_report,
                             confusion_matrix, precision_recall_curve,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split
df_full = pd.read_excel('input/samp.xlsx', sheet_name=0)
df_full = df_full[df_full.filter(regex='^(?!Unnamed)').columns]
df_full.drop(['paymentdetails'], axis=1, inplace=True)
df_full.drop(['timestamp'], axis=1, inplace=True)
# Handle non-numeric data by mapping each unique value to an integer code
def handle_non_numeric_data(df_full):
    columns = df_full.columns.values
    for column in columns:
        text_digit_vals = {}

        def convert_to_int(val):
            return text_digit_vals[val]

        if df_full[column].dtype != np.int64 and df_full[column].dtype != np.float64:
            column_contents = df_full[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            df_full[column] = list(map(convert_to_int, df_full[column]))
    return df_full

df_full = handle_non_numeric_data(df_full)
print(df_full.head())
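# The function above hand-rolls an integer encoding; an equivalent sketch
# using pandas' factorize (an alternative, not part of the original pipeline):
# for col in df_full.select_dtypes(include='object').columns:
#     df_full[col] = pd.factorize(df_full[col])[0]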
from sklearn.cluster import KMeans
from sklearn import preprocessing
##################################################
## Applied predefined conditions/criteria for classification
##################################################
# Convert the discount columns to numeric (unparseable values become NaN)
df_full['discount'] = pd.to_numeric(df_full['discount'], errors='coerce')
df_full['productdiscount'] = pd.to_numeric(df_full['productdiscount'], errors='coerce')
df_full['Class'] = ((df_full['discount'] > 20) &
                    (df_full['tax'] == 0) &
                    (df_full['productdiscount'] > 20) &
                    (df_full['total'] > 100)).astype(int)
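# Quick sanity check (an added sketch): how many rows the rule above labels
# as fraud (1) vs valid (0)
print(df_full['Class'].value_counts())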
# Get some sample data from entire dataset
data = df_full.sample(frac = 0.5, random_state = 1)
print(data.shape)
print(data.isnull().sum())
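# Note: errors='coerce' above can leave NaNs behind, and IsolationForest
# cannot handle NaN inputs; one simple option (an assumption, not in the
# original gist) is to fill them before modelling:
# data = data.fillna(0)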
# Select the feature columns as a matrix ('timestamp' and 'paymentdetails'
# were dropped above, so they are excluded here)
columns = ("invoiceid locationid customerid discount tax total subtotal "
           "productid quantity productprice productdiscount invoice_products_id "
           "producttax invoice_payments_id paymentmethod amount Class").split()
X = data[columns].values
Y = data.Class
Y = Y.values.reshape(Y.shape[0], 1)
print(X.shape)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.06)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_test, Y_test, test_size = .5)
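# MinMaxScaler is imported above but never applied; a minimal sketch of how
# it could be used (fit on the training split only, to avoid leakage):
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
# X_dev = scaler.transform(X_dev)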
# Check that the fraud class (1) actually appears in each split
print(np.where(Y_train == 1))
print(np.where(Y_test == 1))
print(np.where(Y_dev == 1))
# Determine the number of fraud cases in the dataset
Fraud = data[data['Class'] == 1]
Valid = data[data['Class'] == 0]
# Compute the outlier fraction: the ratio of fraud cases to valid cases
outlier_fraction = len(Fraud) / float(len(Valid))
print('Fraud Cases : {}'.format(len(Fraud)))
print('Valid Cases : {}'.format(len(Valid)))
print('Outlier fraction : {}'.format(outlier_fraction))
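# Optional visual check of the class imbalance, using the seaborn/matplotlib
# imports above (a sketch, not part of the original pipeline):
# sns.countplot(x='Class', data=data)
# plt.title('Valid (0) vs Fraud (1)')
# plt.show()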
# Get all the columns from dataframe
columns = data.columns.tolist()
# Filter the columns to remove data we don't want
columns = [c for c in columns if c not in ["Class"]]
# The target column we want to predict
target = "Class"
# Alternative LabelEncoder-based encoding (kept from the original, unused):
# for column in data.columns:
#     if data[column].dtype == type(object):
#         le = LabelEncoder()
#         data[column] = le.fit_transform(data[column])
#         X = data[column]
# Y = data[target]
X = data.drop(target, axis=1)
Y = data[target]
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
# define a random state
state = 1
# # define the outlier detection method
# clf = IsolationForest(max_samples=20, random_state=state)
classifiers = {
    "Isolation Forest": IsolationForest(max_samples=len(X),
                                        contamination=outlier_fraction,
                                        random_state=state),
}
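# The loop below has a "Local Outlier Factor" branch, but only Isolation
# Forest is registered above; a sketch of the matching entry (the
# hyperparameters are assumptions), should that branch be wanted:
# classifiers["Local Outlier Factor"] = LocalOutlierFactor(
#     n_neighbors=20, contamination=outlier_fraction)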
import pickle
# fit the model
n_outliers = len(Fraud)
for i, (clf_name, clf) in enumerate(classifiers.items()):
    # fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        print("LOF executed")
        scores_pred = clf.negative_outlier_factor_
        # Export the classifier to a file
        with open('model.pkl', 'wb') as model_file:
            pickle.dump(clf, model_file)
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)
        print("IF executed")
        # Export the classifier to a file
        with open('model.pkl', 'wb') as model_file:
            pickle.dump(clf, model_file)
    # Relabel the predictions: 0 for valid, 1 for fraudulent
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    # run classification metrics
    print('{} : {} misclassified'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))
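# A minimal sketch (not in the original gist) of reloading the pickled model
# and scoring rows again; 'model.pkl' is the file written in the loop above:
# with open('model.pkl', 'rb') as model_file:
#     loaded_clf = pickle.load(model_file)
# new_pred = loaded_clf.predict(X)            # -1 = outlier, 1 = inlier
# new_pred = np.where(new_pred == -1, 1, 0)   # map to 1 = fraud, 0 = valid
# print(accuracy_score(Y, new_pred))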