# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# Any results you write to the current directory are saved as output.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, fbeta_score, classification_report,
                             confusion_matrix, precision_recall_curve,
                             roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split
df_full = pd.read_excel('input/samp.xlsx', sheet_name=0)
df_full = df_full[df_full.filter(regex='^(?!Unnamed)').columns]
df_full.drop(['paymentdetails'], axis=1, inplace=True)
df_full.drop(['timestamp'], axis=1, inplace=True)
# Handle non-numeric data by mapping each unique value to an integer code
def handle_non_numeric_data(df_full):
    columns = df_full.columns.values
    for column in columns:
        text_digit_vals = {}

        def convert_to_int(val):
            return text_digit_vals[val]

        if df_full[column].dtype != np.int64 and df_full[column].dtype != np.float64:
            column_contents = df_full[column].values.tolist()
            unique_elements = set(column_contents)
            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1
            df_full[column] = list(map(convert_to_int, df_full[column]))
    return df_full

df_full = handle_non_numeric_data(df_full)
print(df_full.head())
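# The function above hand-rolls an integer encoding; an equivalent sketch
# using pandas' factorize (an alternative, not part of the original pipeline):
# for col in df_full.select_dtypes(include='object').columns:
#     df_full[col] = pd.factorize(df_full[col])[0]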
from sklearn.cluster import KMeans
from sklearn import preprocessing
##################################################
## Applied predefined conditions/criteria for classification
##################################################
# Convert the discount columns to numeric (unparseable values become NaN)
df_full['discount'] = pd.to_numeric(df_full['discount'], errors='coerce')
df_full['productdiscount'] = pd.to_numeric(df_full['productdiscount'], errors='coerce')
df_full['Class'] = ((df_full['discount'] > 20) &
                    (df_full['tax'] == 0) &
                    (df_full['productdiscount'] > 20) &
                    (df_full['total'] > 100)).astype(int)
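# Quick sanity check (an added sketch): how many rows the rule above labels
# as fraud (1) vs valid (0)
print(df_full['Class'].value_counts())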
# Get some sample data from entire dataset
data = df_full.sample(frac = 0.5, random_state = 1)
print(data.shape)
print(data.isnull().sum())
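# Note: errors='coerce' above can leave NaNs behind, and IsolationForest
# cannot handle NaN inputs; one simple option (an assumption, not in the
# original gist) is to fill them before modelling:
# data = data.fillna(0)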
# Select the feature columns as a matrix ('timestamp' and 'paymentdetails'
# were dropped above, so they are excluded here)
columns = ("invoiceid locationid customerid discount tax total subtotal "
           "productid quantity productprice productdiscount invoice_products_id "
           "producttax invoice_payments_id paymentmethod amount Class").split()
X = data[columns].values
Y = data.Class
Y = Y.values.reshape(Y.shape[0], 1)
print(X.shape)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.06)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_test, Y_test, test_size = .5)
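# MinMaxScaler is imported above but never applied; a minimal sketch of how
# it could be used (fit on the training split only, to avoid leakage):
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
# X_dev = scaler.transform(X_dev)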
# Check that the fraud class (1) actually appears in each split
print(np.where(Y_train == 1))
print(np.where(Y_test == 1))
print(np.where(Y_dev == 1))
# Determine the number of fraud cases in the dataset
Fraud = data[data['Class'] == 1]
Valid = data[data['Class'] == 0]
# Compute the outlier fraction: the ratio of fraud cases to valid cases
outlier_fraction = len(Fraud) / float(len(Valid))
print('Fraud Cases : {}'.format(len(Fraud)))
print('Valid Cases : {}'.format(len(Valid)))
print('Outlier fraction : {}'.format(outlier_fraction))
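# Optional visual check of the class imbalance, using the seaborn/matplotlib
# imports above (a sketch, not part of the original pipeline):
# sns.countplot(x='Class', data=data)
# plt.title('Valid (0) vs Fraud (1)')
# plt.show()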
# Get all the columns from dataframe
columns = data.columns.tolist()
# Filter the columns to remove data we don't want
columns = [c for c in columns if c not in ["Class"]]
# The target column we want to predict
target = "Class"
# Alternative LabelEncoder-based encoding (kept from the original, unused):
# for column in data.columns:
#     if data[column].dtype == type(object):
#         le = LabelEncoder()
#         data[column] = le.fit_transform(data[column])
#         X = data[column]
# Y = data[target]
X = data.drop(target, axis=1)
Y = data[target]
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
# define a random state
state = 1
# # define the outlier detection method
# clf = IsolationForest(max_samples=20, random_state=state)
classifiers = {
    "Isolation Forest": IsolationForest(max_samples=len(X),
                                        contamination=outlier_fraction,
                                        random_state=state),
}
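# The loop below has a "Local Outlier Factor" branch, but only Isolation
# Forest is registered above; a sketch of the matching entry (the
# hyperparameters are assumptions), should that branch be wanted:
# classifiers["Local Outlier Factor"] = LocalOutlierFactor(
#     n_neighbors=20, contamination=outlier_fraction)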
import pickle
# fit the model
n_outliers = len(Fraud)
for i, (clf_name, clf) in enumerate(classifiers.items()):
    # fit the data and tag outliers
    if clf_name == "Local Outlier Factor":
        y_pred = clf.fit_predict(X)
        print("LOF executed")
        scores_pred = clf.negative_outlier_factor_
        # Export the classifier to a file
        with open('model.pkl', 'wb') as model_file:
            pickle.dump(clf, model_file)
    else:
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)
        print("IF executed")
        # Export the classifier to a file
        with open('model.pkl', 'wb') as model_file:
            pickle.dump(clf, model_file)
    # Relabel the predictions: 0 for valid, 1 for fraudulent
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    # run classification metrics
    print('{} : {} misclassified'.format(clf_name, n_errors))
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))
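# A minimal sketch (not in the original gist) of reloading the pickled model
# and scoring rows again; 'model.pkl' is the file written in the loop above:
# with open('model.pkl', 'rb') as model_file:
#     loaded_clf = pickle.load(model_file)
# new_pred = loaded_clf.predict(X)            # -1 = outlier, 1 = inlier
# new_pred = np.where(new_pred == -1, 1, 0)   # map to 1 = fraud, 0 = valid
# print(accuracy_score(Y, new_pred))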