Created
April 27, 2018 05:43
-
-
Save arycloud/e8b4dae0a25cf22af33adda1132909ab to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This Python 3 environment comes with many helpful analytics libraries installed | |
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python | |
# For example, here's several helpful packages to load in | |
import numpy as np # linear algebra | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
# Input data files are available in the "../input/" directory. | |
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory | |
# Any results you write to the current directory are saved as output. | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from sklearn.preprocessing import MinMaxScaler | |
from sklearn.preprocessing import LabelEncoder | |
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score\ | |
,fbeta_score,classification_report,confusion_matrix,precision_recall_curve,roc_auc_score\ | |
,roc_curve | |
from sklearn.model_selection import train_test_split | |
# Load the first worksheet of the sample workbook.
df_full = pd.read_excel('input/samp.xlsx', sheet_name=0)
# Keep only the columns whose name does not start with "Unnamed".
df_full = df_full.filter(regex='^(?!Unnamed)')
# Remove the two columns that are not used further on.  The keyword form
# `columns=` is used because passing the axis positionally
# (`drop(labels, 1)`) is rejected by pandas 2.0 and later.
df_full = df_full.drop(columns=['paymentdetails', 'timestamp'])
# Handle non-numeric data
def handle_non_numaric_data(df_full):
    """Replace every non-numeric column with integer category codes.

    Each distinct value of a non-numeric column is assigned an integer code
    in order of first appearance (0, 1, 2, ...), so the encoding is the same
    on every run for the same input.  Columns with a numeric dtype (any
    integer or float width, and booleans) are left untouched.

    Args:
        df_full: The DataFrame to encode.  It is modified in place.

    Returns:
        The same DataFrame object, with non-numeric columns replaced by
        their integer codes.

    Raises:
        TypeError: If a non-numeric column contains unhashable values.
    """
    for column in df_full.columns:
        series = df_full[column]
        if pd.api.types.is_numeric_dtype(series.dtype):
            continue
        # Build the mapping and do the lookups from the same list of values
        # so that every value encountered is guaranteed to have a code.
        values = series.tolist()
        # dict.fromkeys keeps first-appearance order, unlike a set, whose
        # iteration order for strings changes between interpreter runs.
        codes = {
            value: code for code, value in enumerate(dict.fromkeys(values))
        }
        df_full[column] = [codes[value] for value in values]
    return df_full
# Encode the non-numeric columns, then show the first five rows as a check.
df_full = handle_non_numaric_data(df_full)
print(df_full.head(n=5))
from sklearn.cluster import KMeans | |
from sklearn import preprocessing | |
##################################################
## Apply the predefined conditions/criteria for classification
##################################################
# Coerce each discount column to numbers from its own values.  Entries that
# cannot be parsed become NaN and therefore fail the `> 20` tests below.
# NOTE(review): this runs after the categorical encoding step, so a discount
# column that was originally text already holds category codes here - confirm
# the thresholds are applied to real amounts.
df_full['discount'] = pd.to_numeric(df_full['discount'], errors='coerce')
df_full['productdiscount'] = pd.to_numeric(
    df_full['productdiscount'], errors='coerce'
)
# A row is labelled 1 only when all four rules hold, otherwise 0.
df_full['Class'] = (
    (df_full['discount'] > 20)
    & (df_full['tax'] == 0)
    & (df_full['productdiscount'] > 20)
    & (df_full['total'] > 100)
).astype(int)
# Work on a fixed-seed 50% sample of the full dataset.
data = df_full.sample(frac=0.5, random_state=1)
print(data.shape)
data.isnull().sum()  # per-column null counts; only displayed in a notebook
# Convert the sampled frame into a feature matrix and a label column.
columns = "invoiceid locationid timestamp customerid discount tax total subtotal productid quantity productprice productdiscount invoice_products_id producttax invoice_payments_id paymentmethod paymentdetails amount Class(0/1) Class".split()
# Select only the listed columns that are actually present in the frame, and
# leave the target "Class" out of the features so the label does not leak
# into X.  `.values` replaces `DataFrame.as_matrix`, which pandas 1.0 removed.
# NOTE(review): "Class(0/1)" is kept as a feature when present - confirm it
# is not a second copy of the label.
feature_columns = [c for c in columns if c in data.columns and c != 'Class']
X = data[feature_columns].values
# Labels as an (n_rows, 1) column vector.
Y = data['Class'].values
Y = Y.reshape(Y.shape[0], 1)
X.shape  # only displayed in a notebook
# 94% of the rows go to training; the remaining 6% is split evenly into the
# test and dev sets.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.06)
X_test, X_dev, Y_test, Y_dev = train_test_split(X_test, Y_test, test_size=0.5)
# Positions of the positive labels in each split (only displayed in a
# notebook).
np.where(Y_train == 1)
np.where(Y_test == 1)
np.where(Y_dev == 1)
# Split the sample by label to count the fraud and valid cases.
Fraud = data[data['Class'] == 1]
Valid = data[data['Class'] == 0]
# Proportion of fraudulent rows in the whole sample.  This value is passed to
# the outlier detector as `contamination`, which is a proportion of the data
# set, so the denominator is the total row count rather than the number of
# valid rows.  An empty sample yields 0.0 instead of dividing by zero.
outlier_fraction = len(Fraud) / float(len(data)) if len(data) else 0.0
print('Fraud Cases : {}'.format(len(Fraud)))
print('Valid Cases : {}'.format(len(Valid)))
print(outlier_fraction)
# Name of the label column we want to predict.
target = "Class"
# Every column of the sampled frame except the label.
columns = [c for c in data.columns.tolist() if c != target]
# Features are all columns but the label; the label is kept as a Series.
# `columns=` is used because passing the axis positionally
# (`drop(target, 1)`) is rejected by pandas 2.0 and later.
X = data.drop(columns=target)
Y = data[target]
# Print the shapes of X & Y
print(X.shape)
print(Y.shape)
from sklearn.metrics import classification_report, accuracy_score | |
from sklearn.ensemble import IsolationForest | |
from sklearn.neighbors import LocalOutlierFactor | |
# Seed passed to the estimator as its random_state.
state = 1
# Outlier detectors to run, keyed by the name printed in the report.
classifiers = {}
classifiers["Isolation Forest"] = IsolationForest(
    max_samples=len(X),
    contamination=outlier_fraction,
    random_state=state,
)
import pickle | |
# Fit every detector on X, save it, and compare its labels with Y.
n_outliers = len(Fraud)
for i, (clf_name, clf) in enumerate(classifiers.items()):
    if clf_name != "Local Outlier Factor":
        # Fit first, then score and label the same data.
        clf.fit(X)
        scores_pred = clf.decision_function(X)
        y_pred = clf.predict(X)
        print("IF executed")
    else:
        # Fit and label in a single call, then read the fitted scores.
        y_pred = clf.fit_predict(X)
        print("LOF executed")
        scores_pred = clf.negative_outlier_factor_
    # Export the fitted classifier to a file (same path for either branch).
    with open('model.pkl', 'wb') as model_file:
        pickle.dump(clf, model_file)
    # Recode the predictions: 1 becomes 0 (valid) and -1 becomes 1
    # (fraudulent).  The order of these two assignments matters.
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
    n_errors = (y_pred != Y).sum()
    # Report the error count followed by the classification metrics.
    print(f'{clf_name}:{n_errors}')
    print(accuracy_score(Y, y_pred))
    print(classification_report(Y, y_pred))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment