Random Forest training acceleration with Snap ML for fast credit card fraud detection
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"Copyright 2021 IBM Corporation\n",
"\n",
"Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"you may not use this file except in compliance with the License.\n",
"You may obtain a copy of the License at\n",
"\n",
" http://www.apache.org/licenses/LICENSE-2.0\n",
"\n",
"Unless required by applicable law or agreed to in writing, software\n",
"distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"See the License for the specific language governing permissions and\n",
"limitations under the License.\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Random Forest on Credit Card Fraud Dataset\n",
"\n",
"## Background \n",
"\n",
"The goal of this learning task is to predict if a credit card transaction is fraudulent or genuine based on a set of anonymized features.\n",
"\n",
"## Source\n",
"\n",
"The raw dataset can be obtained directly from [Kaggle: Credit Card Fraud Detection](https://www.kaggle.com/mlg-ulb/creditcardfraud).\n",
"\n",
"In this example, we download the dataset directly from Kaggle using their API. In order for it to work, you must: login into Kaggle and folow [these instructions](https://www.kaggle.com/docs/api) to install your API token on your machine.\n",
"\n",
"## Goal\n",
"The goal of this notebook is to illustrate how Snap ML can accelerate training of a random forest model on this dataset.\n",
"\n",
"## Code"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T20:58:11.328891Z",
"iopub.status.busy": "2021-03-01T20:58:11.327519Z",
"iopub.status.idle": "2021-03-01T20:58:11.332333Z",
"shell.execute_reply": "2021-03-01T20:58:11.333131Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/aan/snapml-examples/examples\n"
]
}
],
"source": [
"cd ../../"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T20:58:11.338606Z",
"iopub.status.busy": "2021-03-01T20:58:11.337859Z",
"iopub.status.idle": "2021-03-01T20:58:11.340741Z",
"shell.execute_reply": "2021-03-01T20:58:11.339972Z"
}
},
"outputs": [],
"source": [
"CACHE_DIR='cache-dir'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T20:58:11.346043Z",
"iopub.status.busy": "2021-03-01T20:58:11.345282Z",
"iopub.status.idle": "2021-03-01T20:58:12.123838Z",
"shell.execute_reply": "2021-03-01T20:58:12.124351Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import time\n",
"from datasets import CreditCardFraud\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from snapml import RandomForestClassifier as SnapRandomForestClassifier\n",
"from sklearn.metrics import roc_auc_score as score"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T20:58:12.128893Z",
"iopub.status.busy": "2021-03-01T20:58:12.128302Z",
"iopub.status.idle": "2021-03-01T20:58:14.585268Z",
"shell.execute_reply": "2021-03-01T20:58:14.585718Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reading binary CreditCardFraud dataset (cache) from disk.\n"
]
}
],
"source": [
"dataset = CreditCardFraud(cache_dir=CACHE_DIR)\n",
"X_train, X_test, y_train, y_test = dataset.get_train_test_split()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T20:58:14.592259Z",
"iopub.status.busy": "2021-03-01T20:58:14.591363Z",
"iopub.status.idle": "2021-03-01T20:58:14.882827Z",
"shell.execute_reply": "2021-03-01T20:58:14.883229Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of examples: 213605\n",
"Number of features: 28\n",
"Number of classes: 2\n"
]
}
],
"source": [
"print(\"Number of examples: %d\" % (X_train.shape[0]))\n",
"print(\"Number of features: %d\" % (X_train.shape[1]))\n",
"print(\"Number of classes: %d\" % (len(np.unique(y_train))))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 99.83 % of the training transactions belong to class 0\n",
" 0.17 % of the training transactions belong to class 1\n"
]
}
],
"source": [
"# the dataset is highly imbalanced\n",
"labels, sizes = np.unique(y_train, return_counts=True)\n",
"print(\"%6.2f %% of the training transactions belong to class 0\" % (sizes[0]*100.0/(sizes[0]+sizes[1])))\n",
"print(\"%6.2f %% of the training transactions belong to class 1\" % (sizes[1]*100.0/(sizes[0]+sizes[1])))\n",
"\n",
"from sklearn.utils.class_weight import compute_sample_weight\n",
"w_train = compute_sample_weight('balanced', y_train)\n",
"w_test = compute_sample_weight('balanced', y_test)"
]
},
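{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the `'balanced'` option, `compute_sample_weight` assigns every sample a weight of `n_samples / (n_classes * n_c)`, where `n_c` is the number of training samples in that sample's class, so the rare fraud class carries the same total weight as the majority class. A minimal sketch of the equivalent computation, shown only for illustration (the weights used below come from `compute_sample_weight`):\n",
"\n",
"```python\n",
"# illustrative re-computation of the 'balanced' sample weights\n",
"classes, counts = np.unique(y_train, return_counts=True)\n",
"class_weight = y_train.shape[0] / (len(classes) * counts)\n",
"w_check = class_weight[np.searchsorted(classes, y_train)]\n",
"assert np.allclose(w_check, w_train)\n",
"```"
]
},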
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T20:58:14.887460Z",
"iopub.status.busy": "2021-03-01T20:58:14.886909Z",
"iopub.status.idle": "2021-03-01T21:06:31.452583Z",
"shell.execute_reply": "2021-03-01T21:06:31.452960Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training time (sklearn): 18.18 seconds\n",
"ROC AUC score (sklearn): 0.9700\n"
]
}
],
"source": [
"model = RandomForestClassifier(max_depth=8, n_estimators=100, n_jobs=4, random_state=42)\n",
"t0 = time.time()\n",
"model.fit(X_train, y_train, sample_weight=w_train)\n",
"t_fit_sklearn = time.time()-t0\n",
"score_sklearn = score(y_test, model.predict_proba(X_test)[:,1], sample_weight=w_test)\n",
"print(\"Training time (sklearn): %6.2f seconds\" % (t_fit_sklearn))\n",
"print(\"ROC AUC score (sklearn): %.4f\" % (score_sklearn))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T21:06:31.457582Z",
"iopub.status.busy": "2021-03-01T21:06:31.457081Z",
"iopub.status.idle": "2021-03-01T21:06:55.802116Z",
"shell.execute_reply": "2021-03-01T21:06:55.802702Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training time (snapml): 2.20 seconds\n",
"ROC AUC score (snapml): 0.9790\n"
]
}
],
"source": [
"model = SnapRandomForestClassifier(max_depth=8, n_estimators=100, n_jobs=4, random_state=42, use_histograms=True)\n",
"t0 = time.time()\n",
"model.fit(X_train, y_train, sample_weight=w_train)\n",
"t_fit_snapml = time.time()-t0\n",
"score_snapml = score(y_test, model.predict_proba(X_test)[:,1], sample_weight=w_test)\n",
"print(\"Training time (snapml): %6.2f seconds\" % (t_fit_snapml))\n",
"print(\"ROC AUC score (snapml): %.4f\" % (score_snapml))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T21:06:55.807228Z",
"iopub.status.busy": "2021-03-01T21:06:55.806709Z",
"iopub.status.idle": "2021-03-01T21:06:55.808571Z",
"shell.execute_reply": "2021-03-01T21:06:55.808977Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Speed-up: 8.2 x\n",
"Relative diff. in score: 0.0092\n"
]
}
],
"source": [
"speed_up = t_fit_sklearn/t_fit_snapml\n",
"score_diff = (score_snapml-score_sklearn)/score_sklearn\n",
"print(\"Speed-up: %.1f x\" % (speed_up))\n",
"print(\"Relative diff. in score: %.4f\" % (score_diff))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Disclaimer\n",
"\n",
"Performance results always depend on the hardware and software environment. \n",
"\n",
"Information regarding the environment that was used to run this notebook are provided below:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T21:06:55.813132Z",
"iopub.status.busy": "2021-03-01T21:06:55.812618Z",
"iopub.status.idle": "2021-03-01T21:06:55.855628Z",
"shell.execute_reply": "2021-03-01T21:06:55.856015Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" platform: Linux-4.15.0-136-generic-x86_64-with-glibc2.10\n",
" cpu_count: 16\n",
" cpu_freq_min: 1200.0\n",
" cpu_freq_max: 3200.0\n",
" total_memory: 62.825439453125\n",
" snapml_version: 1.7.0\n",
"sklearn_version: 0.24.1\n"
]
}
],
"source": [
"import utils\n",
"environment = utils.get_environment()\n",
"for k,v in environment.items():\n",
" print(\"%15s: %s\" % (k, v))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Record Statistics\n",
"\n",
"Finally, we record the enviroment and performance statistics for analysis outside of this standalone notebook."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2021-03-01T21:06:55.860885Z",
"iopub.status.busy": "2021-03-01T21:06:55.860362Z",
"iopub.status.idle": "2021-03-01T21:06:56.414296Z",
"shell.execute_reply": "2021-03-01T21:06:56.414616Z"
}
},
"outputs": [
{
"data": {
"application/scrapbook.scrap.json+json": {
"data": {
"cpu_count": 16,
"cpu_freq_max": 3200,
"cpu_freq_min": 1200,
"dataset": "CreditCardFraud",
"model": "RandomForestClassifier",
"n_classes": 2,
"n_examples_test": 71202,
"n_examples_train": 213605,
"n_features": 28,
"platform": "Linux-4.15.0-136-generic-x86_64-with-glibc2.10",
"score": "roc_auc_score",
"score_diff": 0.009223910924074171,
"score_sklearn": 0.9700275097547026,
"score_snapml": 0.9789749570985815,
"sklearn_version": "0.24.1",
"snapml_version": "1.7.0",
"speed_up": 8.247310048238095,
"t_fit_sklearn": 18.176467657089233,
"t_fit_snapml": 2.2039268016815186,
"total_memory": 62.825439453125
},
"encoder": "json",
"name": "result",
"version": 1
}
},
"metadata": {
"scrapbook": {
"data": true,
"display": false,
"name": "result"
}
},
"output_type": "display_data"
}
],
"source": [
"import scrapbook as sb\n",
"sb.glue(\"result\", {\n",
" 'dataset': dataset.name,\n",
" 'n_examples_train': X_train.shape[0],\n",
" 'n_examples_test': X_test.shape[0],\n",
" 'n_features': X_train.shape[1],\n",
" 'n_classes': len(np.unique(y_train)),\n",
" 'model': type(model).__name__,\n",
" 'score': score.__name__,\n",
" 't_fit_sklearn': t_fit_sklearn,\n",
" 'score_sklearn': score_sklearn,\n",
" 't_fit_snapml': t_fit_snapml,\n",
" 'score_snapml': score_snapml,\n",
" 'score_diff': score_diff,\n",
" 'speed_up': speed_up,\n",
" **environment,\n",
"})"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}