Created
July 22, 2020 16:51
-
-
Save zredlined/a14d442c79620d67752db3bed1267bca to your computer and use it in GitHub Desktop.
SMOTE-like approach: build a training dataset for synthetics from the minority class and its K-nearest majority-class neighbors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!pip install s3fs smart_open pandas scikit-learn
import pandas as pd
from smart_open import open
from sklearn.neighbors import NearestNeighbors

# Build a SMOTE-like training set: every minority-class (fraud) record,
# over-sampled, plus the majority-class (non-fraud) records that sit closest
# to a fraud record in feature space.

# Set params
# Number of non-fraud neighbors looked up per fraud record. Also reused as the
# over-sampling factor for the fraud records below.
NEAREST_NEIGHBOR_COUNT = 5
TRAINING_SET = 's3://gretel-public-website/datasets/creditcard_train.csv'

# Separate out positive (fraud) and negative (non-fraud) sets
df = pd.read_csv(TRAINING_SET, nrows=999999).round(6)
positive = df[df['Class'] == 1]
negative = df[df['Class'] == 0]

# Train a nearest neighbors model on non-fraudulent records.
# NOTE: all columns are used as features, including 'Class'. Every
# positive/negative pair differs by exactly 1 in that column, so it adds a
# constant to each distance and does not change the neighbor ranking.
neighbors = NearestNeighbors(
    n_neighbors=NEAREST_NEIGHBOR_COUNT,
    algorithm='ball_tree',
)
neighbors.fit(negative)

# Select the NEAREST_NEIGHBOR_COUNT nearest neighbors to our fraudulent records.
# `nn` has one row per fraud record holding positional indices into `negative`.
nn = neighbors.kneighbors(
    positive,
    n_neighbors=NEAREST_NEIGHBOR_COUNT,
    return_distance=False,
)

# Several fraud records can share a neighbor; de-duplicate so each non-fraud
# record appears once, and sort so the selection order is reproducible.
nn_idx = sorted({idx for row in nn for idx in row})
nearest_neighbors = negative.iloc[nn_idx, :]
nearest_neighbors  # notebook display of the selected neighbors

# Over-sample positive records and add nearest neighbor (shady, non-fraudulent)
# and shuffle the dataset.
# Replicating the fraud records NEAREST_NEIGHBOR_COUNT times means they are at
# least as numerous as the (de-duplicated) neighbors selected above.
oversample = pd.concat([positive] * NEAREST_NEIGHBOR_COUNT)
training_set = pd.concat([oversample, nearest_neighbors]).sample(frac=1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment