# zredlined/create_knn_dataset.py

## create_knn_dataset.py
#!pip install s3fs smart_open pandas scikit-learn

import pandas as pd
from smart_open import open
from sklearn.neighbors import NearestNeighbors

# Set params
# NEAREST_NEIGHBOR_COUNT is both the number of neighbors fetched per fraud
# record and the replication factor used when over-sampling fraud records.
NEAREST_NEIGHBOR_COUNT = 5
TRAINING_SET = 's3://gretel-public-website/datasets/creditcard_train.csv'

# Separate out positive (fraud, Class == 1) and negative (non-fraud,
# Class == 0) sets. Values are rounded to 6 decimal places on load.
df = pd.read_csv(TRAINING_SET, nrows=999999).round(6)
positive = df[df['Class'] == 1]
negative = df[df['Class'] == 0]

# Train a nearest neighbors model on non-fraudulent records
neighbors = NearestNeighbors(
    n_neighbors=NEAREST_NEIGHBOR_COUNT, algorithm='ball_tree'
)
neighbors.fit(negative)

# Select the NEAREST_NEIGHBOR_COUNT nearest non-fraudulent neighbors of each
# fraudulent record. With return_distance=False, kneighbors returns one row
# of positional indices (into `negative`) per query record.
nn = neighbors.kneighbors(
    positive, n_neighbors=NEAREST_NEIGHBOR_COUNT, return_distance=False
)
# Several fraud records can share a neighbor, so de-duplicate the indices;
# sorting keeps the selection order reproducible.
nn_idx = sorted({idx for row in nn for idx in row})
nearest_neighbors = negative.iloc[nn_idx, :]

# Over-sample positive records and add nearest neighbor (shady, non-fraudulent)
# and shuffle the dataset. No random_state is passed to sample(), so the
# shuffled order differs between runs.
oversample = pd.concat([positive] * NEAREST_NEIGHBOR_COUNT)
training_set = pd.concat([oversample, nearest_neighbors]).sample(frac=1)
	#!pip install s3fs smart_open pandas scikit-learn

	import pandas as pd
	from smart_open import open
	from sklearn.neighbors import NearestNeighbors

	# Set params
	# NEAREST_NEIGHBOR_COUNT is both the number of neighbors fetched per fraud
	# record and the replication factor used when over-sampling fraud records.
	NEAREST_NEIGHBOR_COUNT = 5
	TRAINING_SET = 's3://gretel-public-website/datasets/creditcard_train.csv'

	# Separate out positive (fraud, Class == 1) and negative (non-fraud,
	# Class == 0) sets. Values are rounded to 6 decimal places on load.
	df = pd.read_csv(TRAINING_SET, nrows=999999).round(6)
	positive = df[df['Class'] == 1]
	negative = df[df['Class'] == 0]

	# Train a nearest neighbors model on non-fraudulent records
	neighbors = NearestNeighbors(
	    n_neighbors=NEAREST_NEIGHBOR_COUNT, algorithm='ball_tree'
	)
	neighbors.fit(negative)

	# Select the NEAREST_NEIGHBOR_COUNT nearest non-fraudulent neighbors of each
	# fraudulent record. With return_distance=False, kneighbors returns one row
	# of positional indices (into `negative`) per query record.
	nn = neighbors.kneighbors(
	    positive, n_neighbors=NEAREST_NEIGHBOR_COUNT, return_distance=False
	)
	# Several fraud records can share a neighbor, so de-duplicate the indices;
	# sorting keeps the selection order reproducible.
	nn_idx = sorted({idx for row in nn for idx in row})
	nearest_neighbors = negative.iloc[nn_idx, :]

	# Over-sample positive records and add nearest neighbor (shady, non-fraudulent)
	# and shuffle the dataset. No random_state is passed to sample(), so the
	# shuffled order differs between runs.
	oversample = pd.concat([positive] * NEAREST_NEIGHBOR_COUNT)
	training_set = pd.concat([oversample, nearest_neighbors]).sample(frac=1)