# tomasonjo/fraud_model.py

## fraud_model.py
def feature_importance(columns, classifier):
    """Return the classifier's feature importances ranked from high to low.

    Args:
        columns: Feature names, paired positionally with the importances.
        classifier: Fitted estimator exposing ``feature_importances_``.

    Returns:
        A ``pandas.DataFrame`` with a ``feature`` column (names) and a
        ``value`` column (importances), ordered by descending importance.
        Features with equal importance keep their input order.
    """
    # zip() pairs names and scores positionally and stops at the shorter input.
    ranked = list(zip(columns, classifier.feature_importances_))
    # Negating the score turns the default ascending sort into a descending one.
    ranked.sort(key=lambda pair: -pair[1])

    names = [name for name, _ in ranked]
    scores = [score for _, score in ranked]
    return pd.DataFrame(data={'feature': names, 'value': scores})

def evaluate(df):
    """Train a random-forest classifier on ``df`` and report how it performs.

    The ``fraudRisk`` column is used as the target and every other column as
    a feature. The data is split 80/20 (stratified on the target), the
    training part is oversampled with SMOTE, a ``RandomForestClassifier`` is
    fitted, and the held-out part is used to draw a confusion matrix
    (normalised over the true labels) and a ROC curve. Finally the ranked
    feature importances are printed.

    Args:
        df: ``pandas.DataFrame`` containing a ``fraudRisk`` column plus the
            feature columns (presumably all numeric -- confirm with caller).

    Returns:
        None. Results are reported through the plots and the printed table.
    """
    df_X = df.drop('fraudRisk', axis=1)
    X = df_X.values

    y = LabelBinarizer().fit_transform(df[['fraudRisk']].values)
    # LabelBinarizer returns an (n_samples, 1) column for a two-class target.
    # Flatten it so the sampler and the classifier receive the 1-D target they
    # expect instead of emitting a DataConversionWarning. A multi-column
    # indicator matrix (more than two classes) is left untouched.
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    # Test/train data split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Oversample only the training data so the test set keeps its real
    # class distribution.
    oversample = SMOTE(random_state=42)
    X_train, y_train = oversample.fit_resample(X_train, y_train)

    # Random forest classification
    model = RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        max_depth=5,
        bootstrap=True,
        class_weight='balanced',
    )
    model = model.fit(X_train, y_train)

    # Evaluate the model on the untouched test split
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, normalize='true')
    RocCurveDisplay.from_estimator(model, X_test, y_test, name="RF Model")
    print(feature_importance(df_X.columns.to_list(), model))
	def feature_importance(columns, classifier):
		"""Return the classifier's feature importances ranked from high to low.

		Args:
			columns: Feature names, paired positionally with the importances.
			classifier: Fitted estimator exposing ``feature_importances_``.

		Returns:
			A ``pandas.DataFrame`` with a ``feature`` column (names) and a
			``value`` column (importances), ordered by descending importance.
			Features with equal importance keep their input order.
		"""
		# zip() pairs names and scores positionally and stops at the shorter input.
		features = list(zip(columns, classifier.feature_importances_))
		# Negating the score turns the default ascending sort into a descending one.
		sorted_features = sorted(features, key=lambda pair: -pair[1])

		keys = [name for name, _ in sorted_features]
		values = [score for _, score in sorted_features]
		return pd.DataFrame(data={'feature': keys, 'value': values})

	def evaluate(df):
		"""Train a random-forest classifier on ``df`` and report how it performs.

		The ``fraudRisk`` column is used as the target and every other column
		as a feature. The data is split 80/20 (stratified on the target), the
		training part is oversampled with SMOTE, a ``RandomForestClassifier``
		is fitted, and the held-out part is used to draw a confusion matrix
		(normalised over the true labels) and a ROC curve. Finally the ranked
		feature importances are printed.

		Args:
			df: ``pandas.DataFrame`` containing a ``fraudRisk`` column plus the
				feature columns (presumably all numeric -- confirm with caller).

		Returns:
			None. Results are reported through the plots and the printed table.
		"""
		df_X = df.drop('fraudRisk', axis=1)
		df_y = df[['fraudRisk']]

		X = df_X.values
		y = df_y.values

		y = LabelBinarizer().fit_transform(y)
		# LabelBinarizer returns an (n_samples, 1) column for a two-class target.
		# Flatten it so the sampler and the classifier receive the 1-D target
		# they expect instead of emitting a DataConversionWarning. A multi-column
		# indicator matrix (more than two classes) is left untouched.
		if y.ndim == 2 and y.shape[1] == 1:
			y = y.ravel()

		# Test/train data split
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

		# Oversample only the training data so the test set keeps its real
		# class distribution.
		oversample = SMOTE(random_state=42)
		X_train, y_train = oversample.fit_resample(X_train, y_train)

		# Random forest classification
		model = RandomForestClassifier(n_estimators=500, random_state=42, max_depth=5, bootstrap=True, class_weight='balanced')
		model = model.fit(X_train, y_train)

		# Evaluate the model on the untouched test split
		ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, normalize='true')
		RocCurveDisplay.from_estimator(model, X_test, y_test, name="RF Model")
		print(feature_importance(df_X.columns.to_list(), model))