Skip to content

Instantly share code, notes, and snippets.

@betterdatascience
betterdatascience / roc_auc.py
Created December 8, 2020 08:04
001_roc_auc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Turn off the top and right axes spines for every figure drawn after this.
rcParams.update({
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# Read 'winequality-white.csv'; the file is parsed with ';' as the separator.
df = pd.read_csv('winequality-white.csv', sep=';')
# Bare expression: only shows the first rows in an interactive/notebook run.
df.head()
@betterdatascience
betterdatascience / smote.py
Created December 3, 2020 09:19
008_smote
# Split X_sm / y_sm, keeping 25% of the rows as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.25,
    random_state=42,
)

# Fit a seeded random forest on the training split and predict the test split.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
preds = model.predict(X_test)

# Print accuracy and recall (two decimals each), then keep the confusion
# matrix of true vs. predicted labels in `cm`.
print(f'Accuracy = {accuracy_score(y_test, preds):.2f}\nRecall = {recall_score(y_test, preds):.2f}\n')
cm = confusion_matrix(y_test, preds)
@betterdatascience
betterdatascience / smote.py
Last active September 4, 2021 03:14
007_smote
from imblearn.over_sampling import SMOTE
# Build the SMOTE over-sampler with a fixed seed.
sm = SMOTE(random_state=42)
# Resample X and y together; the results are kept separately as X_sm / y_sm
# so the original X and y stay available for comparison.
X_sm, y_sm = sm.fit_resample(X, y)
# Report the feature-matrix shape before and after resampling.
print(f'''Shape of X before SMOTE: {X.shape}
Shape of X after SMOTE: {X_sm.shape}''')
# Heading for a class-balance report (the report itself is not visible here).
print('\nBalance of positive and negative classes (%):')
@betterdatascience
betterdatascience / smote.py
Created December 3, 2020 06:21
006_smote
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
# Train: fit a seeded random forest on the training split, then predict the
# labels of the test split.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
preds = model.predict(X_test)

# Evaluate: print accuracy and recall of the predictions, two decimals each.
print(f'Accuracy = {accuracy_score(y_test, preds):.2f}\nRecall = {recall_score(y_test, preds):.2f}\n')
@betterdatascience
betterdatascience / smote.py
Created December 3, 2020 06:19
005_smote
from sklearn.model_selection import train_test_split
# Separate the 'TARGET' label column from the remaining predictor columns.
y = merged['TARGET']
X = merged.drop(columns='TARGET')

# Keep 25% of the rows as the test split; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
print(f'''% Positive class in Train = {np.round(y_train.value_counts(normalize=True)[1] * 100, 2)}
@betterdatascience
betterdatascience / smote.py
Created December 3, 2020 06:06
004_smote
from sklearn.preprocessing import MinMaxScaler
# Scale only columns that have values greater than 1; columns whose maximum
# is at most 1 are left as they are.
# NOTE(review): the column list is derived from `df` but the scaling is
# applied to `merged` - confirm both frames carry these column names.
to_scale = [col for col in df.columns if df[col].max() > 1]
mms = MinMaxScaler()
# fit_transform returns a bare array, so the row labels are lost. Re-attach
# merged's index so that writing the scaled columns back into `merged`
# aligns row-for-row even when `merged` is not on a default 0..n-1 index.
scaled = pd.DataFrame(
    mms.fit_transform(merged[to_scale]),
    columns=to_scale,
    index=merged.index,
)
# Replace original columns with scaled ones
for col in scaled:
@betterdatascience
betterdatascience / smote.py
Last active September 16, 2021 15:40
003_smote
# Remap to integers using vectorised comparisons instead of per-row loops:
#   GENDER  -> 0 where the value is 'M', 1 for anything else
#   CAR     -> 1 where the value is 'Y', 0 for anything else
#   REALITY -> 1 where the value is 'Y', 0 for anything else
# The explicit int64 cast keeps the integer dtype that assigning a list of
# Python ints produced.
df['GENDER'] = (~df['GENDER'].eq('M')).astype('int64')
df['CAR'] = df['CAR'].eq('Y').astype('int64')
df['REALITY'] = df['REALITY'].eq('Y').astype('int64')

# Create dummy variables: one indicator column per category level, named with
# the given prefix; drop_first=True omits the first level of each column.
dummy_income_type = pd.get_dummies(df['INCOME_TYPE'], prefix='INC_TYPE', drop_first=True)
dummy_edu_type = pd.get_dummies(df['EDUCATION_TYPE'], prefix='EDU_TYPE', drop_first=True)
dummy_family_type = pd.get_dummies(df['FAMILY_TYPE'], prefix='FAM_TYPE', drop_first=True)
dummy_house_type = pd.get_dummies(df['HOUSE_TYPE'], prefix='HOUSE_TYPE', drop_first=True)
@betterdatascience
betterdatascience / smote.py
Created December 3, 2020 05:53
002_smote
# Bar chart of how many rows carry each value of the 'TARGET' column.
ax = df['TARGET'].value_counts().plot(kind='bar', figsize=(10, 6), fontsize=13, color='#087E8B')
ax.set_title('Credit card fraud (0 = normal, 1 = fraud)', size=20, pad=30)
ax.set_ylabel('Number of transactions', fontsize=14)

# Write each bar's height as a text label, offset 0.19 to the right of the
# bar's x position and 700 above its top so the label clears the bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + 0.19, height + 700, str(round(height, 2)), fontsize=15)
@betterdatascience
betterdatascience / smote.py
Created December 3, 2020 05:29
001_smote
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load 'credit_dataset.csv' (relative path) into a DataFrame.
df = pd.read_csv('credit_dataset.csv')
# Bare expression: only shows the first rows in an interactive/notebook run.
df.head()
@betterdatascience
betterdatascience / ridgelines.py
Created December 1, 2020 08:37
005_ridgeline_plots
plt.figure()
ax, fig = joyplot(
data=sydney[['MinTemp', 'MaxTemp', 'Month']],
by='Month',
column=['MinTemp', 'MaxTemp'],
color=['#686de0', '#eb4d4b'],
legend=True,
alpha=0.85,
figsize=(12, 8)