betterdatascience

## roc_auc.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
# Turn off the top and right axes spines for all plots drawn afterwards.
rcParams.update({
    'axes.spines.top': False,
    'axes.spines.right': False,
})

# The CSV uses ';' as its field separator.
df = pd.read_csv(filepath_or_buffer='winequality-white.csv', sep=';')
# Preview of the first rows; the value is not stored anywhere.
df.head()

## smote.py
# Hold out the test set from the ORIGINAL data first. Splitting the already
# oversampled X_sm / y_sm (built from the full X, y) puts synthetic rows --
# interpolated from rows that end up in the training set -- into the test
# set, so the reported accuracy and recall would be optimistically biased.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Oversample the minority class in the training portion only; the test set
# keeps its real class balance. `sm` is the seeded SMOTE instance created
# elsewhere in this file.
X_train, y_train = sm.fit_resample(X_train, y_train)

# Fit a seeded random forest on the balanced training data and predict the
# untouched test rows.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Report both metrics to two decimals, then keep the confusion matrix.
print(f'Accuracy = {accuracy_score(y_test, preds):.2f}\nRecall = {recall_score(y_test, preds):.2f}\n')
cm = confusion_matrix(y_test, preds)

## smote.py
from imblearn.over_sampling import SMOTE

# Seeded oversampler, so repeated runs give the same resampled data.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)

# Show the feature-matrix shape before and after resampling, one per line.
print(
    f'Shape of X before SMOTE: {X.shape}',
    f'Shape of X after SMOTE: {X_sm.shape}',
    sep='\n',
)

# Heading for the class-balance figures.
print('\nBalance of positive and negative classes (%):')

## smote.py
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

# Fit a seeded random forest on the training split (fit() returns the
# estimator itself), then predict the test split.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
preds = model.predict(X_test)

# Print accuracy and recall on the test split, rounded to two decimals.
print(
    f'Accuracy = {accuracy_score(y_test, preds):.2f}\n'
    f'Recall = {recall_score(y_test, preds):.2f}\n'
)

## smote.py
from sklearn.model_selection import train_test_split

# 'TARGET' is the label column; every other column of `merged` is a feature.
y = merged['TARGET']
X = merged.drop(columns='TARGET')

# Seeded split: 25% of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

print(f'''% Positive class in Train = {np.round(y_train.value_counts(normalize=True)[1] * 100, 2)}

## smote.py
from sklearn.preprocessing import MinMaxScaler

# Scale only columns that have values greater than 1; columns whose maximum
# is at most 1 are left untouched.
# NOTE(review): the candidate columns are chosen from `df` but the values are
# taken from `merged` -- presumably every selected column also exists in
# `merged`; confirm against the code that builds it.
to_scale = [col for col in df.columns if df[col].max() > 1]
mms = MinMaxScaler()
scaled = mms.fit_transform(merged[to_scale])
# The scaler output carries no row labels. Reuse merged's index so that
# assigning these columns back into `merged` lines up row-for-row even when
# its index is not the default 0..n-1 range.
scaled = pd.DataFrame(scaled, columns=to_scale, index=merged.index)

# Replace original columns with scaled ones
for col in scaled:

## smote.py
# Encode the two-valued text columns as integers:
#   GENDER  -> 0 for 'M', 1 for anything else
#   CAR     -> 1 for 'Y', 0 for anything else
#   REALITY -> 1 for 'Y', 0 for anything else
df['GENDER'] = [int(value != 'M') for value in df['GENDER']]
df['CAR'] = [int(value == 'Y') for value in df['CAR']]
df['REALITY'] = [int(value == 'Y') for value in df['REALITY']]

# One-hot encode the multi-valued columns. drop_first=True omits the first
# category of each column.
dummy_income_type = pd.get_dummies(
    df['INCOME_TYPE'],
    prefix='INC_TYPE',
    drop_first=True,
)
dummy_edu_type = pd.get_dummies(
    df['EDUCATION_TYPE'],
    prefix='EDU_TYPE',
    drop_first=True,
)
dummy_family_type = pd.get_dummies(
    df['FAMILY_TYPE'],
    prefix='FAM_TYPE',
    drop_first=True,
)
dummy_house_type = pd.get_dummies(
    df['HOUSE_TYPE'],
    prefix='HOUSE_TYPE',
    drop_first=True,
)

## smote.py
# Bar chart with one bar per distinct TARGET value, showing its row count.
ax = df['TARGET'].value_counts().plot(
    kind='bar',
    figsize=(10, 6),
    fontsize=13,
    color='#087E8B',
)
ax.set_title('Credit card fraud (0 = normal, 1 = fraud)', size=20, pad=30)
ax.set_ylabel('Number of transactions', fontsize=14)

# Write each bar's height just above it (fixed x/y offsets of 0.19 and 700).
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + 0.19, height + 700, str(round(height, 2)), fontsize=15)

## smote.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV with pandas' default settings.
df = pd.read_csv(filepath_or_buffer='credit_dataset.csv')
# Preview of the first rows; the value is not stored anywhere.
df.head()

## ridgelines.py
plt.figure()

# Ridgeline plot with one row per 'Month', overlaying the MinTemp and MaxTemp
# distributions in two colors.
# NOTE(review): assumes `joyplot` is joypy.joyplot, which returns the figure
# first and the axes second. The names were previously bound the other way
# round, and the call was missing its closing parenthesis.
fig, ax = joyplot(
    data=sydney[['MinTemp', 'MaxTemp', 'Month']],
    by='Month',
    column=['MinTemp', 'MaxTemp'],
    color=['#686de0', '#eb4d4b'],
    legend=True,
    alpha=0.85,
    figsize=(12, 8),
)
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	from matplotlib import rcParams
	# Turn off the top and right axes spines for all plots drawn afterwards.
	rcParams.update({
		'axes.spines.top': False,
		'axes.spines.right': False,
	})

	# The CSV uses ';' as its field separator.
	df = pd.read_csv(filepath_or_buffer='winequality-white.csv', sep=';')
	# Preview of the first rows; the value is not stored anywhere.
	df.head()
	# Hold out the test set from the ORIGINAL data first. Splitting the
	# already oversampled X_sm / y_sm (built from the full X, y) puts
	# synthetic rows into the test set, so the reported accuracy and recall
	# would be optimistically biased.
	X_train, X_test, y_train, y_test = train_test_split(
		X, y, test_size=0.25, random_state=42
	)

	# Oversample the minority class in the training portion only; the test
	# set keeps its real class balance. `sm` is the seeded SMOTE instance
	# created elsewhere in this file.
	X_train, y_train = sm.fit_resample(X_train, y_train)

	# Fit a seeded random forest on the balanced training data and predict
	# the untouched test rows.
	model = RandomForestClassifier(random_state=42)
	model.fit(X_train, y_train)
	preds = model.predict(X_test)

	# Report both metrics to two decimals, then keep the confusion matrix.
	print(f'Accuracy = {accuracy_score(y_test, preds):.2f}\nRecall = {recall_score(y_test, preds):.2f}\n')
	cm = confusion_matrix(y_test, preds)
	from imblearn.over_sampling import SMOTE

	# Seeded oversampler, so repeated runs give the same resampled data.
	sm = SMOTE(random_state=42)

	X_sm, y_sm = sm.fit_resample(X, y)

	# Show the feature-matrix shape before and after resampling, one per
	# line. Separate arguments are used instead of a triple-quoted string so
	# that source indentation cannot end up inside the printed text.
	print(
		f'Shape of X before SMOTE: {X.shape}',
		f'Shape of X after SMOTE: {X_sm.shape}',
		sep='\n',
	)

	# Heading for the class-balance figures.
	print('\nBalance of positive and negative classes (%):')
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.metrics import accuracy_score, recall_score, confusion_matrix

	# Fit a seeded random forest on the training split (fit() returns the
	# estimator itself), then predict the test split.
	model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
	preds = model.predict(X_test)

	# Print accuracy and recall on the test split, rounded to two decimals.
	print(
		f'Accuracy = {accuracy_score(y_test, preds):.2f}\n'
		f'Recall = {recall_score(y_test, preds):.2f}\n'
	)
	from sklearn.model_selection import train_test_split

	# 'TARGET' is the label column; every other column of `merged` is a
	# feature.
	y = merged['TARGET']
	X = merged.drop(columns='TARGET')

	# Seeded split: 25% of the rows go to the test set.
	X_train, X_test, y_train, y_test = train_test_split(
		X,
		y,
		test_size=0.25,
		random_state=42,
	)

	print(f'''% Positive class in Train = {np.round(y_train.value_counts(normalize=True)[1] * 100, 2)}
	from sklearn.preprocessing import MinMaxScaler

	# Scale only columns that have values greater than 1; columns whose
	# maximum is at most 1 are left untouched.
	# NOTE(review): the candidate columns are chosen from `df` but the values
	# are taken from `merged` -- presumably every selected column also exists
	# in `merged`; confirm against the code that builds it.
	to_scale = [col for col in df.columns if df[col].max() > 1]
	mms = MinMaxScaler()
	scaled = mms.fit_transform(merged[to_scale])
	# The scaler output carries no row labels. Reuse merged's index so that
	# assigning these columns back into `merged` lines up row-for-row even
	# when its index is not the default 0..n-1 range.
	scaled = pd.DataFrame(scaled, columns=to_scale, index=merged.index)

	# Replace original columns with scaled ones
	for col in scaled:
	# Encode the two-valued text columns as integers:
	#   GENDER  -> 0 for 'M', 1 for anything else
	#   CAR     -> 1 for 'Y', 0 for anything else
	#   REALITY -> 1 for 'Y', 0 for anything else
	df['GENDER'] = [int(value != 'M') for value in df['GENDER']]
	df['CAR'] = [int(value == 'Y') for value in df['CAR']]
	df['REALITY'] = [int(value == 'Y') for value in df['REALITY']]

	# One-hot encode the multi-valued columns. drop_first=True omits the
	# first category of each column.
	dummy_income_type = pd.get_dummies(
		df['INCOME_TYPE'],
		prefix='INC_TYPE',
		drop_first=True,
	)
	dummy_edu_type = pd.get_dummies(
		df['EDUCATION_TYPE'],
		prefix='EDU_TYPE',
		drop_first=True,
	)
	dummy_family_type = pd.get_dummies(
		df['FAMILY_TYPE'],
		prefix='FAM_TYPE',
		drop_first=True,
	)
	dummy_house_type = pd.get_dummies(
		df['HOUSE_TYPE'],
		prefix='HOUSE_TYPE',
		drop_first=True,
	)
	# Bar chart with one bar per distinct TARGET value, showing its row count.
	ax = df['TARGET'].value_counts().plot(
		kind='bar',
		figsize=(10, 6),
		fontsize=13,
		color='#087E8B',
	)
	ax.set_title('Credit card fraud (0 = normal, 1 = fraud)', size=20, pad=30)
	ax.set_ylabel('Number of transactions', fontsize=14)

	# Write each bar's height just above it (fixed x/y offsets of 0.19 and
	# 700). The loop body must sit one level deeper than the `for` line.
	for bar in ax.patches:
		height = bar.get_height()
		ax.text(bar.get_x() + 0.19, height + 700, str(round(height, 2)), fontsize=15)
	plt.figure()

	ax, fig = joyplot(
	data=sydney[['MinTemp', 'MaxTemp', 'Month']],
	by='Month',
	column=['MinTemp', 'MaxTemp'],
	color=['#686de0', '#eb4d4b'],
	legend=True,
	alpha=0.85,
	figsize=(12, 8)