Gist by @Deccludor (last active December 7, 2020)
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import calibration
import sklearn.metrics
import itertools
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from functools import reduce
def load_data(file):
    feature_cols = ['cha', 'con', 'dex', 'int', 'str', 'wis']
    target_col = 'result'
    data = pd.read_csv(file)
    data[target_col] = (data[target_col] == 'succeed').astype(int)
    # Roughly 80/20 train/test split via a seeded random column
    np.random.seed(42)
    data['rand_col'] = np.random.random(len(data))
    train_data = data[data['rand_col'] <= 0.8]
    test_data = data[data['rand_col'] > 0.8]
    return (
        train_data[feature_cols], train_data[target_col],
        test_data[feature_cols], test_data[target_col],
    )
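# Note: a similar 80/20 split could use sklearn's helper instead of the random
# column above (a sketch, not run here; random_state chosen arbitrarily):
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)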
def plot_score_distribution(predict_df, y):
    plt.figure(figsize=(15, 7))
    plt.hist(predict_df[y == 0], bins=50, label='Fail')
    plt.hist(predict_df[y == 1], bins=50, label='Succeed', alpha=0.7, color='r')
    plt.xlabel('Score', fontsize=25)
    plt.ylabel('Number of records in bucket', fontsize=25)
    plt.legend(fontsize=15)
    plt.tick_params(axis='both', labelsize=25, pad=5)
    plt.show()
def evaluate(model, df_X, df_y):
    predict_df = model.predict(df_X)
    predict_proba_df = model.predict_proba(df_X)[:, 1]
    accuracy = sklearn.metrics.accuracy_score(df_y, predict_df)
    # log_loss expects true labels first and probability estimates second
    log_loss = sklearn.metrics.log_loss(df_y, predict_proba_df)
    print('Log loss: ', log_loss)
    print('Accuracy: ', accuracy)
    sklearn.metrics.plot_confusion_matrix(model, df_X, df_y)
    sklearn.metrics.plot_roc_curve(model, df_X, df_y)
    plt.title('ROC Curve')
    plt.show()
    plot_score_distribution(predict_proba_df, df_y)
    fraction_of_positives, mean_predicted_value = sklearn.calibration.calibration_curve(
        df_y, predict_proba_df, n_bins=10)
    plt.plot(fraction_of_positives, mean_predicted_value)
    plt.xlabel('Fraction of success')
    plt.ylabel('Mean score')
    plt.title('Calibration Curve')
    plt.show()
train_X, train_y, test_X, test_y = load_data('d_and_d_sci.csv')
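# Optional sanity check on split sizes and class balance (a sketch):
# print(len(train_X), len(test_X), train_y.mean(), test_y.mean())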
# Run grid search to optimize model hyperparameters
# def layer_combinations(layer_sizes, depth_range):
#     def add_combinations(acc, n):
#         return acc + list(itertools.product(layer_sizes, repeat=n))
#     return reduce(add_combinations, depth_range, [])
# param_grid = {
#     'hidden_layer_sizes': layer_combinations([8, 16, 32, 64], range(2, 5)),
#     'batch_size': [4, 8, 16, 32],
# }
# mlp = MLPClassifier(solver='adam', activation='relu', max_iter=500)
# clf = GridSearchCV(mlp, param_grid, cv=3, n_jobs=10, verbose=100)
# clf.fit(train_X, train_y)
# print([(k, v[clf.best_index_]) for k, v in clf.cv_results_.items()])
# [('mean_fit_time', 14.796302636464437),
#  ('std_fit_time', 4.5226648190155645),
#  ('mean_score_time', 0.003503084182739258),
#  ('std_score_time', 0.0004896468711063906),
#  ('param_batch_size', 16),
#  ('param_hidden_layer_sizes', (8, 64)),
#  ('params', {'batch_size': 16, 'hidden_layer_sizes': (8, 64)}),
#  ('split0_test_score', 0.7054342305738954),
#  ('split1_test_score', 0.6991869918699187),
#  ('split2_test_score', 0.7078252032520326),
#  ('mean_test_score', 0.7041488085652823),
#  ('std_test_score', 0.0036417858508970267),
#  ('rank_test_score', 1)]
scorer = MLPClassifier(solver='adam', activation='relu', max_iter=500, batch_size=16, hidden_layer_sizes=(8,64))
scorer.fit(train_X, train_y)
evaluate(scorer, test_X, test_y)
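# If the calibration curve looked off, the fitted scorer could be wrapped in
# CalibratedClassifierCV; a sketch, not run here (ideally fit on a held-out
# calibration set rather than the test set):
# calibrated = calibration.CalibratedClassifierCV(scorer, method='isotonic', cv='prefit')
# calibrated.fit(test_X, test_y)
# evaluate(calibrated, test_X, test_y)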
def random_search(predictor, base_stats, num_points, n_samples):
    # Sample allocations of num_points across the six stats from a Dirichlet,
    # round to integers, and keep only samples that still sum to num_points
    increases = np.unique(
        np.rint(
            np.random.dirichlet(
                alpha=(1.5, 1.5, 1.5, 1.5, 1.5, 1.5),
                size=n_samples
            ) * num_points
        ),
        axis=0
    )
    df = pd.DataFrame(increases, columns=['cha', 'con', 'dex', 'int', 'str', 'wis'])
    df = df[(df['cha'] + df['con'] + df['dex'] + df['int'] + df['str'] + df['wis']) == num_points]
    df = df + base_stats
    # Discard builds that push any stat past 20
    df = df[(df['cha'] < 21) & (df['con'] < 21) & (df['dex'] < 21) & (df['int'] < 21) & (df['str'] < 21) & (df['wis'] < 21)]
    df['score'] = predictor.predict_proba(df)[:, 1]
    return df.sort_values('score', ascending=False)
print('Top 10 scores selected by random search: ', random_search(scorer, (4, 14, 13, 13, 6, 12), 10, 100000).head(10))
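# An alternative sampler that sums to num_points exactly, avoiding the
# round-then-filter step above (a sketch using a uniform multinomial draw):
# increases = np.unique(
#     np.random.multinomial(num_points, [1 / 6] * 6, size=n_samples), axis=0)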
def greedy_search(predictor, base_stats, num_points):
    # Spend one point at a time, always on the stat whose +1 most improves the score
    if num_points == 0:
        return base_stats, predictor.predict_proba([base_stats])[:, 1]
    candidates = pd.DataFrame(np.eye(len(base_stats)) + base_stats)
    for col in candidates.columns:
        candidates = candidates[candidates[col] < 21]
    candidates['score'] = predictor.predict_proba(candidates)[:, 1]
    candidates = candidates.sort_values('score', ascending=False)
    best_candidate = tuple(candidates.iloc[0][:-1])
    return greedy_search(predictor, best_candidate, num_points - 1)
print('Top score selected by greedy search: ', greedy_search(scorer, (4, 14, 13, 13, 6, 12), 10))
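# Note: greedy search only evaluates len(base_stats) candidates per point, so it
# is fast but can miss allocations where stats only pay off in combination.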
def compile_candidates(n_stats, depth, candidates=None):
    # Recursively enumerate every way to allocate `depth` points across n_stats
    # (duplicates are possible and are tolerated downstream)
    if depth == 0:
        return np.array(list(candidates))
    allocations = np.eye(n_stats)
    if candidates is None:
        return compile_candidates(n_stats, depth - 1, allocations)
    else:
        return compile_candidates(n_stats, depth - 1, map(sum, itertools.product(allocations, candidates)))
def lookahead_search(predictor, base_stats, num_points, max_step=5, kept_candidates=10):
    # Beam search: spend up to max_step points per round, enumerating all
    # allocations for that round, then keep only the kept_candidates best builds
    remaining_points = num_points
    candidates = None
    while remaining_points > 0:
        point_spend = min(max_step, remaining_points)
        remaining_points -= point_spend
        new_candidates = compile_candidates(len(base_stats), point_spend)
        if candidates is None:
            candidates = pd.DataFrame(new_candidates + base_stats)
        else:
            candidates = pd.DataFrame(np.unique(list(map(sum, itertools.product(new_candidates, candidates))), axis=0))
        for col in candidates.columns:
            candidates = candidates[candidates[col] < 21]
        candidates['score'] = predictor.predict_proba(candidates)[:, 1]
        candidates = candidates.sort_values('score', ascending=False)
        candidates = np.array(candidates.iloc[:kept_candidates, :-1])
    return list(zip(candidates, predictor.predict_proba(candidates)[:, 1]))
print('Top 10 scores selected by lookahead search: ', lookahead_search(scorer, (4, 14, 13, 13, 6, 12), 10, max_step=7, kept_candidates=10))
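# lookahead_search generalizes the greedy approach: max_step points are
# enumerated exhaustively per round and kept_candidates acts as the beam width.
# Setting max_step=num_points makes the search exhaustive in a single round, e.g.:
# lookahead_search(scorer, (4, 14, 13, 13, 6, 12), 10, max_step=10)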

Comment from @Deccludor:

Metrics for best predictor on test data:
[figures: confusion_matrix, roc_curve, score_distribution, calibration_curve]