# Source: GitHub gist jameskyle/fb984c34010817f62f21f57bde415c51
# Author: @jameskyle -- created April 5, 2016 03:11
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn
import os
import re
import util
import pandas as pd
import numpy as np
import datetime as dt
import KNNLearner as knn
import BagLearner as bg
import LinRegLearner as lrl
import best4KNN as bk
import best4linreg as bl
from collections import namedtuple
from mpl_toolkits.mplot3d import Axes3D
class Report(object):
    """Train a set of learners on several datasets and tabulate/plot errors.

    For every trial, every dataset is shuffled and split into a training and
    a test part; every learner is then trained on the training part and
    scored (RMSE, correlation, query runtime) both in sample and out of
    sample.  One row per (trial, dataset, learner, train|test) is stored in
    ``self.results``.

    Learners are expected to expose ``addEvidence(x, y)`` and ``query(x)``,
    which are the only methods this class calls on them.
    """

    # Column names of ``self.results``.  NOTE: for historical reasons the
    # column called 'iter' holds the dataset's "_part<N>" number (0 if the
    # name has no such suffix) and 'kfold' holds the trial number; see
    # ``addrow``.  The names are kept so previously pickled results still load.
    _COLUMNS = (
        'Model',
        'Bagged Model',
        'k',
        'bags',
        'Data',
        'Trial',
        'RMSE',
        'Corr',
        'Samples',
        'Features',
        'iter',
        'runtime',
        'kfold',
    )

    # Matches dataset names ending in "_part<digits>"; compiled once instead
    # of on every ``addrow`` call.
    _NOISE_RE = re.compile(r'.*_part(\d+)$')

    # Lightweight container for one half of a train/test split.
    _Data = namedtuple('Data', ['x', 'y'])

    def __init__(self, learners, datasets, train=0.6, trials=10, verbose=False):
        """Load the datasets and allocate the result table.

        Args:
            learners: sequence of learner instances to evaluate.
            datasets: iterable of CSV file paths.  Each file is read with
                ``np.genfromtxt(..., delimiter=",")`` and passed through
                ``util.normalize``; it is keyed by its base name without
                extension.
            train: fraction of the rows used for training.
            trials: number of shuffle/split/train/score repetitions.
            verbose: when true, ``log`` prints progress messages.
        """
        self.learners = learners
        self.datasets = {
            self._data_name(path): util.normalize(np.genfromtxt(path, delimiter=","))
            for path in datasets
        }
        self.verbose = verbose
        self.train = train
        self.trials = trials
        # ``run`` writes numtests rows per trial, so size the table for all
        # trials up front rather than relying on row-by-row enlargement.
        self.results = pd.DataFrame(
            columns=self._COLUMNS,
            index=range(self.numtests * self.trials),
        )

    @property
    def numtests(self):
        """Number of result rows produced by a single trial."""
        # one test and one train row for each learner and each dataset
        return len(self.learners) * len(self.datasets) * 2

    def _data_name(self, filename):
        """Return the base name of *filename* without directory or extension."""
        return os.path.splitext(os.path.basename(filename))[0]

    def run(self):
        """Run every trial and fill ``self.results`` row by row."""
        total = self.numtests * self.trials
        row = 0
        for tnum in range(1, self.trials + 1):
            # ``items`` works on both Python 2 and 3 (``iteritems`` is 2-only).
            for name, data in self.datasets.items():
                train, test = self.partition(data)
                for learner in self.learners:
                    # Each learner contributes two rows, hence the count in
                    # steps of two.
                    self.log("Training {0} of {1}".format(row + 2, total))
                    learner.addEvidence(train.x, train.y)
                    # In sample
                    rmse, corr, seconds = self.predict(learner, train)
                    self.addrow(row, name, learner, 'train', rmse, corr,
                                train, seconds, tnum)
                    # Out of sample (test)
                    rmse, corr, seconds = self.predict(learner, test)
                    self.addrow(row + 1, name, learner, 'test', rmse, corr,
                                test, seconds, tnum)
                    row += 2

    def log(self, msg):
        """Print *msg* when the report was created with ``verbose=True``."""
        if self.verbose:
            print(msg)

    def addrow(self, index, name, learner, trial, rmse, corr, data, seconds, tnum):
        """Store one result row at position *index* of ``self.results``.

        Args:
            index: row label to write.
            name: dataset name (a key of ``self.datasets``).
            learner: the learner that produced the scores.
            trial: ``'train'`` or ``'test'`` (stored in the 'Trial' column).
            rmse: root-mean-square error of the predictions.
            corr: correlation between predictions and targets.
            data: the ``x``/``y`` pair that was scored; only its shape is used.
            seconds: query runtime in seconds.
            tnum: trial number (stored in the 'kfold' column).
        """
        samples, features = data.x.shape[0], data.x.shape[1]
        model = learner.__class__.__name__

        if isinstance(learner, knn.KNNLearner):
            k = learner.k
        elif isinstance(learner, bg.BagLearner):
            # A bagged learner that is not k-NN has no 'k'; record NaN
            # instead of raising KeyError.
            # assumes learner.kwargs is a dict -- TODO confirm in BagLearner
            k = learner.kwargs.get('k', np.nan)
        else:
            k = np.nan

        if isinstance(learner, bg.BagLearner):
            bags = learner.num_bags
            lname = learner.learner.__class__.__name__
        else:
            bags = np.nan
            lname = np.nan

        match = self._NOISE_RE.match(name)
        noise = int(match.group(1)) if match else 0

        # Order must follow ``_COLUMNS``: noise -> 'iter', tnum -> 'kfold'.
        self.results.loc[index] = [
            model,
            lname,
            k,
            bags,
            name,
            trial,
            rmse,
            corr,
            samples,
            features,
            noise,
            seconds,
            tnum,
        ]

    def predict(self, learner, data):
        """Query *learner* on ``data.x`` and score it against ``data.y``.

        Returns:
            ``(rmse, corr, seconds)`` where *seconds* is the wall-clock time
            spent inside ``learner.query``.
        """
        start = dt.datetime.now()
        predictions = learner.query(data.x)
        seconds = (dt.datetime.now() - start).total_seconds()
        rmse = self.rms(data.y, predictions)
        corr = np.corrcoef(predictions, y=data.y)[0, 1]
        return rmse, corr, seconds

    def rms(self, y1, y2):
        """Return the root-mean-square difference between *y1* and *y2*."""
        return np.linalg.norm(y1 - y2) / np.sqrt(len(y1))

    def partition(self, data):
        """Shuffle *data* and split it into ``(train, test)``.

        The last column is the target ``y``; all other columns are the
        features ``x``.  The first ``floor(self.train * rows)`` rows after
        shuffling form the training part, the remainder the test part.

        NOTE: the shuffle is done in place, so the rows of the array passed
        in (normally an entry of ``self.datasets``) are reordered, and the
        returned parts are slices of that same array.
        """
        split = int(np.floor(self.train * data.shape[0]))
        np.random.shuffle(data)
        train = self._Data(x=data[:split, 0:-1], y=data[:split, -1])
        test = self._Data(x=data[split:, 0:-1], y=data[split:, -1])
        return train, test

    def _save_figure(self, fig, path):
        """Save *fig* to *path*, creating its directory, then close the figure."""
        directory = os.path.dirname(path)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        fig.savefig(path)
        # Figures are not needed once written; closing them keeps repeated
        # ``plot`` calls from accumulating open figures.
        plt.close(fig)

    def _plot_best4(self):
        """Plot mean RMSE for the 'best4knn' and 'best4linreg' datasets.

        k-NN rows are drawn as mean RMSE per ``k``; linear-regression rows
        as a horizontal line at their mean RMSE.  One image per dataset is
        written to ``images/<dataset>_error.png``.
        """
        df = self.results
        for best in ['best4knn', 'best4linreg']:
            fig, ax = plt.subplots()
            labels = []
            for key, grp in df[df['Data'] == best].groupby(['Model', 'Trial']):
                model, trial = key
                if model == "KNNLearner":
                    grp.groupby('k').apply(lambda x: x['RMSE'].mean()).plot(
                        ax=ax, kind='line', x='k', y='RMSE')
                    labels.append("mean knn {0}".format(trial))
                elif model == 'LinRegLearner':
                    ax.axhline(grp['RMSE'].mean(), linewidth=2,
                               linestyle="--", label=True)
                    labels.append("mean linreg {0}".format(trial))
            lines, _ = ax.get_legend_handles_labels()
            ax.legend(lines, labels, loc='best')
            self._save_figure(fig, 'images/{best}_error.png'.format(best=best))

    def _plot_datasets(self):
        """Write a 3-D scatter of the first three columns of every dataset."""
        for name, dataset in self.datasets.items():
            # A fresh figure per dataset: fixed figure numbers would be
            # reused (and drawn over) if ``plot`` is called more than once.
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            columns = dataset.T
            ax.scatter(columns[0], columns[1], columns[2])
            self._save_figure(fig, 'images/dataset_{name}.png'.format(name=name))

    def _plot_ripple_knn(self):
        """Plot mean k-NN RMSE per ``k`` on the 'ripple' dataset."""
        df = self.results
        fig, ax = plt.subplots()
        labels = []
        selected = df[(df['Data'] == 'ripple') & (df['Model'] == 'KNNLearner')]
        for key, grp in selected.groupby('Trial'):
            ax = grp.groupby('k').apply(lambda x: x['RMSE'].mean()).plot(
                ax=ax, kind='line', x='k', y='RMSE'
            )
            labels.append("mean {0}".format(key))
        lines, _ = ax.get_legend_handles_labels()
        ax.legend(lines, labels, loc='best')
        self._save_figure(fig, "images/ripple_knn_rmse.png")

    def _plot_ripple_bagging(self):
        """Plot mean bagged RMSE per bag count on 'ripple' for selected ``k``."""
        df = self.results
        fig, ax = plt.subplots()
        labels = []
        selected = df[
            ((df['Data'] == 'ripple') & (df['Model'] == 'BagLearner')) &
            (df['k'].isin([1, 3, 4, 15]))
        ]
        for key, grp in selected.groupby(['Trial', 'k']):
            ax = grp.groupby('bags').apply(lambda x: x['RMSE'].mean()).plot(
                ax=ax, kind='line', x='bags', y='RMSE', legend=True)
            labels.append("mean {0}: k={1}".format(key[0], key[1]))
        lines, _ = ax.get_legend_handles_labels()
        ax.legend(lines, labels, loc='best')
        self._save_figure(fig, "images/ripple_bagging_rmse.png")

    def plot(self):
        """Write every plot to the ``images`` directory."""
        self._plot_best4()
        self._plot_datasets()
        self._plot_ripple_knn()
        self._plot_ripple_bagging()

    def save(self, filename='results.pkl'):
        """Pickle ``self.results`` to *filename* (default matches ``load``)."""
        self.results.to_pickle(filename)

    def load(self, filename='results.pkl'):
        """Replace ``self.results`` with the table pickled in *filename*.

        SECURITY: unpickling executes arbitrary code; only load files that
        were produced by ``save`` from a trusted source.
        """
        self.results = pd.read_pickle(filename)
def main():
    """Generate the synthetic datasets, evaluate the learner grid and plot.

    Results are loaded from ``archive/results-05-10trials.pkl`` when that
    file exists; otherwise the full experiment is run.  Plots are written
    once, after the results are available.
    """
    # NOTE(review): presumably these write Data/best4knn.csv and
    # Data/best4linreg.csv, which the report reads below -- confirm in
    # best4KNN / best4linreg.
    d1 = bk.generate(5)
    d2 = bl.generate(1500)
    bk.save(d1)
    bl.save(d2)

    # For every k in 1..15: one plain k-NN learner plus one bagged k-NN
    # learner per bag count.
    learners = []
    for k in range(1, 16):
        learners.append(knn.KNNLearner(k=k))
        for b in [1, 5, 10, 15, 20, 25]:
            learners.append(bg.BagLearner(knn.KNNLearner, {'k': k}, b, boost=False))

    report = Report(
        [lrl.LinRegLearner(verbose=True)] + learners,
        [
            "Data/ripple.csv",
            "Data/best4knn.csv",
            "Data/best4linreg.csv",
        ],
        trials=10,
        verbose=True
    )

    # Fill the result table first; plotting before this point would only
    # render the empty, pre-allocated table.
    archive = "archive/results-05-10trials.pkl"
    if os.path.exists(archive):
        report.load(filename=archive)
    else:
        report.run()
    report.plot()


if __name__ == "__main__":
    main()
# (End of gist; GitHub page footer removed.)