Created
April 5, 2016 03:11
-
-
Save jameskyle/fb984c34010817f62f21f57bde415c51 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib | |
matplotlib.use('Agg') | |
import matplotlib.pyplot as plt | |
import seaborn | |
import os | |
import re | |
import util | |
import pandas as pd | |
import numpy as np | |
import datetime as dt | |
import KNNLearner as knn | |
import BagLearner as bg | |
import LinRegLearner as lrl | |
import best4KNN as bk | |
import best4linreg as bl | |
from collections import namedtuple | |
from mpl_toolkits.mplot3d import Axes3D | |
class Report(object):
    """Train learners on several datasets and collect/plot their error metrics.

    Every learner is trained on a random split of every dataset for a number
    of trials.  For each (trial, dataset, learner) combination two rows are
    written to ``self.results``: one for the in-sample ('train') query and
    one for the out-of-sample ('test') query.
    """

    # Column order must match the tuple written by ``addrow``.  Note the
    # positional pairing: 'Trial' holds the 'train'/'test' tag, 'iter' holds
    # the number parsed from a ``_part<N>`` dataset-name suffix, 'runtime'
    # holds the query time in seconds and 'kfold' holds the trial number.
    COLUMNS = (
        'Model',
        'Bagged Model',
        'k',
        'bags',
        'Data',
        'Trial',
        'RMSE',
        'Corr',
        'Samples',
        'Features',
        'iter',
        'runtime',
        'kfold',
    )

    # Compiled once instead of on every ``addrow`` call.
    _NOISE_PATTERN = re.compile(r'.*_part(\d+)$')

    # Lightweight container for one half of a train/test split.
    _Data = namedtuple('Data', ['x', 'y'])

    def __init__(self, learners, datasets, train=0.6, trials=10, verbose=False):
        """Load the datasets and allocate the results table.

        Args:
            learners: learner objects; ``run`` calls ``addEvidence(x, y)`` and
                ``query(x)`` on each of them.
            datasets: iterable of paths to comma-delimited files.  Each file
                is read with ``np.genfromtxt`` and passed through
                ``util.normalize``; it is stored under its base filename
                without extension.
            train: fraction of rows placed in the training partition.
            trials: number of times the full learner/dataset sweep is repeated.
            verbose: when true, ``log`` prints progress messages.
        """
        self.learners = learners
        self.datasets = {
            self._data_name(d): util.normalize(np.genfromtxt(d, delimiter=","))
            for d in datasets
        }
        self.verbose = verbose
        self.train = train
        # Set before the table is sized: ``run`` writes numtests * trials rows.
        self.trials = trials
        self.results = pd.DataFrame(
            columns=self.COLUMNS,
            index=range(self.numtests * self.trials),
        )

    @property
    def numtests(self):
        """Rows produced per trial: a train and a test row per learner per dataset."""
        return len(self.learners) * len(self.datasets) * 2

    def _data_name(self, filename):
        """Return the base name of ``filename`` with directory and extension removed."""
        return os.path.splitext(os.path.basename(filename))[0]

    def run(self):
        """Train and evaluate every learner on every dataset for every trial.

        Each dataset is re-partitioned once per trial; all learners in that
        trial share the same split.
        """
        total_rows = self.numtests * self.trials
        row = 0
        for tnum in range(1, self.trials + 1):
            # ``items()`` works on both Python 2 and 3 (``iteritems`` is 2-only).
            for name, data in self.datasets.items():
                train, test = self.partition(data)
                for learner in self.learners:
                    # Progress is counted in result rows, two per learner.
                    self.log("Training {0} of {1}".format(row + 2, total_rows))
                    learner.addEvidence(train.x, train.y)
                    # In sample first, then out of sample.
                    for tag, split in (('train', train), ('test', test)):
                        rmse, corr, seconds = self.predict(learner, split)
                        self.addrow(row, name, learner, tag, rmse, corr,
                                    split, seconds, tnum)
                        row += 1

    def log(self, msg):
        """Print ``msg`` when the report was created with ``verbose=True``."""
        if self.verbose:
            print(msg)

    def addrow(self, index, name, learner, trial, rmse, corr, data, seconds, tnum):
        """Write one result row at ``index`` in ``self.results``.

        Args:
            index: row label to write.
            name: dataset name; a trailing ``_part<N>`` is parsed as an
                integer and stored in the 'iter' column (0 when absent).
            learner: the evaluated learner; its class name goes in 'Model'.
            trial: partition tag ('train' or 'test' as used by ``run``).
            rmse: root-mean-square error of the query.
            corr: correlation between predictions and targets.
            data: the queried partition; only ``data.x.shape`` is read.
            seconds: query duration in seconds.
            tnum: trial number, stored in the 'kfold' column.
        """
        samples = data.x.shape[0]
        features = data.x.shape[1]
        model = learner.__class__.__name__
        is_bagged = isinstance(learner, bg.BagLearner)

        if isinstance(learner, knn.KNNLearner):
            k = learner.k
        elif is_bagged:
            # A bagged learner that is not KNN has no 'k'; record NaN rather
            # than raising KeyError.
            k = learner.kwargs.get('k', np.nan)
        else:
            k = np.nan

        if is_bagged:
            bags = learner.num_bags
            wrapped = learner.learner
            # ``learner.learner`` may be a class (as passed by the script at
            # the bottom of this file) or an instance; name it correctly in
            # both cases instead of recording 'type' for a class.
            if isinstance(wrapped, type):
                lname = wrapped.__name__
            else:
                lname = wrapped.__class__.__name__
        else:
            bags = np.nan
            lname = np.nan

        match = self._NOISE_PATTERN.match(name)
        noise = int(match.group(1)) if match else 0

        self.results.loc[index] = (
            model,
            lname,
            k,
            bags,
            name,
            trial,
            rmse,
            corr,
            samples,
            features,
            noise,
            seconds,
            tnum,
        )

    def predict(self, learner, data):
        """Query ``learner`` with ``data.x`` and score it against ``data.y``.

        Returns:
            Tuple ``(rmse, corr, seconds)`` where ``seconds`` is the
            wall-clock duration of the ``query`` call only.
        """
        start = dt.datetime.now()
        predictions = learner.query(data.x)
        end = dt.datetime.now()
        seconds = (end - start).total_seconds()
        rmse = self.rms(data.y, predictions)
        corr = np.corrcoef(predictions, y=data.y)[0, 1]
        return rmse, corr, seconds

    def rms(self, y1, y2):
        """Return the root-mean-square difference between ``y1`` and ``y2``."""
        # ||y1 - y2|| / sqrt(n) == sqrt(mean((y1 - y2) ** 2))
        residual = np.asarray(y1) - np.asarray(y2)
        return np.linalg.norm(residual) / np.sqrt(len(y1))

    def partition(self, data):
        """Randomly split ``data`` into train and test partitions.

        The last column is used as the target ``y`` and all preceding columns
        as the features ``x``.  The first ``floor(self.train * rows)`` shuffled
        rows form the training partition; the rest form the test partition.

        The input array is left untouched: rows are shuffled in a copy so the
        datasets held by the report are not reordered between trials.

        Returns:
            Tuple ``(train, test)`` of namedtuples with fields ``x`` and ``y``.
        """
        part = int(np.floor(self.train * data.shape[0]))
        shuffled = np.random.permutation(data)
        train = self._Data(x=shuffled[:part, 0:-1], y=shuffled[:part, -1])
        test = self._Data(x=shuffled[part:, 0:-1], y=shuffled[part:, -1])
        return train, test

    @staticmethod
    def _save_figure(fig, filename):
        """Save ``fig`` to ``filename``, creating its directory, then close it."""
        directory = os.path.dirname(filename)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        fig.savefig(filename)
        # Release the figure; pyplot otherwise keeps every figure alive.
        plt.close(fig)

    @staticmethod
    def _plot_mean_rmse(grp, by, ax, label):
        """Draw the mean RMSE of ``grp`` per distinct value of column ``by``."""
        mean_rmse = grp.groupby(by)['RMSE'].apply(lambda s: s.mean())
        # Label the line directly so legend entries cannot drift out of step
        # with a separately maintained list of labels.
        mean_rmse.plot(ax=ax, kind='line', label=label)

    @staticmethod
    def _add_legend(ax):
        """Attach a legend built from the labelled artists on ``ax``."""
        handles, labels = ax.get_legend_handles_labels()
        ax.legend(handles, labels, loc='best')

    def _plot_best4(self):
        """Plot KNN mean RMSE per k against the LinReg mean RMSE.

        One figure is written per dataset named 'best4knn' / 'best4linreg',
        to ``images/<dataset>_error.png``.
        """
        df = self.results
        for best in ('best4knn', 'best4linreg'):
            fig, ax = plt.subplots()
            subset = df[df['Data'] == best]
            for (model, trial), grp in subset.groupby(['Model', 'Trial']):
                if model == "KNNLearner":
                    self._plot_mean_rmse(
                        grp, 'k', ax, "mean knn {0}".format(trial))
                elif model == 'LinRegLearner':
                    # LinReg has no k, so it is drawn as a horizontal reference.
                    ax.axhline(
                        grp['RMSE'].mean(),
                        linewidth=2,
                        linestyle="--",
                        label="mean linreg {0}".format(trial),
                    )
            self._add_legend(ax)
            self._save_figure(fig, 'images/{best}_error.png'.format(best=best))

    def _plot_datasets(self):
        """Write a 3-D scatter of the first three columns of every dataset.

        Output goes to ``images/dataset_<name>.png``.  Datasets with fewer
        than three columns raise ``IndexError``.
        """
        for name, dataset in self.datasets.items():
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            columns = dataset.T
            ax.scatter(columns[0], columns[1], columns[2])
            self._save_figure(
                fig, 'images/dataset_{name}.png'.format(name=name))

    def _plot_ripple_knn(self):
        """Plot KNN mean RMSE per k on the 'ripple' dataset, one line per partition tag."""
        df = self.results
        fig, ax = plt.subplots()
        selected = df[(df['Data'] == 'ripple') & (df['Model'] == 'KNNLearner')]
        for trial, grp in selected.groupby('Trial'):
            self._plot_mean_rmse(grp, 'k', ax, "mean {0}".format(trial))
        self._add_legend(ax)
        self._save_figure(fig, "images/ripple_knn_rmse.png")

    def _plot_ripple_bagging(self):
        """Plot bagged-learner mean RMSE per bag count on the 'ripple' dataset.

        Only rows with k in {1, 3, 4, 15} are drawn, one line per
        (partition tag, k) pair.
        """
        df = self.results
        fig, ax = plt.subplots()
        selected = df[
            (df['Data'] == 'ripple')
            & (df['Model'] == 'BagLearner')
            & df['k'].isin([1, 3, 4, 15])
        ]
        for (trial, k), grp in selected.groupby(['Trial', 'k']):
            self._plot_mean_rmse(
                grp, 'bags', ax, "mean {0}: k={1}".format(trial, k))
        self._add_legend(ax)
        self._save_figure(fig, "images/ripple_bagging_rmse.png")

    def plot(self):
        """Write every report figure under ``images/``."""
        self._plot_best4()
        self._plot_datasets()
        self._plot_ripple_knn()
        self._plot_ripple_bagging()

    def save(self, filename="results.pkl"):
        """Pickle the results table to ``filename``."""
        self.results.to_pickle(filename)

    def load(self, filename='results.pkl'):
        """Replace the results table with the pickle stored at ``filename``.

        NOTE: unpickling executes arbitrary code; only load files you trust.
        """
        self.results = pd.read_pickle(filename)
def main():
    """Generate the synthetic datasets, evaluate all learners and plot results.

    Results are loaded from the archived pickle when it exists; otherwise the
    full experiment is run.  Figures are written once, after results are
    available (plotting an empty results table only produces blank images).
    """
    d1 = bk.generate(5)
    d2 = bl.generate(1500)
    bk.save(d1)
    bl.save(d2)

    # One plain KNN learner per k, plus bagged KNN learners for several bag
    # counts at that same k.
    # NOTE(review): the bag loop is nested under the k loop because the
    # bagging plot filters on several k values — confirm against the
    # original indentation.
    learners = []
    for k in range(1, 16):
        learners.append(knn.KNNLearner(k=k))
        for b in [1, 5, 10, 15, 20, 25]:
            learners.append(
                bg.BagLearner(knn.KNNLearner, {'k': k}, b, boost=False))

    report = Report(
        [lrl.LinRegLearner(verbose=True)] + learners,
        [
            # Other available datasets (disabled):
            #   "Data/simple.csv", "Data/3_groups.csv"
            "Data/ripple.csv",
            "Data/best4knn.csv",
            "Data/best4linreg.csv",
        ],
        trials=10,
        verbose=True
    )

    archive = "archive/results-05-10trials.pkl"
    if os.path.exists(archive):
        report.load(filename=archive)
    else:
        # NOTE(review): fresh results are not persisted; call report.save()
        # here if they should survive the process.
        report.run()
    report.plot()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment