# armgilles/tsne-transform.py

## tsne-transform.py
# Author: HJ van Veen <info@mlwave.com>
# Description: Experiment to learn a tSNE transformer for new
#              test data with a multi-output GBM
#
# Idea first seen at lvdmaaten.github.io/tsne
# > [...] it is not possible to embed test points in an existing
# > map [...]
# > A potential approach to deal with this would be to train
# > a multivariate regressor to predict the map location from
# > the input data.
#
# Part of code adapted from Fabian Pedregosa, Olivier Grisel,
#                           Mathieu Blondel, Gael Varoquaux,
# originally licensed under "BSD 3 clause (C) INRIA 2011".

# Original script from @MLWave: https://gist.github.com/MLWave/4a3f8b0fee43d45646cf118bda4d202a

from sklearn import (manifold, datasets, preprocessing, model_selection,
                     decomposition, metrics, multioutput)
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import numpy as np

# Extra imports used by the error analysis and plotting further down.
import matplotlib as mpl
import matplotlib.cm as cmx
import matplotlib.colors as colors
import pandas as pd

# For data we use 6 different digit classes of 8x8 pixels
digits = datasets.load_digits(n_class=6)
X = digits.data  # (1083, 64)
y = digits.target  # (1083, )

# Split the data into a 67% train and a 33% test set.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=0)

# First, PCA 2-D (which has .transform()) to illustrate and evaluate
lens = decomposition.PCA(n_components=2, random_state=0)
X_lens_train = lens.fit_transform(X_train)
X_lens_test = lens.transform(X_test)

# Normalize the lens within 0-1 (scaler is fitted on the train rows only,
# so test values may fall slightly outside that range)
scaler = preprocessing.MinMaxScaler()
X_lens_train = scaler.fit_transform(X_lens_train)
X_lens_test = scaler.transform(X_lens_test)

# Fit a model and predict the lens values from the original features
model = XGBRegressor(n_estimators=2000, max_depth=20, learning_rate=0.01)
model = multioutput.MultiOutputRegressor(model)
model.fit(X_train, X_lens_train)
preds = model.predict(X_test)

# Evaluate exhaustively
print("PREDICTION\t\tGROUND TRUTH")
for p, g in zip(preds, X_lens_test):
    print(p, g)
print("MAE", metrics.mean_absolute_error(X_lens_test, preds))

# Now TSNE (which has no .transform()) and a visual evaluation.
# The test points need a reference position in the SAME t-SNE map as the
# training points, otherwise "true vs predicted" compares unrelated
# coordinate systems (previously the PCA lens of the test set was reused
# here). Train and test rows are therefore embedded together in one map;
# the regressor below is still fitted on the train rows only.
# Caveat: this makes the map itself transductive (test points influence
# where the train points end up).
n_train = X_train.shape[0]
lens = manifold.TSNE(n_components=2, init='pca', random_state=0)
X_lens_all = lens.fit_transform(np.vstack([X_train, X_test]))

# Normalize the lens within 0-1, same protocol as for PCA:
# fit the scaler on the train rows, apply it to the test rows.
X_lens_train = scaler.fit_transform(X_lens_all[:n_train])
X_lens_test = scaler.transform(X_lens_all[n_train:])

# Fit a model and predict the lens values from the original features
model.fit(X_train, X_lens_train)
X_tsne = model.predict(X_test)


# New part: per-point error between reference and predicted map position
result = pd.DataFrame(
    {'x_true': X_lens_test[:, 0], 'x_pred': X_tsne[:, 0],
     'y_true': X_lens_test[:, 1], 'y_pred': X_tsne[:, 1]},
    columns=['x_true', 'x_pred', 'y_true', 'y_pred'])

# Compute the error (MAE) for each point: the mean of the absolute x and y
# deviations. Vectorised equivalent of calling
# metrics.mean_absolute_error([x_true, y_true], [x_pred, y_pred]) per row.
result['mae'] = ((result['x_true'] - result['x_pred']).abs()
                 + (result['y_true'] - result['y_pred']).abs()) / 2

# Visualize error and direction
mae_q_8 = result['mae'].quantile(0.8)

cmap = plt.cm.viridis

# Colour scale starts at the 0.70 quantile, i.e. a bit below the plotting
# threshold (0.8 quantile), and ends at the largest error.
cNorm = colors.Normalize(vmin=result['mae'].quantile(0.70),
                         vmax=result['mae'].max())

scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cmap)

fig = plt.figure()

# Main axes on the left, narrow axes on the right for the colorbar
ax = fig.add_axes([0.1, 0.1, 0.7, 0.85])
axc = fig.add_axes([0.85, 0.10, 0.05, 0.85])

# We take only errors >= quantile 0.8 MAE
point_mae = result['mae'].values
for i in np.flatnonzero(point_mae >= mae_q_8):
    label = str(y_test[i])
    tx, ty = X_lens_test[i, 0], X_lens_test[i, 1]
    px, py = X_tsne[i, 0], X_tsne[i, 1]

    # Digit label at the reference position (small) and at the predicted
    # position (larger). The two divisors (7. and 7.3) are kept as in the
    # original script; for some digits they pick different Set1 colours.
    ax.text(tx, ty, label,
            color=plt.cm.Set1(y_test[i] / 7.),
            fontdict={'weight': 'bold', 'size': 8})
    ax.text(px, py, label,
            color=plt.cm.Set1(y_test[i] / 7.3),
            fontdict={'weight': 'bold', 'size': 10})

    # Arrow from the reference position to the predicted one, coloured
    # by the size of the error
    colorVal = scalarMap.to_rgba(point_mae[i])
    ax.arrow(tx, ty, px - tx, py - ty,
             alpha=0.4, length_includes_head=True,
             facecolor=colorVal,
             edgecolor=colorVal)

ax.set_title("Predicting t-SNE transformations with GBM VS TSNE errors >= 0.8 quantile")
cb1 = mpl.colorbar.ColorbarBase(axc, cmap=cmap, norm=cNorm, orientation='vertical')

# Without this the figure is built but never displayed when run as a script
plt.show()
	# Author: HJ van Veen <info@mlwave.com>
	# Description: Experiment to learn a tSNE transformer for new
	# test data with a multi-output GBM
	#
	# Idea first seen at lvdmaaten.github.io/tsne
	# > [...] it is not possible to embed test points in an existing
	# > map [...]
	# > A potential approach to deal with this would be to train
	# > a multivariate regressor to predict the map location from
	# > the input data.
	#
	# Part of code adapted from Fabian Pedregosa, Olivier Grisel,
	# Mathieu Blondel, Gael Varoquaux,
	# originally licensed under "BSD 3 clause (C) INRIA 2011".

	# Original script from @MLWave :https://gist.github.com/MLWave/4a3f8b0fee43d45646cf118bda4d202a

	# Second copy of the script. Its loop / if bodies had been flattened to one indent level; block structure is restored below.
from sklearn import (manifold, datasets, preprocessing, model_selection,
                     decomposition, metrics, multioutput)
from xgboost import XGBRegressor
import matplotlib as mpl
import matplotlib.cm as cmx
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# For data we use 6 different digit classes of 8x8 pixels
digits = datasets.load_digits(n_class=6)
X = digits.data  # (1083, 64)
y = digits.target  # (1083, )

# Split the data into a 67% train and a 33% test set.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33, random_state=0)

# First, PCA 2-D (which has .transform()) to illustrate and evaluate
lens = decomposition.PCA(n_components=2, random_state=0)
X_lens_train = lens.fit_transform(X_train)
X_lens_test = lens.transform(X_test)

# Normalize the lens within 0-1 (scaler is fitted on the train rows only,
# so test values may fall slightly outside that range)
scaler = preprocessing.MinMaxScaler()
X_lens_train = scaler.fit_transform(X_lens_train)
X_lens_test = scaler.transform(X_lens_test)

# Fit a model and predict the lens values from the original features
model = XGBRegressor(n_estimators=2000, max_depth=20, learning_rate=0.01)
model = multioutput.MultiOutputRegressor(model)
model.fit(X_train, X_lens_train)
preds = model.predict(X_test)

# Evaluate exhaustively
print("PREDICTION\t\tGROUND TRUTH")
for p, g in zip(preds, X_lens_test):
    print(p, g)
print("MAE", metrics.mean_absolute_error(X_lens_test, preds))

# Now TSNE (which has no .transform()) and a visual evaluation.
# The test points need a reference position in the SAME t-SNE map as the
# training points, otherwise "true vs predicted" compares unrelated
# coordinate systems (previously the PCA lens of the test set was reused
# here). Train and test rows are therefore embedded together in one map;
# the regressor below is still fitted on the train rows only.
# Caveat: this makes the map itself transductive (test points influence
# where the train points end up).
n_train = X_train.shape[0]
lens = manifold.TSNE(n_components=2, init='pca', random_state=0)
X_lens_all = lens.fit_transform(np.vstack([X_train, X_test]))

# Normalize the lens within 0-1, same protocol as for PCA:
# fit the scaler on the train rows, apply it to the test rows.
X_lens_train = scaler.fit_transform(X_lens_all[:n_train])
X_lens_test = scaler.transform(X_lens_all[n_train:])

# Fit a model and predict the lens values from the original features
model.fit(X_train, X_lens_train)
X_tsne = model.predict(X_test)


# New part: per-point error between reference and predicted map position
result = pd.DataFrame(
    {'x_true': X_lens_test[:, 0], 'x_pred': X_tsne[:, 0],
     'y_true': X_lens_test[:, 1], 'y_pred': X_tsne[:, 1]},
    columns=['x_true', 'x_pred', 'y_true', 'y_pred'])

# Compute the error (MAE) for each point: the mean of the absolute x and y
# deviations. Vectorised equivalent of calling
# metrics.mean_absolute_error([x_true, y_true], [x_pred, y_pred]) per row.
result['mae'] = ((result['x_true'] - result['x_pred']).abs()
                 + (result['y_true'] - result['y_pred']).abs()) / 2

# Visualize error and direction
mae_q_8 = result['mae'].quantile(0.8)

cmap = plt.cm.viridis

# Colour scale starts at the 0.70 quantile, i.e. a bit below the plotting
# threshold (0.8 quantile), and ends at the largest error.
cNorm = colors.Normalize(vmin=result['mae'].quantile(0.70),
                         vmax=result['mae'].max())

scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cmap)

fig = plt.figure()

# Main axes on the left, narrow axes on the right for the colorbar
ax = fig.add_axes([0.1, 0.1, 0.7, 0.85])
axc = fig.add_axes([0.85, 0.10, 0.05, 0.85])

# We take only errors >= quantile 0.8 MAE
point_mae = result['mae'].values
for i in np.flatnonzero(point_mae >= mae_q_8):
    label = str(y_test[i])
    tx, ty = X_lens_test[i, 0], X_lens_test[i, 1]
    px, py = X_tsne[i, 0], X_tsne[i, 1]

    # Digit label at the reference position (small) and at the predicted
    # position (larger). The two divisors (7. and 7.3) are kept as in the
    # original script; for some digits they pick different Set1 colours.
    ax.text(tx, ty, label,
            color=plt.cm.Set1(y_test[i] / 7.),
            fontdict={'weight': 'bold', 'size': 8})
    ax.text(px, py, label,
            color=plt.cm.Set1(y_test[i] / 7.3),
            fontdict={'weight': 'bold', 'size': 10})

    # Arrow from the reference position to the predicted one, coloured
    # by the size of the error
    colorVal = scalarMap.to_rgba(point_mae[i])
    ax.arrow(tx, ty, px - tx, py - ty,
             alpha=0.4, length_includes_head=True,
             facecolor=colorVal,
             edgecolor=colorVal)

ax.set_title("Predicting t-SNE transformations with GBM VS TSNE errors >= 0.8 quantile")
cb1 = mpl.colorbar.ColorbarBase(axc, cmap=cmap, norm=cNorm, orientation='vertical')

# Without this the figure is built but never displayed when run as a script
plt.show()