glemaitre/adult_qt.py

## adult_qt.py
"""
This is a real-world example using the Adult Census dataset, available at:
https://archive.ics.uci.edu/ml/datasets/Adult

It shows that adding a smoothing noise does not have any influence on the
classification performance but allows for a better understanding when manually
checking the QuantileTransformer.
"""
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

N_QUANTILES = 1000
# Column position of the "hours per week" feature among the selected features.
HOURS_PER_WEEK = 5


def _style_axes(ax):
    """Apply the look shared by every plot of this script to *ax*.

    Hides the top/right spines, moves the left/bottom spines outward, and
    sets the axis labels and the legend.
    """
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    for side in ('left', 'bottom'):
        ax.spines[side].set_position(('outward', 10))
    ax.set_xlabel('Features value')
    ax.set_ylabel('Associated quantiles')
    ax.legend(loc="lower right")


# Load the six numerical features plus the target (last selected column).
# The UCI `adult.data` file ships without a header row, so `header=None` is
# required; otherwise pandas would consume the first sample as column names.
usecols = (0, 2, 4, 10, 11, 12, 14)
data = pd.read_csv('adult.data', usecols=usecols, header=None)
X = data.iloc[:, :-1]
lc = LabelEncoder()
y = lc.fit_transform(data.iloc[:, -1])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33,
                                                    random_state=42)

# Three pipelines to compare: no transformation, quantile transformation,
# and quantile transformation with smoothing noise.
# NOTE(review): `smoothing_noise` does not appear to be a parameter of the
# QuantileTransformer shipped in released scikit-learn; this script seems to
# target a development branch -- confirm before running.
pipeline_notrans = make_pipeline(LogisticRegression(random_state=0))
pipeline_trans = make_pipeline(
    QuantileTransformer(n_quantiles=N_QUANTILES,
                        output_distribution="uniform",
                        random_state=42),
    LogisticRegression(random_state=0))
pipeline_trans_noise = make_pipeline(
    QuantileTransformer(n_quantiles=N_QUANTILES,
                        output_distribution="uniform",
                        smoothing_noise=1e-12,
                        random_state=42),
    LogisticRegression(random_state=0))

print('LR classification: score = {}'.format(
    pipeline_notrans.fit(X_train, y_train).score(X_test, y_test)))
print('Transformer without smoothing noise + LR classification: '
      ' score = {}'.format(
          pipeline_trans.fit(X_train, y_train).score(X_test, y_test)))
print('Transformer with smoothing noise + LR classification: '
      ' score = {}'.format(
          pipeline_trans_noise.fit(X_train, y_train).score(X_test, y_test)))

qt_trans = pipeline_trans.named_steps['quantiletransformer']
qt_trans_noise = pipeline_trans_noise.named_steps['quantiletransformer']

# Quantile levels plotted against the learned quantiles; computed once since
# they are identical for every feature.
quantile_levels = np.linspace(0, 1, N_QUANTILES)

# One subplot per feature: learned quantiles with and without noise.
f, axarr = plt.subplots(3, 2)
axarr = np.ravel(axarr)
for quantile, quantile_noise, ax in zip(qt_trans.quantiles_.T,
                                        qt_trans_noise.quantiles_.T,
                                        axarr):
    ax.plot(quantile, quantile_levels, 'b--', label='Without noise')
    ax.plot(quantile_noise, quantile_levels, 'r:', label='With noise')
    _style_axes(ax)

plt.tight_layout()

# Feature #6 --- number of hours per week --- seems to be a good candidate to
# illustrate how a smoothing noise helps to interpret some results.

# create a typical sample to be transformed
X1 = np.reshape([50, 30000, 10, 10000, 2000, 40], (1, -1))
X1t = qt_trans.transform(X1)
X1t_noise = qt_trans_noise.transform(X1)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

ax.plot(qt_trans.quantiles_.T[HOURS_PER_WEEK], quantile_levels)
ax.scatter(X1[0, HOURS_PER_WEEK], X1t[0, HOURS_PER_WEEK], c='r',
           label=r'Not smoothed -> $f({0}) = {1:.2f}$'.format(
               X1[0, HOURS_PER_WEEK], X1t[0, HOURS_PER_WEEK]))
ax.scatter(X1[0, HOURS_PER_WEEK], X1t_noise[0, HOURS_PER_WEEK], c='g',
           label=r'Smoothed -> $f({0}) = {1:.2f}$'.format(
               X1[0, HOURS_PER_WEEK], X1t_noise[0, HOURS_PER_WEEK]))
_style_axes(ax)
ax.set_ylim([0, 1])
ax.set_xlim([0, 100])
ax.set_title('Number of hours worked per week')

plt.tight_layout()
plt.show()
	"""
	This is a real-world example using the Adult Census dataset, available at:
	https://archive.ics.uci.edu/ml/datasets/Adult

	It shows that adding a smoothing noise does not have any influence on the
	classification performance but allows for a better understanding when manually
	checking the QuantileTransformer.
	"""
	import numpy as np
	import pandas as pd

	import matplotlib.pyplot as plt

	from sklearn.preprocessing import LabelEncoder
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import QuantileTransformer
	from sklearn.linear_model import LogisticRegression
	from sklearn.pipeline import make_pipeline

	N_QUANTILES = 1000
	# Column position of the "hours per week" feature among the selected features.
	HOURS_PER_WEEK = 5


	def _style_axes(ax):
	    """Apply the look shared by every plot of this script to *ax*.

	    Hides the top/right spines, moves the left/bottom spines outward, and
	    sets the axis labels and the legend.
	    """
	    for side in ('top', 'right'):
	        ax.spines[side].set_visible(False)
	    ax.get_xaxis().tick_bottom()
	    ax.get_yaxis().tick_left()
	    for side in ('left', 'bottom'):
	        ax.spines[side].set_position(('outward', 10))
	    ax.set_xlabel('Features value')
	    ax.set_ylabel('Associated quantiles')
	    ax.legend(loc="lower right")


	# Load the six numerical features plus the target (last selected column).
	# The UCI `adult.data` file ships without a header row, so `header=None` is
	# required; otherwise pandas would consume the first sample as column names.
	usecols = (0, 2, 4, 10, 11, 12, 14)
	data = pd.read_csv('adult.data', usecols=usecols, header=None)
	X = data.iloc[:, :-1]
	lc = LabelEncoder()
	y = lc.fit_transform(data.iloc[:, -1])

	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.33,
	                                                    random_state=42)

	# Three pipelines to compare: no transformation, quantile transformation,
	# and quantile transformation with smoothing noise.
	# NOTE(review): `smoothing_noise` does not appear to be a parameter of the
	# QuantileTransformer shipped in released scikit-learn; this script seems to
	# target a development branch -- confirm before running.
	pipeline_notrans = make_pipeline(LogisticRegression(random_state=0))
	pipeline_trans = make_pipeline(
	    QuantileTransformer(n_quantiles=N_QUANTILES,
	                        output_distribution="uniform",
	                        random_state=42),
	    LogisticRegression(random_state=0))
	pipeline_trans_noise = make_pipeline(
	    QuantileTransformer(n_quantiles=N_QUANTILES,
	                        output_distribution="uniform",
	                        smoothing_noise=1e-12,
	                        random_state=42),
	    LogisticRegression(random_state=0))

	print('LR classification: score = {}'.format(
	    pipeline_notrans.fit(X_train, y_train).score(X_test, y_test)))
	print('Transformer without smoothing noise + LR classification: '
	      ' score = {}'.format(
	          pipeline_trans.fit(X_train, y_train).score(X_test, y_test)))
	print('Transformer with smoothing noise + LR classification: '
	      ' score = {}'.format(
	          pipeline_trans_noise.fit(X_train, y_train).score(X_test, y_test)))

	qt_trans = pipeline_trans.named_steps['quantiletransformer']
	qt_trans_noise = pipeline_trans_noise.named_steps['quantiletransformer']

	# Quantile levels plotted against the learned quantiles; computed once since
	# they are identical for every feature.
	quantile_levels = np.linspace(0, 1, N_QUANTILES)

	# One subplot per feature: learned quantiles with and without noise.
	f, axarr = plt.subplots(3, 2)
	axarr = np.ravel(axarr)
	for quantile, quantile_noise, ax in zip(qt_trans.quantiles_.T,
	                                        qt_trans_noise.quantiles_.T,
	                                        axarr):
	    ax.plot(quantile, quantile_levels, 'b--', label='Without noise')
	    ax.plot(quantile_noise, quantile_levels, 'r:', label='With noise')
	    _style_axes(ax)

	plt.tight_layout()

	# Feature #6 --- number of hours per week --- seems to be a good candidate to
	# illustrate how a smoothing noise helps to interpret some results.

	# create a typical sample to be transformed
	X1 = np.reshape([50, 30000, 10, 10000, 2000, 40], (1, -1))
	X1t = qt_trans.transform(X1)
	X1t_noise = qt_trans_noise.transform(X1)

	fig = plt.figure()
	ax = fig.add_subplot(1, 1, 1)

	ax.plot(qt_trans.quantiles_.T[HOURS_PER_WEEK], quantile_levels)
	ax.scatter(X1[0, HOURS_PER_WEEK], X1t[0, HOURS_PER_WEEK], c='r',
	           label=r'Not smoothed -> $f({0}) = {1:.2f}$'.format(
	               X1[0, HOURS_PER_WEEK], X1t[0, HOURS_PER_WEEK]))
	ax.scatter(X1[0, HOURS_PER_WEEK], X1t_noise[0, HOURS_PER_WEEK], c='g',
	           label=r'Smoothed -> $f({0}) = {1:.2f}$'.format(
	               X1[0, HOURS_PER_WEEK], X1t_noise[0, HOURS_PER_WEEK]))
	_style_axes(ax)
	ax.set_ylim([0, 1])
	ax.set_xlim([0, 100])
	ax.set_title('Number of hours worked per week')

	plt.tight_layout()
	plt.show()