Skip to content

Instantly share code, notes, and snippets.

@glemaitre
Created April 7, 2017 13:42
Show Gist options
  • Save glemaitre/bc51bcd93b4bd4cc960734be9a41dfa2 to your computer and use it in GitHub Desktop.
Save glemaitre/bc51bcd93b4bd4cc960734be9a41dfa2 to your computer and use it in GitHub Desktop.
"""
This is a real case using the data of the Adult Census dataset available at:
https://archive.ics.uci.edu/ml/datasets/Adult
It shows that adding a smoothing noise does not have any influence on the
classification performance but allows for a better understanding when manually
checking the QuantileTransformer.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
# Number of quantiles requested from each QuantileTransformer below.
N_QUANTILES = 1000

# Positional indices of the columns read from the raw file. The last selected
# column (14) is used as the classification target, the others as features.
usecols = (0, 2, 4, 10, 11, 12, 14)

# The UCI 'adult.data' file has no header row. Without ``header=None`` pandas
# would consume the first record as column names and silently drop that sample
# from the dataset.
data = pd.read_csv('adult.data', usecols=usecols, header=None)

# Every selected column except the last one is a feature.
X = data.iloc[:, :-1]

# Encode the target labels (last selected column) as integers.
lc = LabelEncoder()
y = lc.fit_transform(data.iloc[:, -1])

# Keep 33% of the samples aside for evaluation; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=42)
# Baseline: logistic regression fitted directly on the untransformed features.
pipeline_notrans = make_pipeline(LogisticRegression(random_state=0))

# Settings shared by both quantile transformers, so that the smoothing noise
# is the only difference between the two transformed pipelines.
_qt_settings = dict(n_quantiles=N_QUANTILES,
                    output_distribution="uniform",
                    random_state=42)

# Quantile transform (no smoothing noise) followed by logistic regression.
pipeline_trans = make_pipeline(
    QuantileTransformer(**_qt_settings),
    LogisticRegression(random_state=0))

# Same pipeline with a tiny smoothing noise enabled in the transformer.
# NOTE(review): ``smoothing_noise`` does not appear to be accepted by released
# scikit-learn versions of QuantileTransformer -- confirm which build this
# script targets before running it.
pipeline_trans_noise = make_pipeline(
    QuantileTransformer(smoothing_noise=1e-12, **_qt_settings),
    LogisticRegression(random_state=0))
# Fit every pipeline on the training split and print its score on the held-out
# split, in order: baseline, transformer, transformer with smoothing noise.
_reports = (
    ('LR classification: score = {}', pipeline_notrans),
    ('Transformer without smoothing noise + LR classification: '
     ' score = {}', pipeline_trans),
    ('Transformer with smoothing noise + LR classification: '
     ' score = {}', pipeline_trans_noise),
)
for _template, _pipeline in _reports:
    # ``fit`` returns the fitted pipeline, which is then scored directly.
    _fitted = _pipeline.fit(X_train, y_train)
    print(_template.format(_fitted.score(X_test, y_test)))
def _style_quantile_axes(ax):
    """Apply the look shared by every quantile plot of this script to *ax*.

    Hides the top and right spines, keeps the ticks on the bottom and left
    sides only, moves the remaining spines outward, labels both axes and puts
    the legend in the lower-right corner.
    """
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))
    ax.set_xlabel('Features value')
    ax.set_ylabel('Associated quantiles')
    ax.legend(loc="lower right")


# Fitted transformers, looked up under the step name generated by
# ``make_pipeline``.
qt_trans = pipeline_trans.named_steps['quantiletransformer']
qt_trans_noise = pipeline_trans_noise.named_steps['quantiletransformer']

# Quantile levels in [0, 1] used as the y-axis of every curve. The length is
# taken from the fitted ``quantiles_`` (one row per quantile) rather than from
# N_QUANTILES so that x and y always have matching sizes; it is computed once
# instead of on every plot call.
quantile_levels = np.linspace(0, 1, qt_trans.quantiles_.shape[0])

# One subplot per feature: quantile curves without and with smoothing noise.
f, axarr = plt.subplots(3, 2)
axarr = np.ravel(axarr)
for quantile, quantile_noise, ax in zip(qt_trans.quantiles_.T,
                                        qt_trans_noise.quantiles_.T,
                                        axarr):
    ax.plot(quantile, quantile_levels, 'b--', label='Without noise')
    ax.plot(quantile_noise, quantile_levels, 'r:', label='With noise')
    _style_quantile_axes(ax)
plt.tight_layout()

# Feature #6 (index 5) --- number of hours per week --- is a good candidate to
# illustrate how a smoothing noise helps when interpreting the transformation.
hours_idx = 5

# Hand-made sample to be transformed: a single row, one value per feature.
X1 = np.reshape([50, 30000, 10, 10000, 2000, 40], (1, -1))
X1t = qt_trans.transform(X1)
X1t_noise = qt_trans_noise.transform(X1)

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Quantile curve of the selected feature (transformer without noise).
ax.plot(qt_trans.quantiles_.T[hours_idx], quantile_levels)
# Where the sample lands after each of the two transformations.
ax.scatter(X1[0, hours_idx], X1t[0, hours_idx], c='r',
           label=r'Not smoothed -> $f({0}) = {1:.2f}$'.format(
               X1[0, hours_idx], X1t[0, hours_idx]))
ax.scatter(X1[0, hours_idx], X1t_noise[0, hours_idx], c='g',
           label=r'Smoothed -> $f({0}) = {1:.2f}$'.format(
               X1[0, hours_idx], X1t_noise[0, hours_idx]))
_style_quantile_axes(ax)
ax.set_ylim([0, 1])
ax.set_xlim([0, 100])
ax.set_title('Number of hours worked per week')
plt.tight_layout()
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment