Skip to content

Instantly share code, notes, and snippets.

@cheeseonamonkey
Last active March 6, 2024 00:03
Show Gist options
  • Save cheeseonamonkey/20e610ec9b0e9ec2450442fe41de6c98 to your computer and use it in GitHub Desktop.
Save cheeseonamonkey/20e610ec9b0e9ec2450442fe41de6c98 to your computer and use it in GitHub Desktop.
python scalers and transforms cheatsheet
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler, Normalizer
# Demonstration data: nine samples of shape (100, 2), each drawn from a
# different distribution. The draw order is kept fixed so a seeded run
# yields the same arrays.
better_datasets = [
    10 + 20 * np.random.rand(100, 2),
    100 * np.random.rand(100, 2),
    np.random.beta(100, 2, size=(100, 2)),
    np.random.f(100, 2, size=(100, 2)),
    np.random.normal(100, 2, size=(100, 2)),
    np.random.binomial(100, 0.7, size=(100, 2)),
    np.random.gamma(100, 2, size=(100, 2)),
    # Shifted by 1 so every value is at least 1 (strictly positive).
    1 + np.random.exponential(10, size=(100, 2)),
    # Lower bound of 1 keeps every value strictly positive.
    np.random.uniform(low=1, high=60, size=(100, 2)),
]
# Scalers under comparison. Each key doubles as the column title in the
# plot grid; every scaler is created with its default settings.
scalers = {
    label: scaler_cls()
    for label, scaler_cls in (
        ('Standard Scaler', StandardScaler),
        ('Min-Max Scaler', MinMaxScaler),
        ('Robust Scaler', RobustScaler),
        ('MaxAbs Scaler', MaxAbsScaler),
        ('Normalizer', Normalizer),
    )
}
# Plot a grid of histograms: one row per dataset, one column per scaler.
# The grid size is derived from the data so that no empty row is reserved
# (the row count used to be hard-coded to 10 for 9 datasets).
n_rows, n_cols = len(better_datasets), len(scalers)
plt.figure(figsize=(15, 10))

for dataset_index, dataset in enumerate(better_datasets):
    for i, (scaler_name, scaler) in enumerate(scalers.items(), start=1):
        # Subplot indices are 1-based and run row by row.
        ax = plt.subplot(n_rows, n_cols, dataset_index * n_cols + i)

        # Each scaler is re-fitted on the dataset it is about to transform.
        scaled_data = scaler.fit_transform(dataset)

        ax.hist(scaled_data[:, 0], bins=20, alpha=0.75, color='darkred', label='Feature 1')
        ax.hist(scaled_data[:, 1], bins=20, alpha=0.75, color='darkblue', label='Feature 2')
        ax.grid(True)

        # Only the top row carries the column titles.
        if dataset_index == 0:
            ax.set_title(scaler_name, fontsize=12, fontweight='bold')

        # Ticks are hidden on every panel to keep the dense grid readable.
        ax.set_xticks([])
        ax.set_yticks([])

plt.tight_layout(w_pad=0.05, h_pad=0.3)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox
from sklearn.preprocessing import PowerTransformer
# Demonstration data for the transforms: nine samples of shape (100, 2),
# each drawn from a different distribution. The draw order is kept fixed
# so a seeded run yields the same arrays.
better_datasets = [
    10 + 20 * np.random.rand(100, 2),
    100 * np.random.rand(100, 2),
    np.random.beta(100, 2, size=(100, 2)),
    np.random.f(100, 2, size=(100, 2)),
    np.random.normal(100, 2, size=(100, 2)),
    np.random.binomial(100, 0.7, size=(100, 2)),
    np.random.gamma(100, 2, size=(100, 2)),
    # Shifted by 1 so every value is at least 1 (strictly positive).
    1 + np.random.exponential(10, size=(100, 2)),
    # Lower bound of 1 keeps every value strictly positive.
    np.random.uniform(low=1, high=60, size=(100, 2)),
]
def _box_cox(data):
    """Box-Cox transform ``data`` with a single lambda fitted on all values.

    ``boxcox`` is applied to the flattened array (it expects 1-D input) and
    the result is reshaped back to ``data.shape``, so both feature columns
    share the same fitted lambda.
    """
    flat_result, _ = boxcox(data.flatten())
    return flat_result.reshape(data.shape)


def _yeo_johnson(data):
    """Fit a fresh Yeo-Johnson ``PowerTransformer`` on ``data`` and apply it."""
    return PowerTransformer(method='yeo-johnson').fit(data).transform(data)


# Transforms under comparison. Every value is a callable that takes a 2-D
# dataset and returns the transformed array, so the plotting loop can treat
# all entries uniformly instead of special-casing them by name.
transformers = {
    'Log Transformation': np.log1p,
    'Square Root Transformation': np.sqrt,
    'Exponential Transformation': np.exp,
    'Box-Cox Transformation': _box_cox,
    'Yeo-Johnson Transformation': _yeo_johnson,
}

# Plot a grid of histograms: one row per dataset, one column per transform.
# The grid size is derived from the data so that no empty row is reserved
# (the row count used to be hard-coded to 10 for 9 datasets).
n_rows, n_cols = len(better_datasets), len(transformers)
plt.figure(figsize=(12, 8))

for dataset_index, dataset in enumerate(better_datasets):
    for i, (transformer_name, transformer) in enumerate(transformers.items(), start=1):
        # Subplot indices are 1-based and run row by row.
        ax = plt.subplot(n_rows, n_cols, dataset_index * n_cols + i)

        transformed_data = transformer(dataset)

        for column, color, label in ((0, 'darkred', 'Feature 1'), (1, 'darkblue', 'Feature 2')):
            values = transformed_data[:, column]
            # np.exp overflows to inf for large inputs (possible with the
            # heavy-tailed F-distributed dataset), and a histogram cannot be
            # binned over a non-finite range, so such values are dropped.
            ax.hist(values[np.isfinite(values)], bins=20, alpha=0.75, color=color, label=label)
        ax.grid(True)

        # Only the top row carries the column titles.
        if dataset_index == 0:
            ax.set_title(transformer_name, fontsize=12, fontweight='bold')

        # Ticks are hidden on every panel to keep the dense grid readable.
        ax.set_xticks([])
        ax.set_yticks([])

plt.tight_layout(w_pad=0.05, h_pad=0.3)
plt.show()
@cheeseonamonkey
Copy link
Author

image
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment