Skip to content

Instantly share code, notes, and snippets.

@oscar-defelice
Created March 16, 2021 14:24
Show Gist options
  • Save oscar-defelice/a376c6e81bcf7c35e73d3ad1e23e043b to your computer and use it in GitHub Desktop.
Save oscar-defelice/a376c6e81bcf7c35e73d3ad1e23e043b to your computer and use it in GitHub Desktop.
import numpy as np
from sklearn.model_selection import train_test_split
# Default settings used as the default argument of import_data.
# NOTE(review): `data` is not bound anywhere in the visible part of this file;
# it has to exist before this statement runs. import_data reads `.data` and
# `.target` from it.
config = dict(
    data=data,
    train_test_ratio=0.2,
)
def feature_selection(data):
    """
    Select the features to keep from the data array.

    At present this is a pass-through: every feature is kept and the very
    same array object is handed back. Adapt the selection to your analysis.

    Arguments:
        data    array exposing a ``shape`` attribute, rows are examples
                and columns are features.
    Returns:
        selected    the selected features, with the same number of rows
                    as ``data``.
    """
    # No feature is dropped for now; replace this line with a real selection.
    selected = data

    # Selecting features must never change the number of examples.
    rows_after = selected.shape[0]
    rows_before = data.shape[0]
    assert rows_after == rows_before, "Data leaking!"

    return selected
def feature_normalisation(data):
    """
    Standardise every feature column to zero mean and unit standard deviation.

    The computation is done on a floating-point copy, so the array passed in
    is left untouched and integer input is accepted. Columns whose standard
    deviation is zero (constant features) are only centred: they are divided
    by 1 instead of 0, which yields zeros rather than NaN.

    Arguments:
        data    array-like, rows are examples and columns are features.
    Returns:
        data_normalised    new np.array with the same shape as ``data``.
                           Floating-point input keeps its dtype; any other
                           dtype is converted to float64.
    """
    source = np.asarray(data)
    # Keep float32/float64 as they are; promote anything else (e.g. ints) so
    # that the in-place arithmetic below cannot fail on dtype casting.
    if np.issubdtype(source.dtype, np.floating):
        work_dtype = source.dtype
    else:
        work_dtype = np.float64
    # astype(copy=True) guarantees we never write into the caller's buffer.
    data_normalised = source.astype(work_dtype, copy=True)

    data_normalised -= data_normalised.mean(axis=0)

    std = data_normalised.std(axis=0)
    # A constant feature has std == 0; use 1 as divisor to avoid 0/0 -> NaN.
    std = np.where(std == 0, 1.0, std)
    data_normalised /= std

    assert data_normalised.shape == source.shape, "Data leaking!"
    return data_normalised
def import_data(input=config):
    """
    Build the train/test arrays from a configuration dictionary.

    The feature matrix goes through feature_selection and then
    feature_normalisation before being split with
    sklearn.model_selection.train_test_split (random_state fixed to 42).

    Arguments:
        input   dict with the following keys
            'data'              object exposing
                                ``.data``   the array of feature vectors (rows)
                                ``.target`` the array of target values
            'train_test_ratio'  value forwarded as ``test_size`` to
                                train_test_split.
                default: the module-level ``config``
    Returns:
        tuple (X_train, X_test, Y_train, Y_test) as produced by
        train_test_split; X_train and X_test have the same number of columns.
    """
    dataset = input['data']
    test_fraction = input['train_test_ratio']

    # NOTE(review): normalisation runs on the whole data set before the split,
    # so its mean/std are computed over the test rows as well.
    features = feature_normalisation(feature_selection(dataset.data))
    targets = dataset.target

    split = train_test_split(
        features,
        targets,
        test_size=test_fraction,
        random_state=42,
    )
    X_train, X_test, Y_train, Y_test = split

    # Both partitions must expose the same number of features.
    assert X_train.shape[1] == X_test.shape[1], "Train and test shapes do not correspond!"
    return X_train, X_test, Y_train, Y_test
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment