Last active
July 22, 2024 15:24
-
-
Save ncalm/14b71242f453614fefe452e36df1023e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Imputer: builds a k-nearest-neighbours imputer from a training set.

training_data       - array with one observation per row and one feature per column.
k                   - number of nearest neighbours to average. Rows whose distance
                      ties with the k-th smallest distance are all kept, so more
                      than k rows can contribute to the mean.
[distance_function] - optional curried function f(x)(y) returning the distance
                      between two rows. KNN.EUCLIDEAN is used when omitted.

Returns LAMBDA(observation). The observation is expected to be a single row with
exactly one missing value (a blank cell, an empty string or an error). The result
is that row with the missing value replaced by the mean of the same feature over
the nearest training rows. #VALUE! is returned when the observation does not have
exactly one missing value, or when its column count differs from training_data.
*/
Imputer = LAMBDA(training_data, k, [distance_function],
    LAMBDA(observation,
        LET(
            // TRUE at the position of the missing value; errors count as missing too
            _missing, IFERROR(observation = "", TRUE),
            IF(
                // Reject observations without exactly one missing value, and
                // observations whose column count differs from the training data
                OR(SUM(--_missing) <> 1, COLUMNS(training_data) <> COLUMNS(observation)),
                #VALUE!,
                LET(
                    // The observation without its missing feature
                    _observation, FILTER(observation, NOT(_missing)),
                    // Bind the observation as the first argument of the distance
                    // function, defaulting to Euclidean distance when none is given
                    _distance_function, IF(
                        ISOMITTED(distance_function),
                        KNN.EUCLIDEAN(_observation),
                        distance_function(_observation)
                    ),
                    // Training data without the missing value's feature column
                    _training_data, FILTER(training_data, NOT(_missing)),
                    // The missing value's feature column from the training data
                    _Y, FILTER(training_data, _missing),
                    // Distance from every training row to the observation
                    _distances, BYROW(_training_data, _distance_function),
                    // Feature values of the k nearest neighbours (ties included)
                    _knn, FILTER(_Y, _distances <= SMALL(_distances, k)),
                    // The imputed value is the mean of the neighbours' feature values
                    _output, AVERAGE(_knn),
                    // Put the imputed value into the missing slot. IF is used rather
                    // than arithmetic so that an empty string or an error in that
                    // slot is replaced instead of propagating into the result.
                    IF(_missing, _output, observation)
                )
            )
        )
    )
);
// Wrapper for KNN.Impute that transforms and returns the entire array with | |
// training_data stacked on top of the observations with missing values | |
ImputeTransform = LAMBDA(training_data, k, observations, [distance_function], | |
LET( | |
// Create an imputer using the training data, k and the given distance function | |
imputer, KNN.Imputer(training_data,k,distance_function), | |
// Transform the original array by imputing the missing values. | |
// The assumption here is that the data with the missing values is stacked underneath | |
// The training data | |
REDUCE( | |
training_data, | |
SEQUENCE(ROWS(observations)), | |
LAMBDA(a,b,VSTACK(a,imputer(CHOOSEROWS(observations,b)))) | |
) | |
) | |
); | |
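/*
Distance functions. EUCLIDEAN, EUCLIDEAN_SQ, MANHATTAN and CHEBYSHEV are
curried as f(x)(y); MINKOWSKI takes its order first: MINKOWSKI(p)(x)(y).
KNN.Imputer uses EUCLIDEAN when no distance function is supplied.
*/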
// Minkowski distance of order p between arrays x and y: (SUM(|x - y|^p))^(1/p).
// Curried: MINKOWSKI(p)(x)(y).
// ABS is taken before raising to p so that a fractional order (e.g. p = 1.5)
// never raises a negative difference to a non-integer power, which is #NUM!.
MINKOWSKI = LAMBDA(p, LAMBDA(x, LAMBDA(y, POWER(SUM(ABS(x - y) ^ p), 1 / p))));
// Euclidean distance, curried as EUCLIDEAN(x)(y): the Minkowski distance of order 2.
// MINKOWSKI(2)(x) is already a one-argument function of y, so it is returned as is.
EUCLIDEAN = LAMBDA(x, MINKOWSKI(2)(x));
// Squared Euclidean distance, curried as EUCLIDEAN_SQ(x)(y).
// Computed directly as the sum of squared differences rather than squaring
// the Euclidean distance, which avoids taking a square root only to undo it.
EUCLIDEAN_SQ = LAMBDA(x, LAMBDA(y, SUM((x - y) ^ 2)));
// Manhattan (city-block) distance, curried as MANHATTAN(x)(y): the Minkowski distance of order 1.
// MINKOWSKI(1)(x) is already a one-argument function of y, so it is returned as is.
MANHATTAN = LAMBDA(x, MINKOWSKI(1)(x));
// Chebyshev distance, curried as CHEBYSHEV(x)(y): the largest absolute
// element-wise difference between x and y.
CHEBYSHEV = LAMBDA(x,
    LAMBDA(y,
        LET(
            _abs_diff, ABS(x - y),
            MAX(_abs_diff)
        )
    )
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment