Skip to content

Instantly share code, notes, and snippets.

@ncalm
Last active July 22, 2024 15:24
Show Gist options
  • Save ncalm/14b71242f453614fefe452e36df1023e to your computer and use it in GitHub Desktop.
Save ncalm/14b71242f453614fefe452e36df1023e to your computer and use it in GitHub Desktop.
/*
Imputer(training_data, k, [distance_function])(observation)

Builds a k-nearest-neighbours imputer from training_data and returns a
function that fills in the single missing value of an observation.

training_data      : array of complete rows; must have the same number of
                     columns as the observation.
k                  : rank passed to SMALL to find the distance cut-off.
distance_function  : optional curried function f(x)(y) returning a distance;
                     when omitted, KNN.EUCLIDEAN is used.
observation        : array with exactly one missing cell (blank, "" or an
                     error value). Presumably a single row, since columns are
                     what get filtered -- confirm against callers.

Returns the observation with the missing cell replaced by the mean of the
neighbours' values for that feature, or #VALUE! when the observation does not
have exactly one missing cell or the column counts differ.
*/
Imputer = LAMBDA(training_data, k, [distance_function],
    LAMBDA(observation,
        LET(
            // TRUE where the cell is blank/"" or holds an error, FALSE elsewhere
            _missing, IFERROR(observation = "", TRUE),
            IF(
                // Exactly one missing cell is required, and the observation
                // must line up column-for-column with the training data
                OR(
                    SUM(--_missing) <> 1,
                    COLUMNS(training_data) <> COLUMNS(observation)
                ),
                #VALUE!,
                LET(
                    // Observation with the missing feature removed
                    _observation, FILTER(observation, NOT(_missing)),
                    // Partially apply the distance function to the observation,
                    // falling back to Euclidean distance when none was given
                    _distance_function, IF(
                        ISOMITTED(distance_function),
                        KNN.EUCLIDEAN(_observation),
                        distance_function(_observation)
                    ),
                    // Training features, excluding the one being imputed
                    _training_data, FILTER(training_data, NOT(_missing)),
                    // Training values of the feature being imputed
                    _Y, FILTER(training_data, _missing),
                    // Distance from every training row to the observation
                    _distances, BYROW(_training_data, _distance_function),
                    // Keep every row whose distance is within the k-th smallest;
                    // ties at the cut-off can therefore keep more than k rows
                    _knn, FILTER(_Y, _distances <= SMALL(_distances, k)),
                    // Imputed value: mean of the neighbours' values for the feature
                    _output, AVERAGE(_knn),
                    // Substitute the imputed value element-wise. Adding it
                    // (observation + _missing * _output) would keep an error in
                    // the missing slot and turn "" into #VALUE!.
                    IF(_missing, _output, observation)
                )
            )
        )
    )
);
// Wrapper around KNN.Imputer: imputes every row of observations and returns
// one array with training_data on top and the imputed observations appended
// underneath, in their original order.
ImputeTransform = LAMBDA(training_data, k, observations, [distance_function],
    LET(
        // Imputer bound to the training data, k and the (possibly omitted)
        // distance function
        _fill_row, KNN.Imputer(training_data, k, distance_function),
        // One index per observation row
        _row_ids, SEQUENCE(ROWS(observations)),
        // Fold step: impute row _i and stack it under what has been built so far
        _append_imputed, LAMBDA(_stacked, _i,
            VSTACK(_stacked, _fill_row(CHOOSEROWS(observations, _i)))
        ),
        // Start from the training data and append each imputed observation
        REDUCE(training_data, _row_ids, _append_imputed)
    )
);
/*
Distance functions. Default used above is EUCLIDEAN.
*/
// MINKOWSKI(p)(x)(y): curried Minkowski distance of order p between x and y,
// computed as SUM(ABS(x - y) ^ p) ^ (1 / p).
// The absolute value is taken before raising to p: raising a negative
// difference to a non-integer power would return #NUM!.
MINKOWSKI = LAMBDA(p, LAMBDA(x, LAMBDA(y, POWER(SUM(ABS(x - y) ^ p), 1 / p))));
// EUCLIDEAN(x)(y): curried Euclidean distance, i.e. the Minkowski distance of order 2.
EUCLIDEAN = LAMBDA(x, LAMBDA(y, LET(_order2, MINKOWSKI(2), _order2(x)(y))));
// EUCLIDEAN_SQ(x)(y): curried squared Euclidean distance, SUM((x - y) ^ 2).
// Computed directly instead of squaring a square root, which avoids the
// extra work and the floating-point round-off of the root/square round trip.
EUCLIDEAN_SQ = LAMBDA(x, LAMBDA(y, SUM((x - y) ^ 2)));
// MANHATTAN(x)(y): curried Manhattan (taxicab) distance, i.e. the Minkowski distance of order 1.
MANHATTAN = LAMBDA(x, LAMBDA(y, LET(_order1, MINKOWSKI(1), _order1(x)(y))));
// CHEBYSHEV(x)(y): curried Chebyshev distance, the largest absolute
// element-wise difference between x and y.
CHEBYSHEV = LAMBDA(x, LAMBDA(y, LET(_gaps, ABS(x - y), MAX(_gaps))));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment