ncalm/excel-lambda-module-knn.txt

## excel-lambda-module-knn.txt
/*
Imputer: builds a k-nearest-neighbours imputer from training_data.

Arguments
    training_data       Array whose columns are the features.
    k                   Rank passed to SMALL to pick the neighbour distance cut-off.
    [distance_function] Optional curried distance, called as distance_function(x)(y).
                        When omitted, KNN.EUCLIDEAN is used.

Returns a LAMBDA(observation) which fills the single missing entry of
observation (a blank, "" or an error value) with the mean of that feature
over the nearest training rows.
Returns #VALUE! unless exactly one entry is missing and observation has the
same number of columns as training_data.
NOTE(review): observation is presumably a single row (1 x n) - confirm with callers.
*/
Imputer = LAMBDA(training_data, k, [distance_function],
    LAMBDA(observation,
        LET(
            // TRUE where the entry equals "" or where the comparison itself errors
            _missing, IFERROR(observation = "", TRUE),
            IF(
                // Exactly one missing entry and matching column counts are required
                OR(SUM(--_missing) <> 1, COLUMNS(training_data) <> COLUMNS(observation)),
                #VALUE!,
                LET(
                    // Observation without the missing entry
                    _observation, FILTER(observation, NOT(_missing)),
                    // Partially apply the distance to the observation; default to KNN.EUCLIDEAN
                    _distance_function, IF(ISOMITTED(distance_function), KNN.EUCLIDEAN(_observation), distance_function(_observation)),
                    // Training features, excluding the column being imputed
                    _training_data, FILTER(training_data, NOT(_missing)),
                    // Training values of the column being imputed
                    _Y, FILTER(training_data, _missing),
                    // Distance from each training row to the observation
                    _distances, BYROW(_training_data, _distance_function),
                    // Keep every row at or below the k-th smallest distance
                    // (rows tied at the cut-off are all kept)
                    _knn, FILTER(_Y, _distances <= SMALL(_distances, k)),
                    // Mean of the neighbours' values for the missing feature
                    _output, AVERAGE(_knn),
                    // Substitute the imputed value at the missing position.
                    // IF is used instead of adding to observation because an error or ""
                    // in the missing slot would otherwise propagate into the result.
                    IF(_missing, _output, observation)
                )
            )
        )
    )
);

// Wrapper around KNN.Imputer: imputes each row of observations in turn and
// returns training_data with the imputed rows stacked underneath it.
ImputeTransform = LAMBDA(training_data, k, observations, [distance_function],
    LET(
        // Imputer bound to the training data, k and the given distance function
        _impute_row, KNN.Imputer(training_data, k, distance_function),
        // One index per observation row
        _row_ids, SEQUENCE(ROWS(observations)),
        // Appends the imputed version of one observation row to the running stack
        _append_imputed, LAMBDA(_stacked, _row_id,
            VSTACK(_stacked, _impute_row(CHOOSEROWS(observations, _row_id)))
        ),
        // Start from the training data and append the imputed rows in order
        REDUCE(training_data, _row_ids, _append_imputed)
    )
);


/*
Distance functions. Default used above is EUCLIDEAN.
*/
// Minkowski distance of order p, curried as MINKOWSKI(p)(x)(y).
// The absolute value is taken before raising to p so that negative
// differences do not produce #NUM! when p is not an integer.
MINKOWSKI = LAMBDA(p, LAMBDA(x, LAMBDA(y, POWER(SUM(ABS(x - y) ^ p), 1 / p))));
// Euclidean distance, curried as EUCLIDEAN(x)(y): MINKOWSKI of order 2.
// MINKOWSKI(2)(x) is itself the LAMBDA(y, ...) that gets returned.
EUCLIDEAN = LAMBDA(x, MINKOWSKI(2)(x));
// Squared Euclidean distance, curried as EUCLIDEAN_SQ(x)(y).
EUCLIDEAN_SQ = LAMBDA(x, LAMBDA(y, LET(_dist, EUCLIDEAN(x)(y), POWER(_dist, 2))));
// Manhattan distance, curried as MANHATTAN(x)(y): MINKOWSKI of order 1.
// MINKOWSKI(1)(x) is itself the LAMBDA(y, ...) that gets returned.
MANHATTAN = LAMBDA(x, MINKOWSKI(1)(x));
// Chebyshev distance, curried as CHEBYSHEV(x)(y): largest absolute element-wise difference.
CHEBYSHEV = LAMBDA(x, LAMBDA(y, LET(_abs_diff, ABS(x - y), MAX(_abs_diff))));
	/*
	Imputer: builds a k-nearest-neighbours imputer from training_data.

	Arguments
	    training_data       Array whose columns are the features.
	    k                   Rank passed to SMALL to pick the neighbour distance cut-off.
	    [distance_function] Optional curried distance, called as distance_function(x)(y).
	                        When omitted, KNN.EUCLIDEAN is used.

	Returns a LAMBDA(observation) which fills the single missing entry of
	observation (a blank, "" or an error value) with the mean of that feature
	over the nearest training rows.
	Returns #VALUE! unless exactly one entry is missing and observation has the
	same number of columns as training_data.
	NOTE(review): observation is presumably a single row (1 x n) - confirm with callers.
	*/
	Imputer = LAMBDA(training_data, k, [distance_function],
	    LAMBDA(observation,
	        LET(
	            // TRUE where the entry equals "" or where the comparison itself errors
	            _missing, IFERROR(observation = "", TRUE),
	            IF(
	                // Exactly one missing entry and matching column counts are required
	                OR(SUM(--_missing) <> 1, COLUMNS(training_data) <> COLUMNS(observation)),
	                #VALUE!,
	                LET(
	                    // Observation without the missing entry
	                    _observation, FILTER(observation, NOT(_missing)),
	                    // Partially apply the distance to the observation; default to KNN.EUCLIDEAN
	                    _distance_function, IF(ISOMITTED(distance_function), KNN.EUCLIDEAN(_observation), distance_function(_observation)),
	                    // Training features, excluding the column being imputed
	                    _training_data, FILTER(training_data, NOT(_missing)),
	                    // Training values of the column being imputed
	                    _Y, FILTER(training_data, _missing),
	                    // Distance from each training row to the observation
	                    _distances, BYROW(_training_data, _distance_function),
	                    // Keep every row at or below the k-th smallest distance
	                    // (rows tied at the cut-off are all kept)
	                    _knn, FILTER(_Y, _distances <= SMALL(_distances, k)),
	                    // Mean of the neighbours' values for the missing feature
	                    _output, AVERAGE(_knn),
	                    // Substitute the imputed value at the missing position.
	                    // IF is used instead of adding to observation because an error or ""
	                    // in the missing slot would otherwise propagate into the result.
	                    IF(_missing, _output, observation)
	                )
	            )
	        )
	    )
	);

	// Wrapper around KNN.Imputer: imputes each row of observations in turn and
	// returns training_data with the imputed rows stacked underneath it.
	ImputeTransform = LAMBDA(training_data, k, observations, [distance_function],
	    LET(
	        // Imputer bound to the training data, k and the given distance function
	        _impute_row, KNN.Imputer(training_data, k, distance_function),
	        // One index per observation row
	        _row_ids, SEQUENCE(ROWS(observations)),
	        // Appends the imputed version of one observation row to the running stack
	        _append_imputed, LAMBDA(_stacked, _row_id,
	            VSTACK(_stacked, _impute_row(CHOOSEROWS(observations, _row_id)))
	        ),
	        // Start from the training data and append the imputed rows in order
	        REDUCE(training_data, _row_ids, _append_imputed)
	    )
	);


	/*
	Distance functions. Default used above is EUCLIDEAN.
	*/
	// Minkowski distance of order p, curried as MINKOWSKI(p)(x)(y).
	// The absolute value is taken before raising to p so that negative
	// differences do not produce #NUM! when p is not an integer.
	MINKOWSKI = LAMBDA(p, LAMBDA(x, LAMBDA(y, POWER(SUM(ABS(x - y) ^ p), 1 / p))));
	// Euclidean distance, curried as EUCLIDEAN(x)(y): MINKOWSKI of order 2.
	// MINKOWSKI(2)(x) is itself the LAMBDA(y, ...) that gets returned.
	EUCLIDEAN = LAMBDA(x, MINKOWSKI(2)(x));
	// Squared Euclidean distance, curried as EUCLIDEAN_SQ(x)(y).
	EUCLIDEAN_SQ = LAMBDA(x, LAMBDA(y, LET(_dist, EUCLIDEAN(x)(y), POWER(_dist, 2))));
	// Manhattan distance, curried as MANHATTAN(x)(y): MINKOWSKI of order 1.
	// MINKOWSKI(1)(x) is itself the LAMBDA(y, ...) that gets returned.
	MANHATTAN = LAMBDA(x, MINKOWSKI(1)(x));
	// Chebyshev distance, curried as CHEBYSHEV(x)(y): largest absolute element-wise difference.
	CHEBYSHEV = LAMBDA(x, LAMBDA(y, LET(_abs_diff, ABS(x - y), MAX(_abs_diff))));