ncalm/excel-lambda-OUTLIERS.txt

## excel-lambda-OUTLIERS.txt
/*
OUTLIERS

These lambda functions allow us to quickly apply the standard deviation test to a series of transformed variables

Included in this file:

OUTLIER.THRESHOLDS - for calculating outlier thresholds using a standard deviation test

inputs:
  - data, a single column array of a continuous variable
  - std_devs, the number of standard deviations from the mean of data you want to return as thresholds

returns:
  a single row, two-column array containing the lower and upper thresholds below or above which we might consider the value in data to be an outlier


OUTLIER.TEST - for comparing a variable with outlier thresholds and returning flagged outliers

inputs:
  - data - a single column array of a continuous variable, usually transformed via sqrt, ln, log or similar
  - std_devs - the number of standard deviations from the mean of data you want to return as thresholds, to be passed into OUTLIER.THRESHOLDS
  - [prefix] - a column header prefix to prepend to each column of the output array

returns:
  A three column array with the ROWS(data) rows and one header row.
  - column 1 contains the transformed data passed into data
  - column 2 contains TRUE if the row is outside of the calculated thresholds, FALSE otherwise
  - column 3 contains either Low or High for rows where column 2 is TRUE

OUTLIER.TESTS - for applying OUTLIER.TEST on multiple transformations at once

inputs:
  - data - the single column array of continuous data before applying any transformations
  - std_devs - the number of standard deviations from the mean of data you want to return as thresholds, to be passed into OUTLIER.TEST and subsequently to OUTLIER.THRESHOLDS
  - [transforms] - a list of transforms to use in OUTLIER.TEST. The default and allowed values are {"sqrt","ln","log10"}

returns:
  An array with ROWS(data)+1 rows and 1+COLUMNS(transforms)*3 columns, where the first column is the original data and each subsequent group of three columns represents the result of OUTLIER.TEST when called with each of the transformations provided to [transforms]

For more details:
https://www.flexyourdata.com/blog/excel-lambda-outlier-test-transform-test-and-flag-a-variable-for-outliers-in-excel-with-one-function/

*/

OUTLIER.THRESHOLDS =LAMBDA(data,std_devs,
  LET(
    _data,FILTER(data,NOT(ISERROR(data))),
    _std_devs,std_devs,
    _mean,AVERAGE(_data),
    _std_dev,STDEV.S(_data),
    _lower,_mean-_std_devs*_std_dev,
    _upper,_mean+_std_devs*_std_dev,
    CHOOSE({1,2},_lower,_upper)
  )
);

OUTLIER.TEST=LAMBDA(data,std_devs,[prefix],
  LET(
    _prefix,IF(ISOMITTED(prefix),"test",prefix),
    _thresholds,OUTLIER.THRESHOLDS(data,std_devs),
    _is_outlier,IFERROR(((data<INDEX(_thresholds,1,1))+(data>INDEX(_thresholds,1,2)))>0,FALSE),
    _outlier_type,IFS(
                    data<INDEX(_thresholds,1,1),"Low",
                    data>INDEX(_thresholds,1,2),"High",
                    TRUE,""
                  ),
    _header,_prefix & {"_data","_is_outlier","_outlier_type"},
    _array,
    MAKEARRAY(
      ROWS(data)+1,
      3,
      LAMBDA(r,c,
        IF(
          r=1,INDEX(_header,1,c),
          CHOOSE(
            c,
            INDEX(data,r-1,1),
            INDEX(_is_outlier,r-1,1),
            INDEX(_outlier_type,r-1,1)
          )
        )
      )
    ),
    _array
  )
);

OUTLIER.TESTS =LAMBDA(data,std_devs,[transforms],
  LET(
    _data,SORT(data),
    _std_devs,std_devs,
    _available,{"sqrt","ln","log10"},
    _transforms,LET(
                  t,IF(ISOMITTED(transforms),_available,transforms),
                  FILTER(t,(t="sqrt")+(t="ln")+(t="log10"))
                ),
    _do,IFERROR(XMATCH(_available,_transforms)>0,FALSE),
    _transformed,CHOOSE({1,2,3},SQRT(_data),LN(_data),LOG(_data,10)),
    _do_transformed,FILTER(_transformed,_do),
    _test,LAMBDA(x,y,z,LAMBDA(OUTLIER.TEST(x,y,z))),
    _tests,MAKEARRAY(
            1,
            COLUMNS(_do_transformed),
            LAMBDA(r,c,
              _test(INDEX(_do_transformed,,c),_std_devs,INDEX(_transforms,1,c))
            )
           ),
    _cols,1+COLUMNS(_tests)*3,
    _hdata,MAKEARRAY(ROWS(_data)+1,1,LAMBDA(r,c,IF(r=1,"original_data",INDEX(_data,r-1,1)))),
    _t1,INDEX(_tests,1,1)(),
    _t2,INDEX(_tests,1,2)(),
    _t3,INDEX(_tests,1,3)(),
    _array,
    CHOOSE(
      SEQUENCE(1,_cols),
      _hdata,
      INDEX(_t1,,1),
      INDEX(_t1,,2),
      INDEX(_t1,,3),
      INDEX(_t2,,1),
      INDEX(_t2,,2),
      INDEX(_t2,,3),
      INDEX(_t3,,1),
      INDEX(_t3,,2),
      INDEX(_t3,,3)
    ),
    _array
  )
);
	/*
	OUTLIERS

	These lambda functions allow us to quickly apply the standard deviation test to a series of transformed variables

	Included in this file:

	OUTLIER.THRESHOLDS - for calculating outlier thresholds using a standard deviation test

	inputs:
	- data, a single column array of a continuous variable
	- std_devs, the number of standard deviations from the mean of data you want to return as thresholds

	returns:
	a single row, two-column array containing the lower and upper thresholds below or above which we might consider the value in data to be an outlier


	OUTLIER.TEST - for comparing a variable with outlier thresholds and returning flagged outliers

	inputs:
	- data - a single column array of a continuous variable, usually transformed via sqrt, ln, log or similar
	- std_devs - the number of standard deviations from the mean of data you want to return as thresholds, to be passed into OUTLIER.THRESHOLDS
	- [prefix] - a column header prefix to prepend to each column of the output array

	returns:
	A three column array with the ROWS(data) rows and one header row.
	- column 1 contains the transformed data passed into data
	- column 2 contains TRUE if the row is outside of the calculated thresholds, FALSE otherwise
	- column 3 contains either Low or High for rows where column 2 is TRUE

	OUTLIER.TESTS - for applying OUTLIER.TEST on multiple transformations at once

	inputs:
	- data - the single column array of continuous data before applying any transformations
	- std_devs - the number of standard deviations from the mean of data you want to return as thresholds, to be passed into OUTLIER.TEST and subsequently to OUTLIER.THRESHOLDS
	- [transforms] - a list of transforms to use in OUTLIER.TEST. The default and allowed values are {"sqrt","ln","log10"}

	returns:
	An array with ROWS(data)+1 rows and 1+COLUMNS(transforms)*3 columns, where the first column is the original data and each subsequent group of three columns represents the result of OUTLIER.TEST when called with each of the transformations provided to [transforms]

	For more details:
	https://www.flexyourdata.com/blog/excel-lambda-outlier-test-transform-test-and-flag-a-variable-for-outliers-in-excel-with-one-function/

	*/

	OUTLIER.THRESHOLDS =LAMBDA(data,std_devs,
	LET(
	_data,FILTER(data,NOT(ISERROR(data))),
	_std_devs,std_devs,
	_mean,AVERAGE(_data),
	_std_dev,STDEV.S(_data),
	_lower,_mean-_std_devs*_std_dev,
	_upper,_mean+_std_devs*_std_dev,
	CHOOSE({1,2},_lower,_upper)
	)
	);

	OUTLIER.TEST=LAMBDA(data,std_devs,[prefix],
	LET(
	_prefix,IF(ISOMITTED(prefix),"test",prefix),
	_thresholds,OUTLIER.THRESHOLDS(data,std_devs),
	_is_outlier,IFERROR(((data<INDEX(_thresholds,1,1))+(data>INDEX(_thresholds,1,2)))>0,FALSE),
	_outlier_type,IFS(
	data<INDEX(_thresholds,1,1),"Low",
	data>INDEX(_thresholds,1,2),"High",
	TRUE,""
	),
	_header,_prefix & {"_data","_is_outlier","_outlier_type"},
	_array,
	MAKEARRAY(
	ROWS(data)+1,
	3,
	LAMBDA(r,c,
	IF(
	r=1,INDEX(_header,1,c),
	CHOOSE(
	c,
	INDEX(data,r-1,1),
	INDEX(_is_outlier,r-1,1),
	INDEX(_outlier_type,r-1,1)
	)
	)
	)
	),
	_array
	)
	);

	OUTLIER.TESTS =LAMBDA(data,std_devs,[transforms],
	LET(
	_data,SORT(data),
	_std_devs,std_devs,
	_available,{"sqrt","ln","log10"},
	_transforms,LET(
	t,IF(ISOMITTED(transforms),_available,transforms),
	FILTER(t,(t="sqrt")+(t="ln")+(t="log10"))
	),
	_do,IFERROR(XMATCH(_available,_transforms)>0,FALSE),
	_transformed,CHOOSE({1,2,3},SQRT(_data),LN(_data),LOG(_data,10)),
	_do_transformed,FILTER(_transformed,_do),
	_test,LAMBDA(x,y,z,LAMBDA(OUTLIER.TEST(x,y,z))),
	_tests,MAKEARRAY(
	1,
	COLUMNS(_do_transformed),
	LAMBDA(r,c,
	_test(INDEX(_do_transformed,,c),_std_devs,INDEX(_transforms,1,c))
	)
	),
	_cols,1+COLUMNS(_tests)*3,
	_hdata,MAKEARRAY(ROWS(_data)+1,1,LAMBDA(r,c,IF(r=1,"original_data",INDEX(_data,r-1,1)))),
	_t1,INDEX(_tests,1,1)(),
	_t2,INDEX(_tests,1,2)(),
	_t3,INDEX(_tests,1,3)(),
	_array,
	CHOOSE(
	SEQUENCE(1,_cols),
	_hdata,
	INDEX(_t1,,1),
	INDEX(_t1,,2),
	INDEX(_t1,,3),
	INDEX(_t2,,1),
	INDEX(_t2,,2),
	INDEX(_t2,,3),
	INDEX(_t3,,1),
	INDEX(_t3,,2),
	INDEX(_t3,,3)
	),
	_array
	)
	);