@ncalm
Created April 14, 2022 23:12
This Excel LAMBDA function creates a table of summary statistics for an array of data
/*
DESCRIBE
Creates a table of statistics similar to the Analysis ToolPak add-in's "Summary statistics" feature,
including support for text columns and additional statistics for numeric columns.
Inputs
- data: a range or array of data with at least one column
- has_header: TRUE if data includes a header row, FALSE otherwise
Returns
An array with COLUMNS(data)+1 columns (one statistic-name column plus one column per data column) and ROWS(_stats) rows (see the definition of _stats below).
The first column is the row header, which is the name of the statistic (i.e. INDEX(_stats,r,1)).
The first row is the column header, which is either the column header of `data` if has_header=TRUE,
or Column1, Column2, ..., Column[COLUMNS(data)] if has_header=FALSE.
Columns containing numbers will show statistics for "Sum" through "Maximum".
Columns containing all text will show statistics for "Distinct count" through "Least common text".
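Example (illustrative; the range and its contents are hypothetical):
=DESCRIBE(A1:C51,TRUE)
With headers in A1:C1 and 50 rows of data beneath them, this spills a 23-row by 4-column array: the top-left cell is "Statistic",
the rest of row 1 repeats the column headers, column 1 lists the statistic names, and every other cell holds the corresponding
statistic (or #N/A where it does not apply).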
Dependencies
RECURSIVEFILTER
https://gist.github.com/ncalm/f1c7b5bd8fe86528da88798b545053c8
GROUPAGGREGATE
https://gist.github.com/ncalm/29c473a940997c3b112bf06e7e71cf15
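The GROUPAGGREGATE call below is assumed to return a two-column array: the distinct text values in column 1 and their counts in column 2.
For example (illustrative), GROUPAGGREGATE(CHOOSE({1,2},{"a";"b";"a"},{"a";"b";"a"}),{"group","counta"}) would be expected to
return {"a",2;"b",1}, which is how _tvals and _tcounts consume it.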
Note: the "pretty" formatted version will NOT paste into the Name Manager due to length limits.
If pasting into Name Manager, use the compact version at the bottom of this file.
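To define the name, open Formulas > Name Manager > New, enter DESCRIBE as the Name, and paste the compact =LAMBDA(...) expression
(everything after "DESCRIBE " below) into the "Refers to" box.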
DESCRIBE =LAMBDA(data,has_header,
LET(
rng,IF(has_header,INDEX(data,2,1):INDEX(data,ROWS(data),COLUMNS(data)),data),
_header,IF(
has_header,
INDEX(data,1,1):INDEX(data,1,COLUMNS(data)),
"Column "&SEQUENCE(1,COLUMNS(data))
),
_stats,
{"Statistic";
"Sum";
"Mean";
"Count";
"Mode";
"Standard Deviation";
"Sample Variance";
"Standard Error";
"Kurtosis";
"Skewness";
"Confidence Level(95.0%)";
"1st quartile";
"Median";
"3rd quartile";
"Range";
"Distinct count";
"Rows";
"Minimum";
"Maximum";
"Length of longest text";
"Length of shortest text";
"Most common text";
"Least common text"},
MAKEARRAY(
COUNTA(_stats),
COLUMNS(rng)+1,
LAMBDA(r,c,
IFS(
c=1,
INDEX(_stats,r,1),
r=1,
INDEX(_header,1,c-1),
TRUE,
LET(
_m,INDEX(rng,,c-1),
_istxt,ISTEXT(_m),
_hastxt,OR(_istxt),
_cnt,COUNT(_m),
_alltxt,SUM(N(_istxt))=ROWS(_m),
_mean,AVERAGE(_m),
_median,MEDIAN(_m),
_stddev,STDEV.S(_m),
_stderr,_stddev/SQRT(_cnt),
_mode,MODE.SNGL(_m),
_var_s,VAR.S(_m),
_kurt,KURT(_m),
_skew,SKEW(_m),
_max,MAX(_m),
_min,MIN(_m),
_range,_max-_min,
_sum,SUM(_m),
_conf,CONFIDENCE.T(0.05,_stddev,_cnt),
_q_one,QUARTILE.EXC(_m,1),
_q_three,QUARTILE.EXC(_m,3),
_rows,ROWS(_m),
_txtm,FILTER(_m,_istxt),
_tfreqs,IF(_alltxt,GROUPAGGREGATE(CHOOSE({1,2},_txtm,_txtm),{"group","counta"}),#N/A),
_tvals,INDEX(_tfreqs,,1),
_tcounts,INDEX(_tfreqs,,2),
_dcount,COUNTA(UNIQUE(_m)),
_long,IF(_hastxt,MAX(LEN(_txtm)),#N/A),
_short,IF(_hastxt,MIN(LEN(_txtm)),#N/A),
_mosttxt,TEXTJOIN(
",",
TRUE,
INDEX(
_tvals,
XMATCH(MAX(_tcounts),_tcounts),
1
)
)&" ("&MAX(_tcounts)&")",
_leasttxt,TEXTJOIN(
",",
TRUE,
INDEX(
_tvals,
XMATCH(MIN(_tcounts),_tcounts),
1
)
)&" ("&MIN(_tcounts)&")",
_mintxt,INDEX(SORT(_tvals),1),
_maxtxt,INDEX(SORT(_tvals,,-1),1),
IF(
AND(_alltxt,r<16),
#N/A,
CHOOSE(
r-1,
_sum,
_mean,
_cnt,
_mode,
_stddev,
_var_s,
_stderr,
_kurt,
_skew,
_conf,
_q_one,
_median,
_q_three,
_range,
_dcount,
_rows,
IF(_alltxt,_mintxt,_min),
IF(_alltxt,_maxtxt,_max),
_long,
_short,
_mosttxt,
_leasttxt
)
)
)
)
)
)
)
);
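How the text/number split works: for an all-text column (_alltxt=TRUE), output rows 2 through 15 ("Sum" through "Range") return #N/A,
and the "Minimum"/"Maximum" rows switch to the alphabetically first and last text values; otherwise CHOOSE(r-1,...) returns the value
whose position matches the label in _stats. For example, an all-text column containing {"a";"b";"a"} would give "a (2)" for
"Most common text" and "b (1)" for "Least common text", assuming GROUPAGGREGATE returns grouped counts as described above.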
*/
DESCRIBE =LAMBDA(data,has_header,LET(rng,IF(has_header,INDEX(data,2,1):INDEX(data,ROWS(data),COLUMNS(data)),data),_header,IF(has_header,INDEX(data,1,1):INDEX(data,1,COLUMNS(data)),"Column"&SEQUENCE(1,COLUMNS(data))),_stats,{"Statistic";"Sum";"Mean";"Count";"Mode";"Standard Deviation";"Sample Variance";"Standard Error";"Kurtosis";"Skewness";"Confidence Level (95.0%)";"1st quartile";"Median";"3rd quartile";"Range";"Distinct count";"Rows";"Minimum";"Maximum";"Length of longest text";"Length of shortest text";"Most common text";"Least common text"},MAKEARRAY(COUNTA(_stats),COLUMNS(rng)+1,LAMBDA(r,c,IFS(c=1,INDEX(_stats,r,1),r=1,INDEX(_header,1,c-1),TRUE,LET(_m,INDEX(rng,,c-1),_istxt,ISTEXT(_m),_hastxt,OR(_istxt),_cnt,COUNT(_m),_alltxt,SUM(N(_istxt))=ROWS(_m),_mean,AVERAGE(_m),_median,MEDIAN(_m),_stddev,STDEV.S(_m),_stderr,_stddev/SQRT(_cnt),_mode,MODE.SNGL(_m),_var_s,VAR.S(_m),_kurt,KURT(_m),_skew,SKEW(_m),_max,MAX(_m),_min,MIN(_m),_range,_max-_min,_sum,SUM(_m),_conf,CONFIDENCE.T(0.05,_stddev,_cnt),_q_one,QUARTILE.EXC(_m,1),_q_three,QUARTILE.EXC(_m,3),_rows,ROWS(_m),_txtm,FILTER(_m,_istxt),_tfreqs,IF(_alltxt,GROUPAGGREGATE(CHOOSE({1,2},_txtm,_txtm),{"group","counta"}),#N/A),_tvals,INDEX(_tfreqs,,1),_tcounts,INDEX(_tfreqs,,2),_dcount,COUNTA(UNIQUE(_m)),_long,IF(_hastxt,MAX(LEN(_txtm)),#N/A),_short,IF(_hastxt,MIN(LEN(_txtm)),#N/A),_mosttxt,TEXTJOIN(",",TRUE,INDEX(_tvals,XMATCH(MAX(_tcounts),_tcounts),1))&" ("&MAX(_tcounts)&")",_leasttxt,TEXTJOIN(",",TRUE,INDEX(_tvals,XMATCH(MIN(_tcounts),_tcounts),1))&" ("&MIN(_tcounts)&")",_mintxt,INDEX(SORT(_tvals),1),_maxtxt,INDEX(SORT(_tvals,,-1),1),IF(AND(_alltxt,r<16),#N/A,CHOOSE(r-1,_sum,_mean,_cnt,_mode,_stddev,_var_s,_stderr,_kurt,_skew,_conf,_q_one,_median,_q_three,_range,_dcount,_rows,IF(_alltxt,_mintxt,_min),IF(_alltxt,_maxtxt,_max),_long,_short,_mosttxt,_leasttxt))))))));