Skip to content

Instantly share code, notes, and snippets.

@takakishi
Last active February 26, 2024 21:03
Show Gist options
  • Save takakishi/2d36ec78095087cc4ec711637442f22a to your computer and use it in GitHub Desktop.
Save takakishi/2d36ec78095087cc4ec711637442f22a to your computer and use it in GitHub Desktop.

Useful R functions for summary statistics similar to Stata, using {janitor} and {rstatix}.

Libraries

library(dplyr)
library(rstatix)
library(janitor)
library(gapminder)
library(httr)

# Load the custom functions (sumr() and tab()) from a GitHub Gist.
# source() reads the URL directly and evaluates the file in the global
# environment, so no manual download + eval(parse()) step is needed.
# NOTE: this executes remote code in your session; review the gist (or pin a
# specific revision in the URL) before running it.
source(
  "https://gist.githubusercontent.com/takakishi/2d36ec78095087cc4ec711637442f22a/raw/sumstats.R",
  encoding = "UTF-8"
)

# Display a snippet of the Gapminder dataset
gapminder
# A tibble: 1,704 × 6
#    country     continent  year lifeExp      pop gdpPercap
#    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
#  1 Afghanistan Asia       1952    28.8  8425333      779.
#  2 Afghanistan Asia       1957    30.3  9240934      821.
#  3 Afghanistan Asia       1962    32.0 10267083      853.
#  4 Afghanistan Asia       1967    34.0 11537966      836.
#  5 Afghanistan Asia       1972    36.1 13079460      740.
#  6 Afghanistan Asia       1977    38.4 14880372      786.
#  7 Afghanistan Asia       1982    39.9 12881816      978.
#  8 Afghanistan Asia       1987    40.8 13867957      852.
#  9 Afghanistan Asia       1992    41.7 16317921      649.
# 10 Afghanistan Asia       1997    41.8 22227415      635.

Basic summary statistics

The sumr() function wraps rstatix::get_summary_stats() and reports only the key measures (n, mean, sd, min, max, median), so you neither have to set the show argument on every call nor wade through the full default output shown below.

# For the entire dataset
gapminder %>% rstatix::get_summary_stats()
# A tibble: 4 × 13
#   variable      n     min          max    median        q1         q3        iqr       mad       mean          sd       se      ci
#   <chr>     <dbl>   <dbl>        <dbl>     <dbl>     <dbl>      <dbl>      <dbl>     <dbl>      <dbl>       <dbl>    <dbl>   <dbl>
# 1 gdpPercap  1704   241.      113523.     3532.     1202.      9325.      8123.     4008.      7215.       9857.   2.39e+2 4.68e+2
# 2 lifeExp    1704    23.6         82.6      60.7      48.2       70.8       22.6      16.1       59.5        12.9  3.13e-1 6.14e-1
# 3 pop        1704 60011   1318683096   7023596.  2793664   19585222.  16791558.  7841474.  29601212.  106157897.   2.57e+6 5.04e+6
# 4 year       1704  1952         2007      1980.     1966.      1993.        27.5      22.2     1980.         17.3  4.18e-1 8.2 e-1

Use sumr() to generate basic summary statistics for the entire dataset, for selected variables only, or separately for each group of a grouped data frame.

# For the entire dataset
gapminder %>% sumr()
# A tibble: 4 × 7
#   variable      n       mean          sd     min          max    median
#   <chr>     <dbl>      <dbl>       <dbl>   <dbl>        <dbl>     <dbl>
# 1 gdpPercap  1704     7215.       9857.    241.      113523.     3532. 
# 2 lifeExp    1704       59.5        12.9    23.6         82.6      60.7
# 3 pop        1704 29601212.  106157897.  60011   1318683096   7023596. 
# 4 year       1704     1980.         17.3  1952         2007      1980. 
gapminder %>% sumr(lifeExp, pop)
# A tibble: 2 × 7
#   variable     n       mean          sd     min          max    median
#   <chr>    <dbl>      <dbl>       <dbl>   <dbl>        <dbl>     <dbl>
# 1 lifeExp   1704       59.5        12.9    23.6         82.6      60.7
# 2 pop       1704 29601212.  106157897.  60011   1318683096   7023596. 
gapminder %>% 
  dplyr::group_by(continent) %>%
  sumr(lifeExp, pop) %>% 
  dplyr::arrange(variable)
# A tibble: 10 × 8
# Groups:   continent [5]
#    continent variable     n       mean           sd       min          max     median
#    <fct>     <chr>    <dbl>      <dbl>        <dbl>     <dbl>        <dbl>      <dbl>
#  1 Africa    lifeExp    624       48.9         9.15      23.6         76.4       47.8
#  2 Americas  lifeExp    300       64.7         9.35      37.6         80.7       67.0
#  3 Asia      lifeExp    396       60.1        11.9       28.8         82.6       61.8
#  4 Europe    lifeExp    360       71.9         5.43      43.6         81.8       72.2
#  5 Oceania   lifeExp     24       74.3         3.80      69.1         81.2       73.7
#  6 Africa    pop        624  9916003.   15490923.     60011    135031164    4579311  
#  7 Americas  pop        300 24504795.   50979430.    662850    301139947    6227510  
#  8 Asia      pop        396 77038722.  206885205.    120447   1318683096   14530830. 
#  9 Europe    pop        360 17169765.   20519438.    147962     82400996    8551125  
# 10 Oceania   pop         24  8874672.    6506342.   1994794     20434176    6403492. 

Frequency Tables

Compare dplyr::count() with the custom tab() function, which adds a percent column and a Total row to the frequency counts.

gapminder %>% dplyr::count(continent)
# A tibble: 5 × 2
#   continent     n
#   <fct>     <int>
# 1 Africa      624
# 2 Americas    300
# 3 Asia        396
# 4 Europe      360
# 5 Oceania      24
gapminder %>% tab(continent)
# continent    n percent
# Africa     624  36.62%
# Americas   300  17.61%
# Asia       396  23.24%
# Europe     360  21.13%
# Oceania     24   1.41%
# Total     1704 100.00%

Cross-Tabulations

For more complex data exploration, janitor::tabyl() simplifies cross-tabulation operations.

gapminder %>% dplyr::count(continent, year)
# A tibble: 60 × 3
#    continent  year     n
#    <fct>     <int> <int>
#  1 Africa     1952    52
#  2 Africa     1957    52
#  3 Africa     1962    52
#  4 Africa     1967    52
#  5 Africa     1972    52
#  6 Africa     1977    52
#  7 Africa     1982    52
#  8 Africa     1987    52
#  9 Africa     1992    52
# 10 Africa     1997    52
# … with 50 more rows
gapminder %>% janitor::tabyl(continent, year)
# continent 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
# Africa      52   52   52   52   52   52   52   52   52   52   52   52
# Americas    25   25   25   25   25   25   25   25   25   25   25   25
# Asia        33   33   33   33   33   33   33   33   33   33   33   33
# Europe      30   30   30   30   30   30   30   30   30   30   30   30
# Oceania      2    2    2    2    2    2    2    2    2    2    2    2
library(dplyr)
library(rlang)
library(rstatix)
library(skimr)
library(janitor)
#' Compact summary statistics (n, mean, sd, min, max, median)
#'
#' Thin wrapper around `rstatix::get_summary_stats()` that always requests the
#' same six statistics through its `show` argument. The summary is computed
#' via `dplyr::group_modify()`, so it is produced once per group when `data`
#' is grouped and once for the whole data frame otherwise.
#'
#' @param data A data frame, optionally grouped with `dplyr::group_by()`.
#' @param ... Unquoted variables to summarise, forwarded to
#'   `rstatix::get_summary_stats()`. When omitted, the selection is left to
#'   `get_summary_stats()` (e.g. `gapminder %>% sumr()` summarises every
#'   numeric column).
#' @return The value of `dplyr::group_modify()`: one row per summarised
#'   variable (per group, if grouped) with the columns `variable`, `n`,
#'   `mean`, `sd`, `min`, `max` and `median`.
#' @examples
#' sumr(mtcars, mpg, hp)
#' sumr(dplyr::group_by(mtcars, cyl), mpg)
sumr <- function(data, ...) {
  # Capture the selection as quosures so it can be spliced into the
  # per-group call below.
  vars <- rlang::enquos(...)
  shown_stats <- c("n", "mean", "sd", "min", "max", "median")

  # Every call is namespace-qualified and no pipe is used, so the function
  # also works when dplyr/rstatix are installed but not attached.
  dplyr::group_modify(
    data,
    function(.x, .y) {
      rstatix::get_summary_stats(.x, !!!vars, show = shown_stats)
    }
  )
}
#' Frequency table with a total row and formatted percentages
#'
#' Builds a `janitor::tabyl()` of the requested variable(s), appends a total
#' row with `janitor::adorn_totals()`, and formats the percentages with two
#' decimal places via `janitor::adorn_pct_formatting()`.
#'
#' @param data Object to tabulate; piped into `janitor::tabyl()`.
#' @param ... Variables to tabulate, forwarded unchanged to
#'   `janitor::tabyl()`.
#' @return The value returned by `janitor::adorn_pct_formatting()`.
tab <- function(data, ...) {
  # Raw counts and proportions for the requested variable(s).
  counts <- data %>% janitor::tabyl(...)

  # Append the total row before the proportions are turned into strings.
  with_totals <- janitor::adorn_totals(counts, "row")

  janitor::adorn_pct_formatting(with_totals, digits = 2)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment