Useful R functions for summary statistics similar to Stata, using {janitor} and {rstatix}.
library(dplyr)
library(rstatix)
library(janitor)
library(gapminder)
library(httr)
# Load the custom functions from a GitHub Gist (presumably the sumr() and
# tab() helpers used below -- their definitions live in the remote file).
#
# NOTE(review): this evaluates remote code in the current session. Consider
# pinning the URL to a specific gist revision, or vendoring sumstats.R, so the
# code that runs cannot change underneath you.
#
# The call is kept at top level (not wrapped in a function or local()) so that
# the definitions are created in the calling environment, as before.
eval(parse(
  text = content(
    # Fail loudly on HTTP errors (404, 5xx, ...) instead of parsing an error
    # page as R code; stop_for_status() returns the response on success.
    stop_for_status(
      GET("https://gist.githubusercontent.com/takakishi/2d36ec78095087cc4ec711637442f22a/raw/sumstats.R")
    ),
    # `as` selects the output form; `type` is a MIME-type override and only
    # yielded text by accident.
    as = "text",
    encoding = "UTF-8"
  )
))
# Print the Gapminder tibble. The output below reproduces the header and the
# first 10 of its 1,704 rows.
gapminder
# A tibble: 1,704 × 6
# country continent year lifeExp pop gdpPercap
# <fct> <fct> <int> <dbl> <int> <dbl>
# 1 Afghanistan Asia 1952 28.8 8425333 779.
# 2 Afghanistan Asia 1957 30.3 9240934 821.
# 3 Afghanistan Asia 1962 32.0 10267083 853.
# 4 Afghanistan Asia 1967 34.0 11537966 836.
# 5 Afghanistan Asia 1972 36.1 13079460 740.
# 6 Afghanistan Asia 1977 38.4 14880372 786.
# 7 Afghanistan Asia 1982 39.9 12881816 978.
# 8 Afghanistan Asia 1987 40.8 13867957 852.
# 9 Afghanistan Asia 1992 41.7 16317921 649.
# 10 Afghanistan Asia 1997 41.8 22227415 635.
The `sumr()` function streamlines summary statistics by reporting only the key measures, without the need to set the `show` argument on every call. This addresses the overload of information produced by the default `rstatix::get_summary_stats()` output, shown below.
# For the entire dataset: the default rstatix output prints 13 columns for
# each of the 4 numeric variables
gapminder %>% rstatix::get_summary_stats()
# A tibble: 4 × 13
# variable n min max median q1 q3 iqr mad mean sd se ci
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 gdpPercap 1704 241. 113523. 3532. 1202. 9325. 8123. 4008. 7215. 9857. 2.39e+2 4.68e+2
# 2 lifeExp 1704 23.6 82.6 60.7 48.2 70.8 22.6 16.1 59.5 12.9 3.13e-1 6.14e-1
# 3 pop 1704 60011 1318683096 7023596. 2793664 19585222. 16791558. 7841474. 29601212. 106157897. 2.57e+6 5.04e+6
# 4 year 1704 1952 2007 1980. 1966. 1993. 27.5 22.2 1980. 17.3 4.18e-1 8.2 e-1
Use `sumr()` to generate basic summary statistics for the entire dataset, or pass variable names to restrict the output to specific variables. It also works on grouped data.
# For the entire dataset: 7 columns per variable instead of 13
gapminder %>% sumr()
# A tibble: 4 × 7
# variable n mean sd min max median
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 gdpPercap 1704 7215. 9857. 241. 113523. 3532.
# 2 lifeExp 1704 59.5 12.9 23.6 82.6 60.7
# 3 pop 1704 29601212. 106157897. 60011 1318683096 7023596.
# 4 year 1704 1980. 17.3 1952 2007 1980.
# For selected variables only
gapminder %>% sumr(lifeExp, pop)
# A tibble: 2 × 7
# variable n mean sd min max median
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 lifeExp 1704 59.5 12.9 23.6 82.6 60.7
# 2 pop 1704 29601212. 106157897. 60011 1318683096 7023596.
# By group: group the data first, then summarise the selected variables.
# The result is sorted by variable and, as the "Groups:" line in the output
# shows, is still grouped by continent.
gapminder %>%
dplyr::group_by(continent) %>%
sumr(lifeExp, pop) %>%
dplyr::arrange(variable)
# A tibble: 10 × 8
# Groups: continent [5]
# continent variable n mean sd min max median
# <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Africa lifeExp 624 48.9 9.15 23.6 76.4 47.8
# 2 Americas lifeExp 300 64.7 9.35 37.6 80.7 67.0
# 3 Asia lifeExp 396 60.1 11.9 28.8 82.6 61.8
# 4 Europe lifeExp 360 71.9 5.43 43.6 81.8 72.2
# 5 Oceania lifeExp 24 74.3 3.80 69.1 81.2 73.7
# 6 Africa pop 624 9916003. 15490923. 60011 135031164 4579311
# 7 Americas pop 300 24504795. 50979430. 662850 301139947 6227510
# 8 Asia pop 396 77038722. 206885205. 120447 1318683096 14530830.
# 9 Europe pop 360 17169765. 20519438. 147962 82400996 8551125
# 10 Oceania pop 24 8874672. 6506342. 1994794 20434176 6403492.
Compare `dplyr::count()` with the custom `tab()` function for frequency counts: in addition to the counts, `tab()` reports each category's percentage and a total row.
# dplyr::count(): counts only
gapminder %>% dplyr::count(continent)
# A tibble: 5 × 2
# continent n
# <fct> <int>
# 1 Africa 624
# 2 Americas 300
# 3 Asia 396
# 4 Europe 360
# 5 Oceania 24
# tab(): counts plus a percent column and a Total row
gapminder %>% tab(continent)
# continent n percent
# Africa 624 36.62%
# Americas 300 17.61%
# Asia 396 23.24%
# Europe 360 21.13%
# Oceania 24 1.41%
# Total 1704 100.00%
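For more complex data exploration, `janitor::tabyl()` simplifies cross-tabulation: where `dplyr::count()` returns one row per combination of the two variables, `tabyl()` returns a wide table with one row per continent and one column per year.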
# Long format: one row per continent-year combination (60 rows)
gapminder %>% dplyr::count(continent, year)
# A tibble: 60 × 3
# continent year n
# <fct> <int> <int>
# 1 Africa 1952 52
# 2 Africa 1957 52
# 3 Africa 1962 52
# 4 Africa 1967 52
# 5 Africa 1972 52
# 6 Africa 1977 52
# 7 Africa 1982 52
# 8 Africa 1987 52
# 9 Africa 1992 52
# 10 Africa 1997 52
# … with 50 more rows
# Wide format: continents in rows, years in columns
gapminder %>% janitor::tabyl(continent, year)
# continent 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 2002 2007
# Africa 52 52 52 52 52 52 52 52 52 52 52 52
# Americas 25 25 25 25 25 25 25 25 25 25 25 25
# Asia 33 33 33 33 33 33 33 33 33 33 33 33
# Europe 30 30 30 30 30 30 30 30 30 30 30 30
# Oceania 2 2 2 2 2 2 2 2 2 2 2 2