# romainx/lump.R

## lump.R
# Refs: https://forcats.tidyverse.org/reference/fct_lump.html

# A test data frame
# dplyr is attached explicitly so the script also runs in a fresh session:
# it provides `%>%`, `tibble()` and `count()`.
library(dplyr)

# Fix the RNG seed so the sampled categories and values are reproducible.
set.seed(2)
df <- tibble(
  cat = factor(sample(c("a", "b", "c", "d"), 10, replace = TRUE)),
  val = sample(1:10)
)
df %>% head()
# NOTE: the values below were captured from one particular session; the
# default sample() algorithm changed in R 3.6.0, so other R versions may
# print different rows for the same seed.
# A tibble: 6 x 2
#   cat     val
#   <fct> <int>
# 1 a         9
# 2 c         2
# 3 b         3
# 4 b         1
# 5 d         8
# 6 d        10

# Lumping by number of values -----

# Tally how many rows fall into each category
count(df, cat)
# A tibble: 4 x 2
#   cat       n
#   <fct> <int>
# 1 a         4
# 2 b         2
# 3 c         1
# 4 d         3

# Keep only the 2 most frequent categories, lumping together the remaining
# categories in the "Other" category.
# The lumped factor is built directly inside count(): count() already groups
# by the expressions it receives, so a separate group_by() is redundant and
# would leave the printed result grouped.
df %>%
  count(cat = forcats::fct_lump_n(cat, 2))
# A tibble: 3 x 2
#   cat       n
#   <fct> <int>
# 1 a         4
# 2 d         3
# 3 Other     3
	# Refs: https://forcats.tidyverse.org/reference/fct_lump.html

	# Reproducible toy data: a 4-level factor column and a shuffled 1..10 column.
	# NOTE(review): this tab-indented section repeats the statements at the top
	# of the file and rebuilds the same `df`; consider keeping only one copy.
	set.seed(2)
	df <- tibble(
	  cat = factor(sample(c("a", "b", "c", "d"), 10, replace = TRUE)),
	  val = sample(1:10)
	)
	head(df)
	# A tibble: 6 x 2
	#   cat     val
	#   <fct> <int>
	# 1 a         9
	# 2 c         2
	# 3 b         3
	# 4 b         1
	# 5 d         8
	# 6 d        10

	# Lumping by number of values -----

	# Tally how many rows fall into each category
	count(df, cat)
	# A tibble: 4 x 2
	#   cat       n
	#   <fct> <int>
	# 1 a         4
	# 2 b         2
	# 3 c         1
	# 4 d         3

	# Keep only the 2 most frequent categories, lumping together the remaining
	# categories in the "Other" category.
	# The lumped factor is built directly inside count(): count() already groups
	# by the expressions it receives, so a separate group_by() is redundant and
	# would leave the printed result grouped.
	df %>%
	  count(cat = forcats::fct_lump_n(cat, 2))
	# A tibble: 3 x 2
	#   cat       n
	#   <fct> <int>
	# 1 a         4
	# 2 d         3
	# 3 Other     3