Skip to content

Instantly share code, notes, and snippets.

@aravindhebbali
Last active September 24, 2017 08:50
Show Gist options
  • Save aravindhebbali/85fac536f563ae3fd8e2605fd56a7086 to your computer and use it in GitHub Desktop.
Save aravindhebbali/85fac536f563ae3fd8e2605fd56a7086 to your computer and use it in GitHub Desktop.
Working with Categorical Data in R
# install
# Install only the packages that are not already available, so that re-running
# the script does not download and reinstall everything each time.
required_pkgs <- c("forcats", "readr", "dplyr", "ggplot2")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# library
library(forcats)
library(readr)
library(dplyr)
library(ggplot2)
# import data
# Read web.csv from the rsquaredacademy datasets repository into `ecom`,
# then show the result.
ecom <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rsquaredacademy/datasets/master/web.csv"
)
ecom
# tabulate referrers
# One row per distinct referrer, with the number of rows for it in column `n`.
count(ecom, referrer)
# average page visits by referrers
# Per-referrer summary: mean of n_pages (`page`), mean of duration (`tos`)
# and the number of rows (`n`).
refer_summary <- summarise(
  group_by(ecom, referrer),
  page = mean(n_pages),
  tos = mean(duration),
  n = n()
)

# Mean pages per referrer, referrers in their default axis order.
ggplot(data = refer_summary, mapping = aes(x = page, y = referrer)) +
  geom_point()

# Same plot, with referrers reordered by their mean page count.
ggplot(
  data = refer_summary,
  mapping = aes(x = page, y = fct_reorder(referrer, page))
) +
  geom_point()
# referrer frequency
# Bar chart of referrers, levels ordered by how often each one occurs.
ggplot(
  data = mutate(ecom, ref = fct_infreq(referrer)),
  mapping = aes(x = ref)
) +
  geom_bar()

# Same chart with the frequency ordering reversed.
ggplot(
  data = mutate(ecom, ref = fct_rev(fct_infreq(referrer))),
  mapping = aes(x = ref)
) +
  geom_bar()
# import data
# Read web_traffic.csv from the rsquaredacademy datasets repository into
# `traffic`, then show the result.
traffic <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rsquaredacademy/datasets/master/web_traffic.csv"
)
traffic
# tabulate referrer
# Count how many times each value of the `traffics` column occurs.
fct_count(traffic$traffics)
# collapse referrer categories
# Merge the individual sites into two groups, "social" and "search",
# then count the resulting levels.
traffic2 <- traffic$traffics %>%
  fct_collapse(
    social = c("facebook", "twitter", "instagram"),
    search = c("google", "bing", "yahoo")
  )
fct_count(traffic2)
# lump infrequent referrers
# With no arguments fct_lump() chooses the cut-off itself.
fct_lump(traffic$traffics) %>%
  table()

# retain top 3 referrers
# Positive n keeps the n most frequent levels and lumps the rest together.
fct_lump(traffic$traffics, n = 3) %>%
  table()
# lump together referrers with < 10% traffic
# Positive prop lumps every level whose share of rows is below prop.
fct_lump(traffic$traffics, prop = 0.1) %>%
  table()

# lump together referrers with < 15% traffic
fct_lump(traffic$traffics, prop = 0.15) %>%
  table()
# retain 3 referrers with lowest traffic
# Negative n keeps the -n least frequent levels and lumps the rest together.
fct_lump(traffic$traffics, n = -3) %>%
  table()

# retain referrers with at most 10% traffic
# Negative prop keeps the levels whose share of rows is at most -prop and
# lumps the more frequent ones together (no fixed number of levels is kept).
fct_lump(traffic$traffics, prop = -0.1) %>%
  table()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment