Skip to content

Instantly share code, notes, and snippets.

@tmasjc
Created February 2, 2018 13:27
Show Gist options
  • Save tmasjc/e1c556e55f6f76dc62c460114b5fc140 to your computer and use it in GitHub Desktop.
Save tmasjc/e1c556e55f6f76dc62c460114b5fc140 to your computer and use it in GitHub Desktop.
Explore chinese outbound tourism data from UNWTO. #rstats #unwto
# China Outbound Tourism
library(readxl)
library(tidyr)
library(ggplot2)
library(broom)
# read data
cot <- read_excel("~/Downloads/outbound_tourism_china.xlsx", skip = 5)
# mod columns name
colnames(cot) <- c("country", "series", 1995:2016, "pct_chg_16_15")
# set ggplot2 palette
old <- theme_set(theme_minimal())
pal = scale_color_brewer(palette = "Set3")
# Which series has the most complete data? (TFR)
cot %>% group_by(series) %>% summarise(n = n()) %>% arrange(-n)
# Convert to tidy format
long <- cot %>%
filter(series == "TFR") %>%
select(-pct_chg_16_15, -series) %>%
gather(year, arrivals, -country)
long
## TODO: How To Select A Series?
# Which country left?
long %>%
group_by(country) %>%
summarise(sum = sum(arrivals, na.rm = TRUE)) %>%
arrange(-sum)
# Which countries are the most popular? ---------------------------------------
# Top 10 country in 2016
top10_2016 <- long %>% filter(year == 2016) %>% top_n(wt = arrivals, n = 10) %>% pull(country)
top10_2016
df <- long %>% filter(country %in% top10_2016)
df
df %>%
ggplot(aes(year, arrivals, col = country, group = country)) +
geom_line() +
theme(legend.position = "bottom") +
pal
df %>%
ggplot(aes(year, arrivals, col = country, group = country)) +
geom_line() +
scale_y_log10() +
theme(legend.position = "bottom") +
pal
df %>%
ggplot(aes(year, arrivals, col = country, group = country)) +
geom_smooth(se = FALSE) +
scale_y_log10() +
theme(legend.position = "bottom") +
pal
# Which countries grow the fastest? -----------------------------------------
long <- long %>%
group_by(country) %>%
arrange(year) %>%
mutate(growth = (arrivals - lag(arrivals)) / lag(arrivals))
long %>% arrange(country, year)
top20_growth <- long %>%
summarise(growth = mean(growth, na.rm = TRUE)) %>%
top_n(n = 20, wt = growth) %>%
pull(country)
top20_growth
long %>%
summarise(growth = mean(growth, na.rm = TRUE)) %>%
top_n(n = 20, wt = growth) %>%
ggplot(aes(country, growth)) +
geom_point(aes(col = country)) +
scale_color_discrete(guide = "none") +
theme(axis.text.x = element_text(angle = 60))
long %>%
filter(country %in% top20_growth) %>%
ggplot(aes(year, arrivals, col = country, group = country)) +
geom_line() +
scale_y_log10() +
theme(legend.position = "bottom")
long %>%
filter(country %in% top20_growth) %>%
ggplot(aes(year, arrivals, col = country, group = country)) +
geom_smooth(se = FALSE) +
scale_y_log10() +
theme(legend.position = "bottom")
# Measure goodness of fit -------------------------------------------------
long
mods <- long %>%
group_by(country) %>%
nest() %>%
# do lm() on every row
mutate(model = purrr::map(data, ~lm(log(arrivals) ~ as.integer(year), data = .)))
mods
# use broom here
mods_sum <- mods %>% unnest(model %>% purrr::map(glance))
mods_sum
# compare r2
p <- mods_sum %>%
ggplot(aes(r.squared, reorder(country, r.squared))) +
geom_point()
p
# use USA as a benchmark
usa.row <- mods_sum %>%
arrange(-r.squared) %>%
with(., which(country == "United States of America"))
p + geom_hline(yintercept = usa.row-1, col = "salmon", lty = 5)
@tmasjc
Copy link
Author

tmasjc commented Feb 2, 2018

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment