Last active
January 24, 2018 05:12
-
-
Save ikashnitsky/7c11a138e22b27d0e2db18bd9fd2035f to your computer and use it in GitHub Desktop.
Code for the Data acquisition in R series (posts 1-3) -- https://habrahabr.ru/post/345664/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################ | |
# | |
# ikashnitsky.github.io 2017-12-27 | |
# Data acquisition in R - Parts 1/4, 2/4, 3/4 | |
# For Russian translation at https://habrahabr.ru/post/345664/ | |
# Ilya Kashnitsky, ilya.kashnitsky@gmail.com | |
# | |
################################################################################ | |
# load required packages | |
library(tidyverse) # data manipulation and viz | |
# built-in --------------------------------------------------------------------- | |
# Swiss Fertility and Socioeconomic Indicators (1888) Data. Let's check the difference in fertility based of rurality and domination of Catholic population. | |
swiss %>% | |
ggplot(aes(x = Agriculture, y = Fertility, | |
color = Catholic > 50))+ | |
geom_point()+ | |
stat_ellipse()+ | |
theme_minimal(base_family = "mono") | |
ggsave("swiss.png", width = 8, height = 5) | |
# gapminder -------------------------------------------------------------------- | |
library(gapminder) | |
gapminder %>% | |
ggplot(aes(x = year, y = lifeExp, | |
color = continent))+ | |
geom_jitter(size = 1, alpha = .2, width = .75)+ | |
stat_summary(geom = "path", fun.y = mean, size = 1)+ | |
theme_minimal(base_family = "mono") | |
ggsave("gapminder.png", width = 8, height = 5) | |
# URL -------------------------------------------------------------------------- | |
library(tidyverse) | |
galton <- read_csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/HistData/Galton.csv") | |
galton %>% | |
ggplot(aes(x = father, y = height))+ | |
geom_point(alpha = .2)+ | |
stat_smooth(method = "lm")+ | |
theme_minimal(base_family = "mono") | |
ggsave("galton.png", width = 8, height = 5) | |
# UNZIP ------------------------------------------------------------------------ | |
# create a directory for the unzipped data | |
ifelse(!dir.exists("unzipped"), dir.create("unzipped"), "Directory already exists") | |
# specify the URL of the archive | |
url_zip <- "http://www.nyc.gov/html/nypd/downloads/zip/analysis_and_planning/citywide_historical_crime_data_archive.zip" | |
# download, unzip and read data | |
f <- tempfile() | |
download.file(url_zip, destfile = f) | |
unzip(f, exdir = "unzipped/.") | |
# if we want to keep the .zip file | |
path_unzip <- "unzipped/data_archive.zip" | |
ifelse(!file.exists(path_unzip), | |
download.file(url_zip, path_unzip, mode="wb"), | |
'file alredy exists') | |
unzip(path_unzip, exdir = "unzipped/.") | |
library(readxl) | |
murder <- read_xls("unzipped/Web Data 2010-2011/Seven Major Felony Offenses 2000 - 2011.xls", | |
sheet = 1, range = "A5:M13") %>% | |
filter(OFFENSE %>% substr(1, 6) == "MURDER") %>% | |
gather("year", "value", 2:13) %>% | |
mutate(year = year %>% as.numeric()) | |
# plot | |
murder %>% | |
ggplot(aes(year, value))+ | |
geom_point()+ | |
stat_smooth(method = "lm")+ | |
theme_minimal(base_family = "mono")+ | |
labs(title = "Murders in New York") | |
ggsave("new-york.png", width = 8, height = 5) | |
# Figshare --------------------------------------------------------------------- | |
library(rfigshare) | |
# find the dataset | |
# fs_search("ice hockey players") # not working | |
url <- fs_download(article_id = "3394735") | |
hockey <- read_csv(url) | |
hockey %>% | |
ggplot(aes(x = year, y = height))+ | |
geom_jitter(size = 2, color = "#35978f", alpha = .1, width = .25)+ | |
stat_smooth(method = "lm", size = 1)+ | |
ylab("height, cm")+ | |
xlab("year of competition")+ | |
scale_x_continuous(breaks = seq(2005, 2015, 5), labels = seq(2005, 2015, 5))+ | |
theme_minimal(base_family = "mono") | |
ggsave("ice-hockey.png", width = 8, height = 5) | |
# Eurostat --------------------------------------------------------------------- | |
library(tidyverse) | |
library(eurostat) | |
library(lubridate) | |
library(viridis) | |
search_eurostat("life expectancy") | |
# download the selected dataset | |
e0 <- get_eurostat("demo_mlexpec") | |
e0 %>% | |
filter(! sex == "T", | |
age == "Y65", | |
geo %in% c("DE", "FR", "IT", "RU", "ES", "UK")) %>% | |
ggplot(aes(x = time %>% year(), y = values, color = sex))+ | |
geom_path()+ | |
facet_wrap(~ geo, ncol = 3)+ | |
labs(y = "Life expectancy at age 65", x = NULL)+ | |
theme_minimal(base_family = "mono") | |
ggsave("eurostat.png", width = 8, height = 5) | |
# World Bank ------------------------------------------------------------------- | |
library(tidyverse) | |
library(wbstats) | |
# search for a dataset of interest | |
wbsearch("fertility") %>% View | |
# fetch the selected dataset | |
df_wb <- wb(indicator = "SH.MMR.RISK.ZS", startdate = 2000, enddate = 2015) | |
# have look at the data for one year | |
df_wb %>% filter(date == 2015) %>% View | |
df_wb %>% | |
filter(iso2c %in% c("V4", "V1", "1W")) %>% | |
ggplot(aes(x = date %>% as.numeric(), y = value, color = country))+ | |
geom_path(size = 1)+ | |
scale_color_brewer(NULL, palette = "Dark2")+ | |
labs(x = NULL, y = NULL, title = "Lifetime risk of maternal death (%)")+ | |
theme_minimal(base_family = "mono")+ | |
theme(panel.grid.minor = element_blank(), | |
legend.position = c(.8, .9)) | |
ggsave("worldbank.png", width = 8, height = 5) | |
# OECD ------------------------------------------------------------------------- | |
library(tidyverse) | |
library(viridis) | |
library(OECD) | |
search_dataset("unemployment") %>% View | |
df_oecd <- get_dataset("AVD_DUR") | |
names(df_oecd) <- names(df_oecd) %>% tolower() | |
df_oecd %>% | |
filter(country %in% c("EU16", "EU28", "USA"), sex == "MEN", ! age == "1524") %>% | |
ggplot(aes(obstime, age, fill = obsvalue))+ | |
geom_tile()+ | |
scale_fill_viridis("Months", option = "B")+ | |
scale_x_discrete(breaks = seq(1970, 2015, 5) %>% paste)+ | |
facet_wrap(~ country, ncol = 1)+ | |
labs(x = NULL, y = "Age groups", | |
title = "Average duration of unemployment in months, males")+ | |
theme_minimal(base_family = "mono") | |
ggsave("oecd.png", width = 8, height = 5) | |
# WID -------------------------------------------------------------------------- | |
library(tidyverse) | |
#install.packages("devtools") | |
devtools::install_github("WIDworld/wid-r-tool") | |
library(wid) | |
?wid_series_type | |
?wid_concepts | |
df_wid <- download_wid( | |
indicators = "shweal", # Shares of personal wealth | |
areas = c("FR", "GB"), # In France an Italy | |
perc = c("p90p100", "p99p100") # Top 1% and top 10% | |
) | |
df_wid %>% | |
ggplot(aes(x = year, y = value, color = country)) + | |
geom_path()+ | |
labs(title = "Top 1% and top 10% personal wealth shares in France and Great Britain", | |
y = "top share")+ | |
facet_wrap(~ percentile)+ | |
theme_minimal(base_family = "mono") | |
ggsave("wid.png", width = 8, height = 5) | |
# HMD -------------------------------------------------------------------------- | |
# load required packages | |
library(HMDHFDplus) | |
library(tidyverse) | |
library(purrr) | |
# help function to list the available countries | |
country <- getHMDcountries() | |
# remove optional populations | |
opt_pop <- c("FRACNP", "DEUTE", "DEUTW", "GBRCENW", "GBR_NP") | |
country <- country[!country %in% opt_pop] | |
# temporary function to download HMD data for a simgle county (dot = input) | |
tempf_get_hmd <- . %>% readHMDweb("Exposures_1x1", ik_user_hmd, ik_pass_hmd) | |
# download the data iteratively for all countries using purrr::map() | |
exposures <- country %>% map(tempf_get_hmd) | |
# data transformation to apply to each county dataframe | |
tempf_trans_data <- . %>% | |
select(Year, Age, Female, Male) %>% | |
filter(Year %in% 2012) %>% | |
select(-Year) %>% | |
transmute(age = Age, ratio = Male / Female * 100) | |
# perform transformation | |
df_hmd <- exposures %>% | |
map(tempf_trans_data) %>% | |
bind_rows(.id = "country") | |
# summarize all ages older than 90 (too jerky) | |
df_hmd_90 <- df_hmd %>% | |
filter(age %in% 90:110) %>% | |
group_by(country) %>% | |
summarise(ratio = ratio %>% mean(na.rm = T)) %>% | |
ungroup() %>% | |
transmute(country, age = 90, ratio) | |
# insert summarized 90+ | |
df_hmd_fin <- bind_rows(df_hmd %>% filter(!age %in% 90:110), df_hmd_90) | |
# finaly - plot | |
df_hmd_fin %>% | |
ggplot(aes(age, ratio, color = country, group = country))+ | |
geom_hline(yintercept = 100, color = "grey50", size = 1)+ | |
geom_line(size = 1)+ | |
scale_y_continuous(limits = c(0, 120), | |
expand = c(0, 0), | |
breaks = seq(0, 120, 20))+ | |
scale_x_continuous(limits = c(0, 90), | |
expand = c(0, 0), | |
breaks = seq(0, 80, 20))+ | |
facet_wrap(~country, ncol = 6)+ | |
theme_minimal(base_family = "mono", base_size = 15)+ | |
theme(legend.position = "none", | |
panel.border = element_rect(size = .5, fill = NA, | |
color = "grey50"))+ | |
labs(x = "Age", | |
y = "Sex ratio, males per 100 females", | |
title = "Sex ratio in all countries from Human Mortality Database", | |
subtitle = "HMD 2012, via HMDHFDplus by @timriffe1", | |
caption = "ikashnitsky.github.io") | |
ggsave("hmd.png", width = 10, height = 12) | |
# wpp 2015 --------------------------------------------------------------------- | |
library(wpp2015) | |
library(tidyverse) | |
library(ggridges) | |
library(viridis) | |
# get the UN country names | |
data(UNlocations) | |
countries <- UNlocations %>% pull(name) %>% paste | |
# data on male life expectancy at birth | |
data(e0M) | |
e0M %>% | |
filter(country %in% countries) %>% | |
select(-last.observed) %>% | |
gather(period, value, 3:15) %>% | |
ggplot(aes(x = value, y = period %>% fct_rev()))+ | |
geom_density_ridges(aes(fill = period))+ | |
scale_fill_viridis(discrete = T, option = "B", direction = -1, | |
begin = .1, end = .9)+ | |
labs(x = "Male life expectancy at birth", | |
y = "Period", | |
title = "Global convergence in male life expectancy at birth since 1950", | |
subtitle = "UNPD World Population Prospects 2015 Revision, via wpp2015", | |
caption = "ikashnitsky.github.io")+ | |
theme_minimal(base_family = "mono", base_size = 14)+ | |
theme(legend.position = "none") | |
ggsave("wpp2015.png", width = 10, height = 7) | |
# ESS (Jorge) ------------------------------------------------------------- | |
library(ess) | |
library(tidyverse) | |
# help gunction to see the available countries | |
show_countries() | |
# check the available rounds for a selected country | |
show_country_rounds("Netherlands") | |
# get the full dataset of the last (8) round | |
df_ess <- ess_rounds(8, your_email = ik_email) | |
# select a variable and calculate mean value | |
df_ess_select <- df_ess %>% | |
bind_rows() %>% | |
select(idno, cntry, trstplc) %>% | |
group_by(cntry) %>% | |
mutate(avg = trstplc %>% mean(na.rm = T)) %>% | |
ungroup() %>% | |
mutate(cntry = cntry %>% as_factor() %>% fct_reorder(avg)) | |
df_ess_select %>% | |
ggplot(aes(trstplc, fill = avg))+ | |
geom_histogram()+ | |
scale_x_continuous(limits = c(0, 11), breaks = seq(2, 10, 2))+ | |
scale_fill_gradient("Average\ntrust\nscore", | |
low = "black", high = "aquamarine")+ | |
facet_wrap(~cntry, ncol = 6)+ | |
theme_minimal(base_family = "mono")+ | |
labs(x = "Trust score [0 -- 10]", | |
y = "# of respondents", | |
title = "Trust in police", | |
subtitle = "ESS wave 8 2017, via ess by @cimentadaj", | |
caption = "ikashnitsky.github.io") | |
ggsave("ess.png", width = 8, height = 6) | |
# American Census data ---------------------------------------------------- | |
library(tidycensus) | |
library(tidyverse) | |
library(viridis) | |
library(janitor) | |
library(sf) | |
# to use geom_sf we need the latest development version of ggplot2 | |
devtools::install_github("tidyverse/ggplot2", "develop") | |
library(ggplot2) | |
# you need a personal API key, available free at | |
# https://api.census.gov/data/key_signup.html | |
# normally, this key is to be stored in .Renviron | |
# see state and county codes and names | |
fips_codes %>% View | |
# the available variables | |
load_variables(year = 2015, dataset = "acs5") %>% View | |
# data on median age of population in Chicago | |
df_acs <- get_acs( | |
geography = "tract", | |
county = "Cook County", | |
state = "IL", | |
variables = "B01002_001E", | |
year = 2015, | |
key = ik_api_acs, | |
geometry = TRUE | |
) %>% clean_names() | |
# map the data | |
df_acs %>% | |
ggplot()+ | |
geom_sf(aes(fill = estimate %>% | |
cut(breaks = seq(20, 60, 10))), | |
color = NA)+ | |
scale_fill_viridis_d("Median age", begin = .4)+ | |
coord_sf(datum = NA)+ | |
theme_void(base_family = "mono")+ | |
theme(legend.position = c(.15, .15))+ | |
labs(title = "Median age of population in Chicago\nby census tracts\n", | |
subtitle = "ACS 2015, via tidycensus by @kyle_e_walker", | |
caption = "ikashnitsky.github.io", | |
x = NULL, y = NULL) | |
ggsave("tidycensus.png", width = 6, height = 6) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment