Skip to content

Instantly share code, notes, and snippets.

@ikashnitsky
Last active January 24, 2018 05:12
Show Gist options
  • Save ikashnitsky/7c11a138e22b27d0e2db18bd9fd2035f to your computer and use it in GitHub Desktop.
Save ikashnitsky/7c11a138e22b27d0e2db18bd9fd2035f to your computer and use it in GitHub Desktop.
Code for the Data acquisition in R series (posts 1-3) -- https://habrahabr.ru/post/345664/
################################################################################
#
# ikashnitsky.github.io 2017-12-27
# Data acquisition in R - Parts 1/4, 2/4, 3/4
# For Russian translation at https://habrahabr.ru/post/345664/
# Ilya Kashnitsky, ilya.kashnitsky@gmail.com
#
################################################################################
# load required packages
library(tidyverse) # data manipulation and viz
# built-in ---------------------------------------------------------------------
# Swiss Fertility and Socioeconomic Indicators (1888) Data. Let's check the difference in fertility based of rurality and domination of Catholic population.
swiss %>%
ggplot(aes(x = Agriculture, y = Fertility,
color = Catholic > 50))+
geom_point()+
stat_ellipse()+
theme_minimal(base_family = "mono")
ggsave("swiss.png", width = 8, height = 5)
# gapminder --------------------------------------------------------------------
library(gapminder)
gapminder %>%
ggplot(aes(x = year, y = lifeExp,
color = continent))+
geom_jitter(size = 1, alpha = .2, width = .75)+
stat_summary(geom = "path", fun.y = mean, size = 1)+
theme_minimal(base_family = "mono")
ggsave("gapminder.png", width = 8, height = 5)
# URL --------------------------------------------------------------------------
library(tidyverse)
galton <- read_csv("https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/HistData/Galton.csv")
galton %>%
ggplot(aes(x = father, y = height))+
geom_point(alpha = .2)+
stat_smooth(method = "lm")+
theme_minimal(base_family = "mono")
ggsave("galton.png", width = 8, height = 5)
# UNZIP ------------------------------------------------------------------------
# create a directory for the unzipped data
ifelse(!dir.exists("unzipped"), dir.create("unzipped"), "Directory already exists")
# specify the URL of the archive
url_zip <- "http://www.nyc.gov/html/nypd/downloads/zip/analysis_and_planning/citywide_historical_crime_data_archive.zip"
# download, unzip and read data
f <- tempfile()
download.file(url_zip, destfile = f)
unzip(f, exdir = "unzipped/.")
# if we want to keep the .zip file
path_unzip <- "unzipped/data_archive.zip"
ifelse(!file.exists(path_unzip),
download.file(url_zip, path_unzip, mode="wb"),
'file alredy exists')
unzip(path_unzip, exdir = "unzipped/.")
library(readxl)
murder <- read_xls("unzipped/Web Data 2010-2011/Seven Major Felony Offenses 2000 - 2011.xls",
sheet = 1, range = "A5:M13") %>%
filter(OFFENSE %>% substr(1, 6) == "MURDER") %>%
gather("year", "value", 2:13) %>%
mutate(year = year %>% as.numeric())
# plot
murder %>%
ggplot(aes(year, value))+
geom_point()+
stat_smooth(method = "lm")+
theme_minimal(base_family = "mono")+
labs(title = "Murders in New York")
ggsave("new-york.png", width = 8, height = 5)
# Figshare ---------------------------------------------------------------------
library(rfigshare)
# find the dataset
# fs_search("ice hockey players") # not working
url <- fs_download(article_id = "3394735")
hockey <- read_csv(url)
hockey %>%
ggplot(aes(x = year, y = height))+
geom_jitter(size = 2, color = "#35978f", alpha = .1, width = .25)+
stat_smooth(method = "lm", size = 1)+
ylab("height, cm")+
xlab("year of competition")+
scale_x_continuous(breaks = seq(2005, 2015, 5), labels = seq(2005, 2015, 5))+
theme_minimal(base_family = "mono")
ggsave("ice-hockey.png", width = 8, height = 5)
# Eurostat ---------------------------------------------------------------------
library(tidyverse)
library(eurostat)
library(lubridate)
library(viridis)
search_eurostat("life expectancy")
# download the selected dataset
e0 <- get_eurostat("demo_mlexpec")
e0 %>%
filter(! sex == "T",
age == "Y65",
geo %in% c("DE", "FR", "IT", "RU", "ES", "UK")) %>%
ggplot(aes(x = time %>% year(), y = values, color = sex))+
geom_path()+
facet_wrap(~ geo, ncol = 3)+
labs(y = "Life expectancy at age 65", x = NULL)+
theme_minimal(base_family = "mono")
ggsave("eurostat.png", width = 8, height = 5)
# World Bank -------------------------------------------------------------------
library(tidyverse)
library(wbstats)
# search for a dataset of interest
wbsearch("fertility") %>% View
# fetch the selected dataset
df_wb <- wb(indicator = "SH.MMR.RISK.ZS", startdate = 2000, enddate = 2015)
# have look at the data for one year
df_wb %>% filter(date == 2015) %>% View
df_wb %>%
filter(iso2c %in% c("V4", "V1", "1W")) %>%
ggplot(aes(x = date %>% as.numeric(), y = value, color = country))+
geom_path(size = 1)+
scale_color_brewer(NULL, palette = "Dark2")+
labs(x = NULL, y = NULL, title = "Lifetime risk of maternal death (%)")+
theme_minimal(base_family = "mono")+
theme(panel.grid.minor = element_blank(),
legend.position = c(.8, .9))
ggsave("worldbank.png", width = 8, height = 5)
# OECD -------------------------------------------------------------------------
library(tidyverse)
library(viridis)
library(OECD)
search_dataset("unemployment") %>% View
df_oecd <- get_dataset("AVD_DUR")
names(df_oecd) <- names(df_oecd) %>% tolower()
df_oecd %>%
filter(country %in% c("EU16", "EU28", "USA"), sex == "MEN", ! age == "1524") %>%
ggplot(aes(obstime, age, fill = obsvalue))+
geom_tile()+
scale_fill_viridis("Months", option = "B")+
scale_x_discrete(breaks = seq(1970, 2015, 5) %>% paste)+
facet_wrap(~ country, ncol = 1)+
labs(x = NULL, y = "Age groups",
title = "Average duration of unemployment in months, males")+
theme_minimal(base_family = "mono")
ggsave("oecd.png", width = 8, height = 5)
# WID --------------------------------------------------------------------------
library(tidyverse)
#install.packages("devtools")
devtools::install_github("WIDworld/wid-r-tool")
library(wid)
?wid_series_type
?wid_concepts
df_wid <- download_wid(
indicators = "shweal", # Shares of personal wealth
areas = c("FR", "GB"), # In France an Italy
perc = c("p90p100", "p99p100") # Top 1% and top 10%
)
df_wid %>%
ggplot(aes(x = year, y = value, color = country)) +
geom_path()+
labs(title = "Top 1% and top 10% personal wealth shares in France and Great Britain",
y = "top share")+
facet_wrap(~ percentile)+
theme_minimal(base_family = "mono")
ggsave("wid.png", width = 8, height = 5)
# HMD --------------------------------------------------------------------------
# load required packages
library(HMDHFDplus)
library(tidyverse)
library(purrr)
# help function to list the available countries
country <- getHMDcountries()
# remove optional populations
opt_pop <- c("FRACNP", "DEUTE", "DEUTW", "GBRCENW", "GBR_NP")
country <- country[!country %in% opt_pop]
# temporary function to download HMD data for a simgle county (dot = input)
tempf_get_hmd <- . %>% readHMDweb("Exposures_1x1", ik_user_hmd, ik_pass_hmd)
# download the data iteratively for all countries using purrr::map()
exposures <- country %>% map(tempf_get_hmd)
# data transformation to apply to each county dataframe
tempf_trans_data <- . %>%
select(Year, Age, Female, Male) %>%
filter(Year %in% 2012) %>%
select(-Year) %>%
transmute(age = Age, ratio = Male / Female * 100)
# perform transformation
df_hmd <- exposures %>%
map(tempf_trans_data) %>%
bind_rows(.id = "country")
# summarize all ages older than 90 (too jerky)
df_hmd_90 <- df_hmd %>%
filter(age %in% 90:110) %>%
group_by(country) %>%
summarise(ratio = ratio %>% mean(na.rm = T)) %>%
ungroup() %>%
transmute(country, age = 90, ratio)
# insert summarized 90+
df_hmd_fin <- bind_rows(df_hmd %>% filter(!age %in% 90:110), df_hmd_90)
# finaly - plot
df_hmd_fin %>%
ggplot(aes(age, ratio, color = country, group = country))+
geom_hline(yintercept = 100, color = "grey50", size = 1)+
geom_line(size = 1)+
scale_y_continuous(limits = c(0, 120),
expand = c(0, 0),
breaks = seq(0, 120, 20))+
scale_x_continuous(limits = c(0, 90),
expand = c(0, 0),
breaks = seq(0, 80, 20))+
facet_wrap(~country, ncol = 6)+
theme_minimal(base_family = "mono", base_size = 15)+
theme(legend.position = "none",
panel.border = element_rect(size = .5, fill = NA,
color = "grey50"))+
labs(x = "Age",
y = "Sex ratio, males per 100 females",
title = "Sex ratio in all countries from Human Mortality Database",
subtitle = "HMD 2012, via HMDHFDplus by @timriffe1",
caption = "ikashnitsky.github.io")
ggsave("hmd.png", width = 10, height = 12)
# wpp 2015 ---------------------------------------------------------------------
library(wpp2015)
library(tidyverse)
library(ggridges)
library(viridis)
# get the UN country names
data(UNlocations)
countries <- UNlocations %>% pull(name) %>% paste
# data on male life expectancy at birth
data(e0M)
e0M %>%
filter(country %in% countries) %>%
select(-last.observed) %>%
gather(period, value, 3:15) %>%
ggplot(aes(x = value, y = period %>% fct_rev()))+
geom_density_ridges(aes(fill = period))+
scale_fill_viridis(discrete = T, option = "B", direction = -1,
begin = .1, end = .9)+
labs(x = "Male life expectancy at birth",
y = "Period",
title = "Global convergence in male life expectancy at birth since 1950",
subtitle = "UNPD World Population Prospects 2015 Revision, via wpp2015",
caption = "ikashnitsky.github.io")+
theme_minimal(base_family = "mono", base_size = 14)+
theme(legend.position = "none")
ggsave("wpp2015.png", width = 10, height = 7)
# ESS (Jorge) -------------------------------------------------------------
library(ess)
library(tidyverse)
# help gunction to see the available countries
show_countries()
# check the available rounds for a selected country
show_country_rounds("Netherlands")
# get the full dataset of the last (8) round
df_ess <- ess_rounds(8, your_email = ik_email)
# select a variable and calculate mean value
df_ess_select <- df_ess %>%
bind_rows() %>%
select(idno, cntry, trstplc) %>%
group_by(cntry) %>%
mutate(avg = trstplc %>% mean(na.rm = T)) %>%
ungroup() %>%
mutate(cntry = cntry %>% as_factor() %>% fct_reorder(avg))
df_ess_select %>%
ggplot(aes(trstplc, fill = avg))+
geom_histogram()+
scale_x_continuous(limits = c(0, 11), breaks = seq(2, 10, 2))+
scale_fill_gradient("Average\ntrust\nscore",
low = "black", high = "aquamarine")+
facet_wrap(~cntry, ncol = 6)+
theme_minimal(base_family = "mono")+
labs(x = "Trust score [0 -- 10]",
y = "# of respondents",
title = "Trust in police",
subtitle = "ESS wave 8 2017, via ess by @cimentadaj",
caption = "ikashnitsky.github.io")
ggsave("ess.png", width = 8, height = 6)
# American Census data ----------------------------------------------------
library(tidycensus)
library(tidyverse)
library(viridis)
library(janitor)
library(sf)
# to use geom_sf we need the latest development version of ggplot2
devtools::install_github("tidyverse/ggplot2", "develop")
library(ggplot2)
# you need a personal API key, available free at
# https://api.census.gov/data/key_signup.html
# normally, this key is to be stored in .Renviron
# see state and county codes and names
fips_codes %>% View
# the available variables
load_variables(year = 2015, dataset = "acs5") %>% View
# data on median age of population in Chicago
df_acs <- get_acs(
geography = "tract",
county = "Cook County",
state = "IL",
variables = "B01002_001E",
year = 2015,
key = ik_api_acs,
geometry = TRUE
) %>% clean_names()
# map the data
df_acs %>%
ggplot()+
geom_sf(aes(fill = estimate %>%
cut(breaks = seq(20, 60, 10))),
color = NA)+
scale_fill_viridis_d("Median age", begin = .4)+
coord_sf(datum = NA)+
theme_void(base_family = "mono")+
theme(legend.position = c(.15, .15))+
labs(title = "Median age of population in Chicago\nby census tracts\n",
subtitle = "ACS 2015, via tidycensus by @kyle_e_walker",
caption = "ikashnitsky.github.io",
x = NULL, y = NULL)
ggsave("tidycensus.png", width = 6, height = 6)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment