Last active
July 10, 2020 10:07
-
-
Save alekrutkowski/275c8c94292b386cc68d43f7b48ee1de to your computer and use it in GitHub Desktop.
Data imports from Eurostat to ECFIN.B2's Tax Assessment Framework
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(magrittr) | |
# requires also `eurodata` package | |
# https://github.com/alekrutkowski/eurodata | |
# Helpers ----------------------------------------------------------------- | |
msg <- function(obj, txt='\n') { | |
message(txt) | |
obj | |
} | |
`%except%` <- setdiff | |
factorToInteger <- function(fct) | |
fct %>% | |
as.character %>% | |
as.integer | |
dimsWithDimNames <- function(dt) { | |
cols <- | |
colnames(dt) %except% c('value_','geo') | |
dt[, (cols) := lapply(cols, | |
. %>% | |
paste0('=',dt[[.]]))] | |
} | |
obsSubset <- relevant_countries <- function(geo, x, w, country.group.vec) { | |
geo %in% country.group.vec & | |
!is.na(x) & | |
!is.na(w) | |
} | |
weighted <- function(geo, FUN, val.vec, weight.vec, country.group.vec) { | |
x <- val.vec | |
w <- weight.vec | |
obs.subset <- obsSubset(geo,x,w,country.group.vec) | |
FUN(x[obs.subset], | |
w[obs.subset]) | |
} | |
weightedAvg <- function(geo, val.vec, weight.vec, country.group.vec) | |
weighted(geo, FUN=stats::weighted.mean, val.vec, weight.vec, country.group.vec) | |
weightedStdDev <- function(geo, val.vec, weight.vec, country.group.vec) | |
sqrt(weighted(geo, FUN=Hmisc::wtd.var, val.vec, weight.vec, country.group.vec)) | |
LAFplus <- function(geo, val.vec, weight.vec, country.group.vec) | |
weightedAvg(geo, val.vec, weight.vec, country.group.vec) + | |
0.4*weightedStdDev(geo, val.vec, weight.vec, country.group.vec) | |
LAFminus <- function(geo, val.vec, weight.vec, country.group.vec) | |
weightedAvg(geo, val.vec, weight.vec, country.group.vec) - | |
0.4*weightedStdDev(geo, val.vec, weight.vec, country.group.vec) | |
# Main -------------------------------------------------------------------- | |
codes <- | |
c('lfsa_ergaed' # Employment rates by sex, age and educational attainment level (%) | |
,'lfsa_urgaed' # Unemployment rates by sex, age and educational attainment level (%) | |
,'lfsa_ewhuis' # Average number of usual weekly hours of work in main job, by sex, professional status, full-time/part-time and occupation (hours) | |
,'nama_10_gdp' # GDP and main components (output, expenditure and income) | |
) | |
EU.countries <- | |
c('BE' | |
,'BG' | |
,'CZ' | |
,'DK' | |
,'DE' | |
,'EE' | |
,'IE' | |
,'EL' | |
,'ES' | |
,'FR' | |
,'HR' | |
,'IT' | |
,'CY' | |
,'LV' | |
,'LT' | |
,'LU' | |
,'HU' | |
,'MT' | |
,'NL' | |
,'AT' | |
,'PL' | |
,'PT' | |
,'RO' | |
,'SI' | |
,'SK' | |
,'FI' | |
,'SE' | |
,'UK' | |
) | |
EA.countries <- | |
c('BE' | |
,'DE' | |
,'EE' | |
,'IE' | |
,'EL' | |
,'ES' | |
,'FR' | |
,'IT' | |
,'CY' | |
,'LV' | |
,'LT' | |
,'LU' | |
,'MT' | |
,'NL' | |
,'AT' | |
,'PT' | |
,'SI' | |
,'SK' | |
,'FI' | |
) | |
dta.list <- | |
codes %>% | |
sapply(. %>% | |
msg %>% | |
eurodata::importData() %>% | |
data.table::as.data.table() %>% | |
.[, time := time %>% factorToInteger] %>% | |
.[, flags_ := NULL]) | |
dta.list.modif <- | |
dta.list %>% | |
sapply(. %>% {NULL}) | |
transformsFor__lfsa_ergaed__lfsa_urgaed <- | |
function(.list, | |
.names=c('lfsa_ergaed' # Employment rates by sex, age and educational attainment level (%) | |
,'lfsa_urgaed')) | |
sapply(.names, | |
function(name) | |
if (name %in% .names) | |
.list[[name]] %>% | |
.[time == max(time)] %>% | |
.[geo %in% EU.countries] %>% | |
.[age == 'Y20-64'] %>% | |
.[isced11 %in% c('TOTAL','ED0-2')] %>% | |
.[unit == 'PC'], | |
simplify=FALSE) | |
dta.list.modif <- | |
transformsFor__lfsa_ergaed__lfsa_urgaed(dta.list) | |
dta.list.modif$lfsa_ewhuis <- | |
dta.list$lfsa_ewhuis %>% | |
.[time == max(time)] %>% | |
.[geo %in% EU.countries] %>% | |
.[sex == 'F'] %>% | |
.[isco08 %in% c('TOTAL')] %>% | |
.[worktime == 'TOTAL'] %>% | |
.[wstatus == 'EMP'] %>% | |
.[unit == 'HR'] | |
dta.list.modif$nama_10_gdp <- | |
dta.list$nama_10_gdp %>% | |
.[na_item == 'B1GQ'] %>% | |
.[time == max(time)] %>% | |
.[unit == 'CP_MEUR'] | |
dta.list.modif.reshaped <- | |
names(dta.list.modif) %>% | |
sapply(function(name) | |
dta.list.modif[[name]] %>% | |
dimsWithDimNames %>% | |
data.table::dcast(geo ~ ..., | |
value.var='value_', | |
sep=', ') %>% | |
data.table::setnames(colnames(.) %except% 'geo', | |
paste0(name,': ',colnames(.) %except% 'geo'))) | |
dta.all <- | |
Reduce(function(dt1,dt2) | |
merge(dt1, dt2, by='geo'), | |
dta.list.modif.reshaped) %>% | |
.[, gdp := .[[colnames(.) %>% grep('nama_10_gdp',.,value=TRUE)]]] # shorter alias | |
indic.colnames <- | |
colnames(dta.all) %>% | |
grep('nama|gdp|geo',.,value=TRUE, invert=TRUE) | |
summary.stats <- | |
dta.all %>% | |
data.table::melt(id.vars='geo', | |
measure.vars=indic.colnames) %>% | |
merge(dta.all[, c('geo','gdp')], by='geo') %>% | |
.[, | |
.(`EU weighted average` = weightedAvg(geo, value, gdp, EU.countries) | |
,`Euro Area weighted average` = weightedAvg(geo, value, gdp, EA.countries) | |
,`Relatively high` = LAFplus(geo, value, gdp, EU.countries) | |
,`Relatively low` = LAFminus(geo, value, gdp, EU.countries)) | |
,by='variable'] # across geo | |
col.names <- | |
summary.stats$variable %>% | |
as.character | |
stats.names <- | |
colnames(summary.stats) %except% 'variable' | |
final.table <- | |
dta.all %>% | |
.[, gdp := NULL] %>% | |
.[order(match(geo, EU.countries))] %>% | |
rbind(summary.stats[, -'variable'] %>% | |
data.table::transpose() %>% | |
data.table::setnames(colnames(.), | |
col.names) %>% | |
.[, geo := stats.names], | |
fill=TRUE) %T>% | |
openxlsx::write.xlsx('Estat data for TAF.xlsx') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment