Skip to content

Instantly share code, notes, and snippets.

@alekrutkowski
Last active July 10, 2020 10:07
Show Gist options
  • Save alekrutkowski/275c8c94292b386cc68d43f7b48ee1de to your computer and use it in GitHub Desktop.
Save alekrutkowski/275c8c94292b386cc68d43f7b48ee1de to your computer and use it in GitHub Desktop.
Data imports from Eurostat to ECFIN.B2's Tax Assessment Framework
library(magrittr)
# requires also `eurodata` package
# https://github.com/alekrutkowski/eurodata
# Helpers -----------------------------------------------------------------
msg <- function(obj, txt='\n') {
message(txt)
obj
}
`%except%` <- setdiff
factorToInteger <- function(fct)
fct %>%
as.character %>%
as.integer
dimsWithDimNames <- function(dt) {
cols <-
colnames(dt) %except% c('value_','geo')
dt[, (cols) := lapply(cols,
. %>%
paste0('=',dt[[.]]))]
}
obsSubset <- relevant_countries <- function(geo, x, w, country.group.vec) {
geo %in% country.group.vec &
!is.na(x) &
!is.na(w)
}
weighted <- function(geo, FUN, val.vec, weight.vec, country.group.vec) {
x <- val.vec
w <- weight.vec
obs.subset <- obsSubset(geo,x,w,country.group.vec)
FUN(x[obs.subset],
w[obs.subset])
}
weightedAvg <- function(geo, val.vec, weight.vec, country.group.vec)
weighted(geo, FUN=stats::weighted.mean, val.vec, weight.vec, country.group.vec)
weightedStdDev <- function(geo, val.vec, weight.vec, country.group.vec)
sqrt(weighted(geo, FUN=Hmisc::wtd.var, val.vec, weight.vec, country.group.vec))
LAFplus <- function(geo, val.vec, weight.vec, country.group.vec)
weightedAvg(geo, val.vec, weight.vec, country.group.vec) +
0.4*weightedStdDev(geo, val.vec, weight.vec, country.group.vec)
LAFminus <- function(geo, val.vec, weight.vec, country.group.vec)
weightedAvg(geo, val.vec, weight.vec, country.group.vec) -
0.4*weightedStdDev(geo, val.vec, weight.vec, country.group.vec)
# Main --------------------------------------------------------------------
codes <-
c('lfsa_ergaed' # Employment rates by sex, age and educational attainment level (%)
,'lfsa_urgaed' # Unemployment rates by sex, age and educational attainment level (%)
,'lfsa_ewhuis' # Average number of usual weekly hours of work in main job, by sex, professional status, full-time/part-time and occupation (hours)
,'nama_10_gdp' # GDP and main components (output, expenditure and income)
)
EU.countries <-
c('BE'
,'BG'
,'CZ'
,'DK'
,'DE'
,'EE'
,'IE'
,'EL'
,'ES'
,'FR'
,'HR'
,'IT'
,'CY'
,'LV'
,'LT'
,'LU'
,'HU'
,'MT'
,'NL'
,'AT'
,'PL'
,'PT'
,'RO'
,'SI'
,'SK'
,'FI'
,'SE'
,'UK'
)
EA.countries <-
c('BE'
,'DE'
,'EE'
,'IE'
,'EL'
,'ES'
,'FR'
,'IT'
,'CY'
,'LV'
,'LT'
,'LU'
,'MT'
,'NL'
,'AT'
,'PT'
,'SI'
,'SK'
,'FI'
)
dta.list <-
codes %>%
sapply(. %>%
msg %>%
eurodata::importData() %>%
data.table::as.data.table() %>%
.[, time := time %>% factorToInteger] %>%
.[, flags_ := NULL])
dta.list.modif <-
dta.list %>%
sapply(. %>% {NULL})
transformsFor__lfsa_ergaed__lfsa_urgaed <-
function(.list,
.names=c('lfsa_ergaed' # Employment rates by sex, age and educational attainment level (%)
,'lfsa_urgaed'))
sapply(.names,
function(name)
if (name %in% .names)
.list[[name]] %>%
.[time == max(time)] %>%
.[geo %in% EU.countries] %>%
.[age == 'Y20-64'] %>%
.[isced11 %in% c('TOTAL','ED0-2')] %>%
.[unit == 'PC'],
simplify=FALSE)
dta.list.modif <-
transformsFor__lfsa_ergaed__lfsa_urgaed(dta.list)
dta.list.modif$lfsa_ewhuis <-
dta.list$lfsa_ewhuis %>%
.[time == max(time)] %>%
.[geo %in% EU.countries] %>%
.[sex == 'F'] %>%
.[isco08 %in% c('TOTAL')] %>%
.[worktime == 'TOTAL'] %>%
.[wstatus == 'EMP'] %>%
.[unit == 'HR']
dta.list.modif$nama_10_gdp <-
dta.list$nama_10_gdp %>%
.[na_item == 'B1GQ'] %>%
.[time == max(time)] %>%
.[unit == 'CP_MEUR']
dta.list.modif.reshaped <-
names(dta.list.modif) %>%
sapply(function(name)
dta.list.modif[[name]] %>%
dimsWithDimNames %>%
data.table::dcast(geo ~ ...,
value.var='value_',
sep=', ') %>%
data.table::setnames(colnames(.) %except% 'geo',
paste0(name,': ',colnames(.) %except% 'geo')))
dta.all <-
Reduce(function(dt1,dt2)
merge(dt1, dt2, by='geo'),
dta.list.modif.reshaped) %>%
.[, gdp := .[[colnames(.) %>% grep('nama_10_gdp',.,value=TRUE)]]] # shorter alias
indic.colnames <-
colnames(dta.all) %>%
grep('nama|gdp|geo',.,value=TRUE, invert=TRUE)
summary.stats <-
dta.all %>%
data.table::melt(id.vars='geo',
measure.vars=indic.colnames) %>%
merge(dta.all[, c('geo','gdp')], by='geo') %>%
.[,
.(`EU weighted average` = weightedAvg(geo, value, gdp, EU.countries)
,`Euro Area weighted average` = weightedAvg(geo, value, gdp, EA.countries)
,`Relatively high` = LAFplus(geo, value, gdp, EU.countries)
,`Relatively low` = LAFminus(geo, value, gdp, EU.countries))
,by='variable'] # across geo
col.names <-
summary.stats$variable %>%
as.character
stats.names <-
colnames(summary.stats) %except% 'variable'
final.table <-
dta.all %>%
.[, gdp := NULL] %>%
.[order(match(geo, EU.countries))] %>%
rbind(summary.stats[, -'variable'] %>%
data.table::transpose() %>%
data.table::setnames(colnames(.),
col.names) %>%
.[, geo := stats.names],
fill=TRUE) %T>%
openxlsx::write.xlsx('Estat data for TAF.xlsx')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment