# Created October 18, 2016 22:58
# Gist: ajaypillarisetti/2b0417443a6b9ff8d7f8ae0913b50355 (save it to your computer to use it in GitHub Desktop)
# GBD Data Download
# Note: this file contains bidirectional Unicode text that may be interpreted or
# compiled differently than what appears below. To review, open the file in an
# editor that reveals hidden Unicode characters.
library(data.table)
library(httr)
library(plyr)
library(tools)

# Get data from web - don't run every time!
# GBD_links.txt is read as a table; only the entries of its first column (V1)
# that contain "http" are kept, as a character vector of download links.
urls <- grep(
  pattern = "http",
  x = as.data.table(read.table("GBD_links.txt"))$V1,
  value = TRUE
)
# Download one GBD results file into raw_data/.
#
# x: a single URL (character). The text following "gbd2013results/" in the URL
#    becomes the local file name under raw_data/.
#
# Returns the httr response object. An existing file with the same name is
# overwritten. A warning is raised when the server answers with an HTTP error
# status, because GET() itself does not fail on 4xx/5xx replies.
downloadR <- function(x){
  # Local file name = everything after the "gbd2013results/" marker
  target <- strsplit(x, "gbd2013results/")[[1]][2]
  if (is.na(target) || !nzchar(target)) {
    # Without this guard the download would be written to "raw_data/NA"
    stop("URL has no file name after 'gbd2013results/': ", x, call. = FALSE)
  }

  # write_disk() needs the destination directory to exist
  dir.create("raw_data", showWarnings = FALSE)

  response <- GET(x, write_disk(paste0("raw_data/", target), overwrite = TRUE))
  warn_for_status(response)
  response
}
# Download every link (one request per URL, with a text progress bar).
l_ply(urls, downloadR, .progress = "text")

# Compare the files that were requested with the files present in raw_data/.
downloadFiles <- list.files("raw_data")
# vapply() guarantees a character vector even when `urls` is empty
urlFiles <- vapply(strsplit(urls, "gbd2013results/"), `[[`, character(1), 2)
unique(urlFiles)
# Requested files that are not on disk
urlFiles[!(urlFiles %in% downloadFiles)]
# Age Key - "1-Under 5", "2-Early Neonatal", "3-Late Neonatal", "4-Post Neonatal",
# "5-1 to 4", "6-5 to 9", "7-10 to 14", "8-15 to 19", "9-20 to 24", "10-25 to 29",
# "11-30 to 34", "12-35 to 39", "13-40 to 44", "14-45 to 49", "15-50 to 54",
# "16-55 to 59", "17-60 to 64", "18-65 to 69", "19-70 to 74", "20-75 to 79",
# "21-80 plus", "22-All Ages", "23-5-14 years", "24-15-49 years", "25-50-69 years",
# "26-70+ years", "27-Age-standardized"

# Only CSV files in the working directory are candidates for munging; the
# anchored pattern keeps directories and any other files out of `csvs`.
csvs <- list.files(pattern = "\\.csv$")
geographies <- unique(gsub("IHME-Data-|\\.csv$|-DALYs|-YLDs|-YLLs|-Deaths", "", csvs))

# Labels of aggregate (non-country) geographies to drop.
# NOTE(review): these are used as unanchored, case-sensitive regular
# expressions, so e.g. 'Africa' also matches "South Africa" and 'America'
# also matches "American Samoa" - confirm that dropping those is intended.
exclusions <- c('World Bank', 'WHO', ' - WB', 'Region', 'African Union', 'Africa', 'Andean Latin America', 'America', 'Caribbean', 'Income', 'income', 'Europe', 'World', 'Asia', 'Australasia', 'Developed', 'Developing', 'Commonwealth', 'Gulf Cooperation Council', 'ECA', 'ECE', 'ECLAC', 'EU-15', 'ECLAC', 'ESCAP', 'EU AND EFTA', 'ESCWA', 'G20', 'Global', 'EU and EFTA', 'OECD Countries', 'Oceania')

# NOTE(review): several state names appear twice; presumably they are
# differently encoded spellings of the same name - verify before de-duplicating.
mexican_states <- c("Chihuahua", "Sonora", "Coahuila", "Durango", "Oaxaca", "Tamaulipas", "Jalisco", "Zacatecas", "Baja California Sur", "Chiapas", "Veracruz", "Baja California", "Nuevo León", "Guerrero", "San Luis Potosí", "Michoacán de Ocampo", "Campeche", "Sinaloa", "Quintana Roo", "Querétaro", "Yucatán", "Puebla", "Guanajuato", "Nayarit", "Tabasco", "México", "Hidalgo", "Querétaro", "Colima", "Aguascalientes", "Morelos", "Tlaxcala", "Mexico City", "Distrito Federal", "San Luis Potosí", "Yucatán", "Nuevo León")
chinese_provinces <- c("Anhui", "Fujian", "Gansu", "Guangdong", "Guizhou", "Hainan", "Hebei", "Heilongjiang", "Henan", "Hubei", "Hunan", "Jiangsu", "Jiangxi", "Jilin", "Liaoning", "Qinghai", "Shaanxi", "Shandong", "Shanxi", "Sichuan", "Yunnan", "Zhejiang", "Taiwan", "Chongqing", "Guangxi", "Ningxia", "Xinjiang", "Shanghai", "Tianjin", "Inner Mongolia")
english_regions <- c("North East", "East Midlands", "Yorkshire and the Humber", "South West", "West Midlands", "East of England", "North West", "London", "South East")

# Build the exclusion pattern once. From here on `exclusions` is a single
# regex alternation (aggregate labels plus English regions), not a vector.
exclusions <- paste(c(exclusions, english_regions), collapse = "|")
csvs <- csvs[!(csvs %like% exclusions)]
# Reduce one raw GBD CSV to the 2013, both-sexes, "All risk factors" rows used
# downstream and save the result as an RDS file.
#
# x: file name of a GBD CSV, readable from the working directory.
#
# Rows kept:
#   * age_name 'All Ages' for the causes listed in `causes`
#   * age ids 10 to 21 for the same causes, collapsed into a single "25+" row
#     per location/cause/sex/year/metric by summing nm_mean, nm_lower, nm_upper
#   * age id 1 for "Lower respiratory infections"
#
# Side effect: writes ../rds_allrisk/<x with its .csv extension replaced by .rds>
# (the directory is created if it is missing) and emits a "<file> saved" message.
# Returns NULL invisibly (the value of message()).
mungeR <- function(x){
  #import full file
  hap <- fread(x, showProgress = FALSE)

  #filters common to every subset: 2013, both sexes, all risk factors
  core <- hap[year == 2013 & sex_name == 'Both' & risk_name == "All risk factors"]

  #non-ALRI disease causes
  causes <- c('All causes', 'Tracheal, bronchus and lung cancer', 'Ischemic heart disease', 'Chronic obstructive pulmonary disease','Cerebrovascular disease', 'Cataract')

  #All Ages, selected causes
  hap2013all <- core[age_name == 'All Ages' & cause_name %in% causes]
  #25+ (age ids 10 to 21), selected causes
  hap2013gte25 <- core[age %in% 10:21 & cause_name %in% causes]
  #burden for ALRI only in <5 (age id 1)
  alri2013 <- core[age == 1 & cause_name == "Lower respiratory infections"]

  #row bind all data
  hap2013 <- rbind(hap2013all, hap2013gte25, alri2013)
  cols <- colnames(hap2013)

  #select only columns of interest
  #location, cause, sex, age, year, mean, lower, upper, metric
  #NOTE(review): selection is by position, so it depends on the column layout
  #of the downloaded files - confirm the layout before reusing on other extracts
  hap2013 <- hap2013[, cols[c(2,4,8,10,11,13,14,15,23)], with = FALSE]

  #split apart by Under 5 + All Ages and 25+
  is_single_group <- hap2013$age_name %in% c('Under 5', 'All Ages')
  hap2013all <- hap2013[is_single_group]
  #Collapse the age groups for 25+ by summing the nm_mean, nm_lower, and nm_upper and add a new age_name (25+)
  hap2013gte25 <- hap2013[!is_single_group,
    list(age_name = "25+", nm_mean = sum(nm_mean), nm_lower = sum(nm_lower), nm_upper = sum(nm_upper)),
    by = 'location_name,cause_name,sex_name,year,metric_name']

  #bind all together; the collapsed table has a different column order, so bind by name
  hap2013 <- rbind(hap2013all, hap2013gte25, use.names = TRUE)

  # save as rds; only a trailing ".csv" extension is replaced
  filename <- sub("\\.csv$", ".rds", x)
  dir.create('../rds_allrisk', showWarnings = FALSE)
  saveRDS(hap2013, file = paste0('../rds_allrisk/', filename))
  message(paste(filename, 'saved'))
}
# Munge every retained CSV.
# NOTE(review): .parallel = TRUE relies on a registered foreach backend; none is
# registered in the visible code - confirm one is set up before this runs.
l_ply(csvs, mungeR, .parallel = TRUE)

# List the munged files; the anchored pattern only accepts names ending in ".rds".
files <- list.files('/Volumes/Petunia/GBD2013/raw_data/rds_allrisk', pattern = "\\.rds$", full.names = TRUE)

# Geography = file name without directory, "IHME-Data-" prefix, measure suffix
# and extension. basename() removes the directory, so the path does not have to
# be repeated inside the regular expression.
geographies <- unique(gsub("^IHME-Data-|\\.rds$|-DALYs|-YLDs|-YLLs|-Deaths", "", basename(files)))
geographies <- geographies[!(geographies %like% exclusions)]
rdss <- files
# Combine the RDS files of one geography into a single table, recode the
# measure, cause and age labels, and save the result.
#
# x: geography name (character), as it appears in the file names listed in the
#    global `rdss` between the "IHME-Data-" prefix and the measure suffix.
#
# Files are selected by exact match on that geography key. A word-boundary
# regex is not used because it would also pick up other geographies whose name
# merely contains x (e.g. "Guinea" inside "Papua New Guinea" or
# "Guinea-Bissau") and would misread names containing regex metacharacters.
#
# Side effect: writes ~/Dropbox/HAPIT3/data/gbd2013/<x without underscores>.rds.
# Stops with an error when no file belongs to x.
# Returns NULL invisibly (the value of saveRDS()).
combineR <- function(x){
  message(paste0("Try geography '", x, "'"))

  # Geography key of every file: drop the directory, the "IHME-Data-" prefix,
  # the measure suffix and the extension
  file_geography <- gsub('IHME-Data-|.rds|-DALYs|-YLDs|-YLLs|-Deaths', '', basename(rdss))
  toCombine <- rdss[file_geography == x]
  if (length(toCombine) == 0) {
    stop("No RDS files found for geography '", x, "'", call. = FALSE)
  }

  lCombined <- lapply(toCombine, readRDS)
  combined <- rbindlist(lCombined, use.names = TRUE)

  #clean
  setnames(combined, c('country_name', 'cause', 'sex', 'age_name', 'year', 'nm_mean', 'nm_lower', 'nm_upper', 'measure'))

  #measure labels
  combined[measure == 'DALYs', measure := 'dalys']
  combined[measure == 'Deaths', measure := 'death']
  combined[measure == 'YLDs', measure := 'yld']
  combined[measure == 'YLLs', measure := 'yll']

  #cause labels ('Cataract' is left unchanged)
  combined[cause == 'Tracheal, bronchus and lung cancer', cause := 'LC']
  combined[cause == 'Lower respiratory infections', cause := 'ALRI']
  combined[cause == 'Ischemic heart disease', cause := 'IHD']
  combined[cause == 'All causes', cause := 'Total']
  combined[cause == 'Chronic obstructive pulmonary disease', cause := 'COPD']
  combined[cause == 'Cerebrovascular disease', cause := 'Stroke']

  #age labels: `age` is a new column filled from age_name. Each `age`
  #assignment must come before the matching age_name rewrite, because the
  #filter uses the old age_name value. age_name itself stays 'All Ages' for
  #the all-ages rows.
  combined[age_name == 'All Ages', age := 'AllAge']
  combined[age_name == 'Under 5', age := 'Under 5 years']
  combined[age_name == 'Under 5', age_name := 'Under 5 years']
  combined[age_name == '25+', age := '25 and over']
  combined[age_name == '25+', age_name := '25 and over']

  #save out
  saveRDS(combined, paste0('~/Dropbox/HAPIT3/data/gbd2013/', gsub("_", "", x), '.rds'))
}
# Build and save one combined table per retained geography.
l_ply(.data = geographies, .fun = combineR)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.