Skip to content

Instantly share code, notes, and snippets.

@ajaypillarisetti
Created October 18, 2016 22:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ajaypillarisetti/2b0417443a6b9ff8d7f8ae0913b50355 to your computer and use it in GitHub Desktop.
Save ajaypillarisetti/2b0417443a6b9ff8d7f8ae0913b50355 to your computer and use it in GitHub Desktop.
GBD Data Download
library(data.table)
library(httr)
library(plyr)
library(tools)
#get data from web - don't run every time!
# Read the link list one line per entry. readLines() is used instead of
# read.table() because read.table() splits lines on whitespace and treats "#"
# and quote characters as syntax, all of which can legally appear in a URL.
urls <- trimws(readLines('GBD_links.txt', warn = FALSE))
# keep only the entries that contain a URL
urls <- grep("http", urls, value = TRUE)
# Download one GBD results file into raw_data/.
#
# x: URL of the file. The text following "gbd2013results/" in the URL is used
#    as the local file name inside raw_data/.
#
# Returns the httr response from GET(), or invisible NULL (with a warning) when
# the URL does not contain "gbd2013results/". An HTTP error status produces a
# warning instead of stopping, so a batch run over many URLs keeps going.
downloadR <- function(x){
  file_name <- strsplit(x, "gbd2013results/", fixed = TRUE)[[1]][2]
  if (is.na(file_name)) {
    # without this guard the file would be written to 'raw_data/NA'
    warning("Skipping URL without 'gbd2013results/': ", x, call. = FALSE)
    return(invisible(NULL))
  }
  # make sure the target directory exists before writing into it
  dir.create('raw_data', showWarnings = FALSE)
  response <- GET(x, write_disk(file.path('raw_data', file_name), overwrite = TRUE))
  # GET() does not fail on 4xx/5xx; the response body (an error page) is still
  # written to disk, so surface the problem explicitly
  warn_for_status(response)
  response
}
# download every linked file, showing a text progress bar
l_ply(urls, downloadR, .progress = 'text')

# compare what was requested with what is now present in raw_data/
downloadFiles <- list.files('raw_data')
# vapply() always yields a character vector (NA for a URL without the
# "gbd2013results/" marker), whereas sapply() changes its return type with
# the input
urlFiles <- vapply(
  strsplit(urls, "gbd2013results/"),
  function(parts) parts[2],
  character(1)
)
# explicit print() so the check is also visible when the script is source()d
print(unique(urlFiles))
# requested files that are not on disk
print(urlFiles[!(urlFiles %in% downloadFiles)])
# Age Key - "1-Under 5", "2-Early Neonatal", "3-Late Neonatal", "4-Post Neonatal", "5-1 to 4", "6-5 to 9", "7-10 to 14", "8-15 to 19", "9-20 to 24", "10-25 to 29", "11-30 to 34", "12-35 to 39", "13-40 to 44", "14-45 to 49", "15-50 to 54", "16-55 to 59", "17-60 to 64", "18-65 to 69", "19-70 to 74", "20-75 to 79", "21-80 plus", "22-All Ages", "23-5-14 years", "24-15-49 years", "25-50-69 years", "26-70+ years", "27-Age-standardized"
# Only pick up CSV files: a bare list.files() also returns any other file that
# happens to sit in the working directory, and each entry is later passed to
# fread() by mungeR().
csvs <- list.files(pattern = "\\.csv$")
# Geography = file name without the "IHME-Data-" prefix, the measure suffix and
# the extension. The dot is escaped and anchored so that only a literal
# trailing ".csv" is removed.
geographies <- unique(gsub('IHME-Data-|\\.csv$|-DALYs|-YLDs|-YLLs|-Deaths', '', csvs))
# Name fragments used to drop files by name (combined into one regex further
# down in the script).
exclusions <- c(
  "World Bank", "WHO", " - WB", "Region", "African Union", "Africa",
  "Andean Latin America", "America", "Caribbean", "Income", "income",
  "Europe", "World", "Asia", "Australasia", "Developed", "Developing",
  "Commonwealth", "Gulf Cooperation Council", "ECA", "ECE", "ECLAC",
  "EU-15", "ECLAC", "ESCAP", "EU AND EFTA", "ESCWA", "G20", "Global",
  "EU and EFTA", "OECD Countries", "Oceania"
)

# Names of Mexican states (some names are listed more than once).
mexican_states <- c(
  "Chihuahua", "Sonora", "Coahuila", "Durango", "Oaxaca", "Tamaulipas",
  "Jalisco", "Zacatecas", "Baja California Sur", "Chiapas", "Veracruz",
  "Baja California", "Nuevo León", "Guerrero", "San Luis Potosí",
  "Michoacán de Ocampo", "Campeche", "Sinaloa", "Quintana Roo", "Querétaro",
  "Yucatán", "Puebla", "Guanajuato", "Nayarit", "Tabasco", "México",
  "Hidalgo", "Querétaro", "Colima", "Aguascalientes", "Morelos", "Tlaxcala",
  "Mexico City", "Distrito Federal", "San Luis Potosí", "Yucatán",
  "Nuevo León"
)

# Names of Chinese provinces.
chinese_provinces <- c(
  "Anhui", "Fujian", "Gansu", "Guangdong", "Guizhou", "Hainan", "Hebei",
  "Heilongjiang", "Henan", "Hubei", "Hunan", "Jiangsu", "Jiangxi", "Jilin",
  "Liaoning", "Qinghai", "Shaanxi", "Shandong", "Shanxi", "Sichuan",
  "Yunnan", "Zhejiang", "Taiwan", "Chongqing", "Guangxi", "Ningxia",
  "Xinjiang", "Shanghai", "Tianjin", "Inner Mongolia"
)

# Names of English regions; these are excluded together with `exclusions`.
english_regions <- c(
  "North East", "East Midlands", "Yorkshire and the Humber", "South West",
  "West Midlands", "East of England", "North West", "London", "South East"
)
# Collapse the exclusion fragments and the English region names into a single
# alternation regex; from here on `exclusions` is that one pattern string.
# NOTE(review): %like% does a regex substring match, so a fragment such as
# 'Africa' or 'America' drops every file whose name merely contains it -
# confirm that no wanted geography is removed this way.
exclusions <- paste(c(exclusions, english_regions), collapse = "|")
# keep only the files whose name matches none of the fragments
csvs <- csvs[!(csvs %like% exclusions)]
# Reduce one raw GBD results CSV to the rows used downstream and save it as RDS.
#
# x: name of the CSV file to read (resolved against the working directory).
#
# From the rows for year 2013, sex 'Both' and risk 'All risk factors' it keeps:
#   * age_name 'All Ages' for the causes listed in `causes`
#   * age codes 10-21 (25-29 up to 80 plus in the age key) for the same causes,
#     summed into a single "25+" row per location/cause/sex/year/metric
#   * age code 1 (Under 5) for "Lower respiratory infections"
#
# Side effects: writes '../rds_allrisk/<x with .csv replaced by .rds>' and
# emits a message naming the saved file. Returns the (invisible) value of
# message().
mungeR <- function(x){
  # import full file
  hap <- fread(x, showProgress = FALSE)

  # filters shared by every subset: both sexes in 2013, all risk factors
  base2013 <- hap[year == 2013 & sex_name == 'Both' & risk_name == "All risk factors"]

  # non-ALRI disease causes
  causes <- c('All causes', 'Tracheal, bronchus and lung cancer', 'Ischemic heart disease', 'Chronic obstructive pulmonary disease', 'Cerebrovascular disease', 'Cataract')

  # All Ages, selected causes
  hap2013all <- base2013[age_name == 'All Ages' & cause_name %in% causes]
  # 25+ (age codes 10 to 21), selected causes
  hap2013gte25 <- base2013[age %in% c(10:21) & cause_name %in% causes]
  # burden for ALRI only in <5
  alri2013 <- base2013[age == 1 & cause_name == "Lower respiratory infections"]

  # row bind all data
  hap2013 <- rbind(hap2013all, hap2013gte25, alri2013)
  cols <- colnames(hap2013)

  # select only columns of interest, by position in the raw file:
  # location, cause, sex, age, year, mean, lower, upper, metric
  hap2013 <- hap2013[, cols[c(2, 4, 8, 10, 11, 13, 14, 15, 23)], with = FALSE]

  # split apart by Under 5 + All Ages and 25+
  hap2013all <- hap2013[(age_name %in% c('Under 5', 'All Ages'))]
  # collapse the age groups for 25+ by summing nm_mean, nm_lower and nm_upper,
  # and label the result with a new age_name ("25+")
  hap2013gte25 <- hap2013[
    !(age_name %in% c('Under 5', 'All Ages')),
    list(age_name = "25+", nm_mean = sum(nm_mean), nm_lower = sum(nm_lower), nm_upper = sum(nm_upper)),
    by = 'location_name,cause_name,sex_name,year,metric_name'
  ]

  # bind all together; the aggregated table has a different column order, so
  # match columns by name (result keeps the column order of hap2013all)
  hap2013 <- rbind(hap2013all, hap2013gte25, use.names = TRUE)

  # save as rds; only a literal trailing ".csv" is replaced (an unescaped
  # '.csv' pattern would also rewrite any "<char>csv" inside the name)
  filename <- sub('\\.csv$', '.rds', x)
  dir.create('../rds_allrisk', showWarnings = FALSE)
  saveRDS(hap2013, file = paste0('../rds_allrisk/', filename))
  message(paste(filename, 'saved'))
}
# munge every retained CSV
l_ply(csvs, mungeR, .parallel = TRUE)

# collect the per-file RDS outputs; the pattern is anchored so that only files
# ending in ".rds" are listed (a bare "rds" matches it anywhere in the name)
rds_dir <- '/Volumes/Petunia/GBD2013/raw_data/rds_allrisk'
files <- list.files(rds_dir, pattern = "\\.rds$", full.names = TRUE)
# geography = file name without directory, "IHME-Data-" prefix, measure suffix
# and extension
geographies <- unique(gsub('IHME-Data-|\\.rds$|-DALYs|-YLDs|-YLLs|-Deaths', '', basename(files)))
# drop the aggregate geographies matched by the exclusion pattern
geographies <- geographies[!geographies %like% exclusions]
rdss <- files
# Combine all RDS files of one geography into a single cleaned table and save it.
#
# x: geography name, as derived from the RDS file names.
#
# Reads the matching files from the global `rdss`, stacks them, renames the
# columns, recodes measure / cause / age labels and writes the result to
# '~/Dropbox/HAPIT3/data/gbd2013/<x without underscores>.rds'.
# Returns invisible NULL (with a warning) when no file belongs to `x`.
combineR <- function(x){
  message(paste0("Try geography '", x, "'"))

  # Select files whose geography is exactly `x`. Matching `x` as a regex with
  # word boundaries also picks up every other geography that contains it as a
  # word (e.g. "Guinea" inside "Guinea-Bissau") and breaks on names containing
  # regex metacharacters.
  file_geographies <- gsub('IHME-Data-|\\.rds$|-DALYs|-YLDs|-YLLs|-Deaths', '', basename(rdss))
  toCombine <- rdss[file_geographies == x]
  if (length(toCombine) == 0) {
    warning("No RDS files found for geography '", x, "'", call. = FALSE)
    return(invisible(NULL))
  }

  lCombined <- lapply(toCombine, readRDS)
  combined <- rbindlist(lCombined, use.names = TRUE)

  #clean
  setnames(combined, c('country_name', 'cause', 'sex', 'age_name', 'year', 'nm_mean', 'nm_lower', 'nm_upper', 'measure'))

  # measure labels
  combined[measure == 'DALYs', measure := 'dalys']
  combined[measure == 'Deaths', measure := 'death']
  combined[measure == 'YLDs', measure := 'yld']
  combined[measure == 'YLLs', measure := 'yll']

  # cause abbreviations
  combined[cause == 'Tracheal, bronchus and lung cancer', cause := 'LC']
  combined[cause == 'Lower respiratory infections', cause := 'ALRI']
  combined[cause == 'Ischemic heart disease', cause := 'IHD']
  combined[cause == 'All causes', cause := 'Total']
  combined[cause == 'Chronic obstructive pulmonary disease', cause := 'COPD']
  combined[cause == 'Cerebrovascular disease', cause := 'Stroke']

  # Age labels: a new `age` column is filled first, then age_name is relabelled.
  # Each `age` assignment must precede the matching age_name change because it
  # filters on the old age_name value. age_name stays 'All Ages' for the
  # all-age rows; only `age` receives 'AllAge'.
  combined[age_name == 'All Ages', age := 'AllAge']
  combined[age_name == 'Under 5', age := 'Under 5 years']
  combined[age_name == 'Under 5', age_name := 'Under 5 years']
  combined[age_name == '25+', age := '25 and over']
  combined[age_name == '25+', age_name := '25 and over']

  #save out
  saveRDS(combined, paste0('~/Dropbox/HAPIT3/data/gbd2013/', gsub("_", "", x), '.rds'))
}
# run combineR once per geography, discarding the return values
invisible(lapply(geographies, combineR))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment