Skip to content

Instantly share code, notes, and snippets.

@mfmakahiya

mfmakahiya/data_prep.r

Last active Oct 25, 2018
Embed
What would you like to do?
# Remove every (non-hidden) object from the environment the script runs in,
# so the run starts from an empty workspace.
# NOTE(review): when this file is source()d from an interactive session this
# also wipes the caller's objects; running in a fresh R session would make
# this line unnecessary - confirm before removing.
rm(list = ls())
library(memisc)
library(assertthat)
library(sqldf)
library(magrittr)
library(dplyr)
library(reshape2)
library(ggplot2)
library(oz)
library(scatterpie)
library(rgdal)
library(maptools)
# Helper functions
# Load one cause-of-death CSV from ./Data and reduce it to the six columns
# used by the rest of the script.
#
# Args:
#   csv_name: File name (relative to ./Data) of the CSV to read.
#
# Returns:
#   A data.frame with the columns country, sex, age, death_cause,
#   record_type and value, in that order.
#
# Raises:
#   An error listing the missing columns when the file does not contain
#   every required column.
data_prep <- function(csv_name) {
  required_cols <- c(
    "country.or.area",
    "sex",
    "age",
    "cause.of.death..who.",
    "record.type",
    "value"
  )
  import <- read.csv(
    file.path(".", "Data", csv_name),
    stringsAsFactors = FALSE
  )
  # Lower-case the header so the column lookup below is case-insensitive.
  names(import) <- tolower(names(import))
  # Validate before subsetting: selecting an absent column would otherwise
  # fail with the uninformative "undefined columns selected".
  missing_cols <- setdiff(required_cols, names(import))
  if (length(missing_cols) > 0) {
    stop(
      "Input file '", csv_name, "' is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
  import <- import[, required_cols, drop = FALSE]
  names(import) <- c(
    "country", "sex", "age", "death_cause", "record_type", "value"
  )
  import
}
###############
# Sum the death counts per country and cause of death.
#
# Args:
#   data: data.frame with the columns sex, record_type, country,
#     death_cause and value.
#   sex_filter: Value of `sex` to EXCLUDE; rows whose sex equals it are
#     dropped (pass "" to drop only rows with an empty sex).
#   record_filter: Value of `record_type` to keep.
#
# Returns:
#   A plain data.frame with one row per country/death_cause pair and the
#   columns country, death_cause and count (the sum of `value`).
perform_groupby <- function(data, sex_filter, record_filter) {
  kept_rows <- filter(data, sex != sex_filter, record_type == record_filter)
  by_country_cause <- group_by(kept_rows, country, death_cause)
  totals <- summarise(by_country_cause, count = sum(value))
  as.data.frame(ungroup(totals))
}
# Reshape long country/cause counts into one row per country.
#
# Args:
#   data: data.frame with the columns country, death_cause and count.
#
# Returns:
#   A data.frame with a `country` column, one column per distinct
#   death_cause holding the summed count, and a `Total` column with the
#   row-wise sum over all cause columns.
pivot_by_country <- function(data) {
  wide <- dcast(
    data,
    country ~ death_cause,
    fun.aggregate = sum,
    value.var = "count"
  )
  # drop = FALSE keeps a data.frame even when there is a single cause column;
  # without it the selection collapses to a vector and rowSums() fails.
  wide$Total <- rowSums(wide[, -1, drop = FALSE])
  wide
}
# Main Logic
death_data <- data_prep("death2014.csv")
# The empty sex filter only drops rows whose sex is "".
# Alternative record type: "Data tabulated by year of registration".
grouped_data <- perform_groupby(
  death_data,
  "",
  "Data tabulated by year of occurrence"
)
# Strip the ", ICD10" suffix from the cause labels. fixed = TRUE matches the
# text literally, so no regex metacharacters are involved.
grouped_data$death_cause <- gsub(
  ", ICD10", "", grouped_data$death_cause,
  fixed = TRUE
)
grouped_data$death_cause <- gsub(
  "Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified",
  "Abnormal clinical and lab findings",
  grouped_data$death_cause
)
# Top 10 leading cause overall
overall_data <- grouped_data %>%
  group_by(death_cause) %>%
  summarise(totalcount = sum(count)) %>%
  ungroup() %>%
  as.data.frame()
overall_data <- overall_data[order(-overall_data$totalcount), ]
# "All causes" (presumably the grand-total row) is excluded by name rather
# than by assuming it sorts into row 1; head() avoids NA entries when fewer
# causes than requested exist.
# NOTE(review): nine causes are kept although the variable name says ten
# (nine causes plus "Others" gives ten categories) - confirm intent.
n_top_causes <- 9L
top_ten_causes <- head(
  setdiff(overall_data$death_cause, "All causes"),
  n_top_causes
)
# Keep the label of the leading causes and lump everything else (except the
# "All causes" rows, which are dropped) into "Others".
grouped_data$death_cause2 <- grouped_data$death_cause
grouped_data1 <- grouped_data %>%
  filter(death_cause %in% top_ten_causes)
grouped_data2 <- grouped_data %>%
  filter(!death_cause %in% c(top_ten_causes, "All causes"))
# rep() keeps the assignment valid when no "other" rows exist.
grouped_data2$death_cause2 <- rep("Others", nrow(grouped_data2))
keep_cols <- c("country", "count", "death_cause2")
grouped_data <- rbind(grouped_data1[, keep_cols], grouped_data2[, keep_cols])
names(grouped_data) <- c("country", "count", "death_cause")
pivotted_data <- pivot_by_country(grouped_data)
# Getting the coordinates of each country
country_lookup <- read.csv("./Data/countries.csv", stringsAsFactors = FALSE)
# The first column is renamed; presumably it holds the country code - verify
# against countries.csv.
names(country_lookup)[1] <- "country_code"
# Combining data
# Left join: countries without a match in the lookup are kept with NA in the
# lookup columns.
final_data <- merge(
  x = pivotted_data,
  y = country_lookup,
  by.x = "country",
  by.y = "name",
  all.x = TRUE
)
# Data cleaning for plotting
final_data <- unique(final_data)
# Log-scaled total of each country relative to the largest total (the
# country with the maximum Total gets 1).
multiplier <- log10(final_data$Total) / log10(max(final_data$Total))
final_data <- cbind(final_data, multiplier)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment