Skip to content

Instantly share code, notes, and snippets.

@RHDZMOTA
Last active March 27, 2016 23:03
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save RHDZMOTA/e77848e592c6d592e8ff to your computer and use it in GitHub Desktop.
Save RHDZMOTA/e77848e592c6d592e8ff to your computer and use it in GitHub Desktop.
Download and clean data from the World Health Organization's Global Health Observatory: consumption of pure alcohol by type of beverage. The dataset contains 2010 data for nearly 50 countries.
# Radar Plots
# Example: consumption of pure alcohol by type of beverage
library(tibble)
# Download data -----------------------------------------------------------
# WHO GHO "athena" text export; the filter in the query string requests every
# country in region EUR and every alcohol type.
url <- "http://apps.who.int/gho/athena/data/data-text.csv?target=GHO/SA_0000001398&profile=text&filter=COUNTRY:*;REGION:EUR;ALCOHOLTYPE:*"

# Save the export into the working directory, then parse it as a
# comma-separated table with a header row.
file_name <- "consum_type"
download.file(url = url, destfile = file_name)

# NOTE(review): `na.strings = FALSE` is coerced to the string "FALSE", so the
# default "NA" marker is not treated as missing -- confirm this is intended.
dataset <- read.table(
  file = file_name,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE,
  na.strings = FALSE
)
# Clean data --------------------------------------------------------------
library(tibble)

# Work on a tibble from here on. as_tibble() is the supported replacement for
# the deprecated as_data_frame().
dataset <- as_tibble(dataset)

# Keep only the columns used by the rest of the script.
dataset <- dataset[, c("Country", "Beverage.Types", "Numeric")]

# Row-position index used below to locate the rows that need fixing.
# seq_len() yields an empty index for an empty table, where 1:nrow() would
# yield c(1, 0).
index <- seq_len(nrow(dataset))
dataset$Index <- index
# Unique countries, problem: some names are too long
# Solve problem: replace each long name with its short form (vectorised).
countries_unique <- unique(dataset$Country)
countries_problem <- c("Russian Federation",
                       "United Kingdom of Great Britain and Northern Ireland",
                       "The former Yugoslav republic of Macedonia",
                       "Republic of Moldova")
countries_solution <- c("Russia", "UK", "Macedonia", "Moldova")

# Row positions of every problematic country, kept as a named list so that
# countries with different (or zero) row counts are handled uniformly.
# A matrix is only possible when all countries have the same number of rows.
countries_problem_index <- lapply(
  setNames(countries_problem, countries_problem),
  function(x) which(dataset$Country == x)
)

# For each row, find which problematic name (if any) it carries and swap in
# the short name at the same position in `countries_solution`. Rows whose
# country is not listed (match() gives NA) are left untouched.
rename_pos <- match(dataset$Country, countries_problem)
needs_rename <- !is.na(rename_pos)
dataset$Country[needs_rename] <- countries_solution[rename_pos[needs_rename]]
# Other problem: the label of the "Other" beverage category is too long.
# Collect the Index values of the affected rows, then overwrite their label
# with the short form in one vectorised assignment.
is_other <- dataset$Beverage.Types == "Other alcoholic beverages"
other_inconvinient <- dataset$Index[is_other]
dataset$Beverage.Types[other_inconvinient] <- "Other"
# Add rows to the data frame that contain the average consumption:
# one "Average Country" row per beverage type.
beverage_unique <- unique(dataset$Beverage.Types)

# Mean of `Numeric` per beverage type, ignoring missing values. vapply()
# guarantees a numeric vector (named by beverage type) regardless of how many
# rows each type has.
beverage_average <- vapply(
  beverage_unique,
  function(x) {
    mean(dataset$Numeric[dataset$Beverage.Types == x], na.rm = TRUE)
  },
  numeric(1)
)

# tibble() replaces the deprecated data_frame().
# NOTE(review): Index is a character value here and is shared by all average
# rows, so rbind() below coerces dataset$Index to character -- confirm that
# this is intended.
df_aux <- tibble(
  Country = rep("Average Country", length(beverage_unique)),
  Beverage.Types = beverage_unique,
  Numeric = beverage_average,
  Index = rep(as.character(nrow(dataset) + 1), length(beverage_unique))
)

# Append the average rows and sort both tables by beverage type.
dataset <- rbind(dataset, df_aux)
dataset <- dataset[order(dataset$Beverage.Types), ]
df_aux <- df_aux[order(df_aux$Beverage.Types), ]
avrg_df <- df_aux
# Identify position (index) of average country
# for future use in plot
# Sorted list of distinct country names (any NA is kept at the end).
countries_unique <- sort(unique(dataset$Country), na.last = TRUE)
# seq_along() is safe when there are no countries, where 1:length() is not.
countries_unique_index <- seq_along(countries_unique)

# Lookup table: country name -> position in the sorted list.
# tibble() replaces the deprecated data_frame().
df_aux <- tibble(countries = countries_unique,
                 index = countries_unique_index)

avr_country <- "Average Country"
# Position of the average country; integer(0) if it is not present.
avr_index <- df_aux$index[which(df_aux$countries == avr_country)]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment