Created
March 18, 2012 19:15
-
-
Save gauden/2079959 to your computer and use it in GitHub Desktop.
Use data from Gapminder in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------------------
# PREPARATION
# First download two Excel files from Gapminder.
# This example uses the WHO alcohol consumption data from:
# http://spreadsheets.google.com/pub?key=0AgogXXPMARyldGJqTDRfNHBWODJMRWlZaVhNclhNZXc&output=xls
# and the World Bank GDP data from:
# http://spreadsheets.google.com/pub?key=0ArfEDsV3bBwCdHh3d1FPOVg1WXM3V2huRWc2cjM3TkE&output=xls
#
# Note that Gapminder uses '..' in cells to mark missing values. The imports
# below declare '..' as a missing-value marker via `na.strings`, so there is
# no need to search-and-replace in Excel first (files already edited to use
# 'NA' are still read correctly).
# ------------------------------------------------------------------------------
# IMPORT THE DATA
library("gdata")

# Cell contents that should be read as NA: R's own marker, Gapminder's '..',
# and Excel's division error (the latter is part of read.xls's default).
missing_markers <- c("NA", "..", "#DIV/0!")

alcohol <- read.xls(xls = "path/to/alcoholfile.xls", # insert correct path
                    na.strings = missing_markers)
gdp <- read.xls(xls = "path/to/gdpfile.xls", # insert correct path
                na.strings = missing_markers)

# delete extra columns from the gdp dataframe (we only want to use 2005);
# 'X' is the unnamed first column holding the country name
gdp <- gdp[, c("X", "X2005")]

# merge the dataframes, joining by the country name
m <- merge(gdp, alcohol, by = "X")

# relabel the columns
# NOTE(review): merge() only produces the '.x'/'.y' suffixes when both inputs
# contain an 'X2005' column -- confirm the alcohol sheet has one.
names(m)[names(m) == "X"] <- "Country"
names(m)[names(m) == "X2005.x"] <- "GDP"
names(m)[names(m) == "X2005.y"] <- "alcohol"
# ------------------------------------------------------------------------------
# MANIPULATE AND VISUALIZE THE DATA
# peep at the top of the new dataframe
head(m)
summary(m) # summarize every column of the merged dataframe
plot(m) # base-graphics overview plot of the dataframe's columns

# plot the scatter of alcohol against the log(GDP) plus add smoothing
library("ggplot2")
p <- ggplot(m, aes(x = log(GDP), y = alcohol)) +
  geom_point() +
  geom_smooth()

# print() explicitly so the figure is also drawn when the script is run via
# source(), where top-level values are not auto-printed
print(p)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment