Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save johnjosephhorton/5621448 to your computer and use it in GitHub Desktop.
Save johnjosephhorton/5621448 to your computer and use it in GitHub Desktop.
Get country-specific wages from Wikipedia and plot them with ggplot2
############################################################################
# AUTHOR: John Horton
# PURPOSE: Extract by-country minimum hourly wages from the Wikipedia page
# LAST MODIFIED: May 22, 2013
############################################################################
library(XML)
library(ggplot2)
library(scales)
# Download and parse the Wikipedia page that lists minimum wages by country.
# Wikipedia redirects plain HTTP to HTTPS, and the XML package's built-in
# fetcher does not speak HTTPS, so the page is downloaded with base R and the
# text is handed to the parser instead of the URL.
url <- "https://en.wikipedia.org/wiki/List_of_minimum_wages_by_country"
page.lines <- readLines(url, warn = FALSE, encoding = "UTF-8")
page.doc <- htmlParse(
  paste(page.lines, collapse = "\n"),
  asText = TRUE,
  encoding = "UTF-8"
)
raw <- readHTMLTable(doc = page.doc)

# The script uses the second table on the page. NOTE(review): the index is
# tied to the page layout at the time of writing -- confirm it still holds.
if (length(raw) < 2 || is.null(raw[[2]])) {
  stop("Expected at least two parsed tables on the page: ", url,
       call. = FALSE)
}
df.raw <- raw[[2]]

# Short, syntactic column names, in the table's left-to-right order.
wage.columns <- c("country", "minimum_wage", "annual", "annual_ppp",
                  "workweek", "hourly_usd", "hourly_intl", "perc_2011_gdp",
                  "effective")
# Fail with a readable message if the table no longer has this layout.
if (ncol(df.raw) != length(wage.columns)) {
  stop("Expected ", length(wage.columns), " columns in the wage table but ",
       "found ", ncol(df.raw), "; the page layout may have changed.",
       call. = FALSE)
}
colnames(df.raw) <- wage.columns
Clean.Wage <- Vectorize(function(x) {
  # Convert one raw hourly-wage cell from the scraped table into a number.
  #
  # Args:
  #   x: a single cell value (character, or a factor level).
  # Returns:
  #   The wage as a numeric value; NA (with a warning from as.numeric) when
  #   the remaining text is not a number.

  # Some cells carry a literal "US$" marker. "$" is a regex end-of-string
  # anchor, so the pattern has to be matched literally (fixed = TRUE);
  # as a regex it never matches the literal "US$" text.
  x1 <- gsub("US$", "", as.character(x), fixed = TRUE)
  # Keep everything from character 20 onwards: the leading 19 characters are
  # span metadata that the XML parser picks up along with the cell text.
  x2 <- substring(x1, first = 20)
  as.numeric(x2)
})
Clean.Country <- Vectorize(function(x) {
  # Drop the first character of a scraped country name and return the rest.
  # The original note says the scraped names carry a stray '_' character;
  # only the leading character is removed here.
  #
  # Args:
  #   x: a single country name as scraped.
  # Returns:
  #   The name without its first character.
  without.first <- substring(text = x, first = 2L)
  without.first
})
# Two-column table used by every plot below: the cleaned country name and
# the cleaned hourly minimum wage in USD.
df <- data.frame(
  country = Clean.Country(df.raw$country),
  min.wage = Clean.Wage(df.raw$hourly_usd)
)
Make.MW.plot <- function(df, label) {
  # Build a per-country dot plot of hourly minimum wages.
  #
  # There are a large number of countries, so the intended use is to pass a
  # subset of the data and plot each subset individually.
  #
  # Args:
  #   df:    data frame with columns `country` and `min.wage`.
  #   label: text appended to the plot title on a second line, describing
  #          the subset being plotted.
  # Returns:
  #   A ggplot object; nothing is drawn until it is printed.
  plot.title <- paste0("Hourly minimum wages by country \n", label)
  wage.axis.label <- paste0(
    "Hourly Wage (USD) \n \n Source: Wikipedia, May, 21, 2013\n",
    "en.wikipedia.org/wiki/List_of_minimum_wages_by_country"
  )
  # ggplot() + geom_point() is the explicit form of the deprecated
  # qplot(country, min.wage, data = df) call.
  ggplot(df, aes(x = country, y = min.wage)) +
    geom_point() +
    ylab(wage.axis.label) +
    xlab("") +
    scale_y_continuous(labels = dollar) +
    # Flip the axes so country names are drawn along the vertical axis.
    coord_flip() +
    theme_bw() +
    # Anchor the wage axis at zero.
    expand_limits(y = 0) +
    ggtitle(plot.title)
}
# Split countries into quartiles
## > summary(df$min.wage)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.030 0.490 1.180 2.308 2.460 16.450
# One plot per wage band; which() drops rows whose wage is NA, exactly as
# subset() would.
g.75 <- Make.MW.plot(
  df[which(df$min.wage > 2.5), , drop = FALSE],
  label = "> 2.50"
)
g.50 <- Make.MW.plot(
  df[which(df$min.wage > 1.25 & df$min.wage <= 2.50), , drop = FALSE],
  label = "1.25 < Minimum Wage <= 2.50"
)
g.25 <- Make.MW.plot(
  df[which(df$min.wage > 0.50 & df$min.wage <= 1.25), , drop = FALSE],
  label = "0.50 < Minimum Wage <= 1.25"
)
g.0 <- Make.MW.plot(
  df[which(df$min.wage <= 0.50), , drop = FALSE],
  label = "Minimum Wage <= 0.50"
)
Write.Image <- function(filename, g, width = 500, height = 500, format = "png") {
  # Write a plot to an image file.
  #
  # Args:
  #   filename: path of the file to write; its directory is created when it
  #             does not exist yet.
  #   g:        object to draw; it is rendered by calling print() on it.
  #   width, height: passed positionally to the device function after the
  #             file name.
  #   format:   graphics device function, or its name, called to open the
  #             file. The default is "png".
  # Returns:
  #   The value of dev.off() once the device has been closed.
  out.dir <- dirname(filename)
  if (!dir.exists(out.dir)) {
    # The device cannot open a file inside a missing directory.
    dir.create(out.dir, recursive = TRUE, showWarnings = FALSE)
  }
  do.call(format, list(filename, width, height))
  device.id <- dev.cur()
  closed <- FALSE
  # Close the device even when drawing fails, so a failed plot does not
  # leave an open device (and a locked, half-written file) behind.
  on.exit(if (!closed) dev.off(device.id), add = TRUE)
  print(g)
  closed <- TRUE
  dev.off(device.id)
}
# Save each wage-band plot as a PNG using the default size.
Write.Image(filename = "./minimum_wage_plots/quartile_75.png", g = g.75)
Write.Image(filename = "./minimum_wage_plots/quartile_50.png", g = g.50)
Write.Image(filename = "./minimum_wage_plots/quartile_25.png", g = g.25)
Write.Image(filename = "./minimum_wage_plots/quartile_0.png", g = g.0)
# Bonus plot - kernel density estimate of distribution of hourly minimums
distro.axis.label <- paste0(
  "Hourly minimum wages in USD, log scale \n Source: Wikipedia, May, 21, 2013\n",
  "en.wikipedia.org/wiki/List_of_minimum_wages_by_country"
)
# ggplot() + geom_density() is the explicit form of the deprecated
# qplot(min.wage, geom = "density", data = df) call.
g.distro <- ggplot(df, aes(x = min.wage)) +
  geom_density() +
  # Log scale on the wage axis, with dollar-formatted tick labels.
  scale_x_log10(labels = dollar) +
  xlab(distro.axis.label) +
  theme_bw()
Write.Image("./minimum_wage_plots/distr.png", g.distro)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment