Last active
August 29, 2015 14:02
-
-
Save gabrielflorit/ac7ef3ad8ce7ce61e77f to your computer and use it in GitHub Desktop.
Gender parity in HarvardX and MITx courses (2012-13)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gender parity in HarvardX and MITx courses (2012-13)
#
# Plots, for every country in the de-identified HarvardX/MITx person-course
# data set, the ratio of female to male registrations.
#
# data: http://thedata.harvard.edu/dvn/dv/mxhx
# notes: country is determined by mix of administrative (computed from IP
#   address) and user-provided (filled in from student address if available
#   when IP was indeterminate); during de-identification, some country names
#   were replaced with the corresponding continent/region name.

# lattice is not used by this script; kept so anything else relying on it
# being attached keeps working.
library(lattice)
library(ggplot2)

# Read the file by path instead of calling setwd(), so running the script
# does not change the caller's working directory.
data_dir <- "~/Desktop/edX"
courses <- read.csv(file.path(data_dir, "HMXPC13_DI_v2_5-14-14.csv"))

# Cross-tabulate country x gender in one pass. Restricting the factor levels
# to "f" and "m" matches the codes exactly (no substring matching), turns any
# other value into NA (which table() drops), and guarantees that both columns
# exist even if one gender is absent from the data.
gender <- factor(courses$gender, levels = c("f", "m"))
counts <- table(courses$final_cc_cname_DI, gender)

total <- data.frame(
  country = rownames(counts),
  female = as.vector(counts[, "f"]),
  male = as.vector(counts[, "m"]),
  stringsAsFactors = FALSE
)

# Drop countries without male registrations to avoid dividing by zero.
total <- total[total$male > 0, ]
total$ratio <- total$female / total$male

# Order countries by ratio so the points form a monotone curve.
total$country <- reorder(total$country, total$ratio)

# plot
parity_plot <- ggplot(total, aes(x = country, y = ratio)) +
  geom_point() +
  # Labels are right-aligned against their point, so they extend towards
  # lower ratios ...
  geom_text(aes(label = country), hjust = 1) +
  # ... hence the ratio axis is stretched slightly below zero to leave room
  # for the labels of the lowest-ratio countries.
  expand_limits(y = -0.05) +
  # The country names are drawn next to the points; hide the redundant axis.
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank()
  ) +
  ylab("gender parity") +
  coord_flip() +
  ggtitle("Gender parity in HarvardX and MITx courses (2012-13)")

# Print explicitly so the plot is also drawn when the script is source()d.
print(parity_plot)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment