Skip to content

Instantly share code, notes, and snippets.

@ko-lem
Last active August 29, 2015 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ko-lem/9f37119dedda30fa9493 to your computer and use it in GitHub Desktop.
Save ko-lem/9f37119dedda30fa9493 to your computer and use it in GitHub Desktop.
Exploring DepEd Enrollment Data
require(ggplot2)
require(reshape)
require(scales)
# Work from the project directory so the relative data path below resolves.
setwd("~/Dropbox/Projects/deped-enrollment")

# Dataset to analyse: "elementary" or "secondary".
which.data <- "elementary"

# Read the enrollee counts CSV that belongs to the selected dataset.
enrollees.csv <- paste0("data/num-enrollees-", which.data, ".csv")
num.enrollees <- read.csv(enrollees.csv)
# Enrollees growth through the years by Gender
# Sum Count over every Year/Gender pair, then draw one coloured series per
# gender with hollow-circle markers joined by lines.
series.sum.by.gender <- aggregate(
  Count ~ Year + Gender,
  data = num.enrollees,
  FUN = sum
)
gender.plot <- ggplot(
  series.sum.by.gender,
  aes(x = Year, y = Count, colour = Gender)
)
gender.plot +
  geom_point(shape = 1) +
  geom_line() +
  labs(y = "Number of Enrollees") +
  scale_y_continuous(labels = comma)
# Gender differences by Region
# Sum Count per Year/Gender/Region, then spread Gender into columns so each
# Year/Region row carries its Female and Male totals side by side.
counts <- aggregate(Count ~ Year + Gender + Region, data = num.enrollees, FUN = sum)
gender.diffs <- cast(counts, Year + Region ~ Gender, value = "Count")
gender.diffs$TotalStudents <- gender.diffs$Female + gender.diffs$Male
gender.diffs$FemaleRatio <- gender.diffs$Female / gender.diffs$TotalStudents

# Female share per region over the years, with lines joining each region.
# FemaleRatio is a 0-1 proportion, so the axis is labelled with percent
# formatting to agree with its "Percentage" title. The parity reference at 0.5
# is passed as a constant parameter instead of being mapped through aes(),
# since it does not vary with the data.
ggplot(gender.diffs, aes(x = Year, y = FemaleRatio, colour = Region)) +
  geom_point(shape = 1) +
  geom_line() +
  ylab("Percentage of Female Enrollees") +
  scale_y_continuous(labels = percent) +
  geom_hline(yintercept = 0.5, colour = "#990000", linetype = "dashed")

# Same plot without the connecting lines.
ggplot(gender.diffs, aes(x = Year, y = FemaleRatio, colour = Region)) +
  geom_point(shape = 1) +
  ylab("Percentage of Female Enrollees") +
  scale_y_continuous(labels = percent) +
  geom_hline(yintercept = 0.5, colour = "#990000", linetype = "dashed")

# All Year rows for the ARMM region.
gender.diffs[gender.diffs$Region == "ARMM", ]

# Regions that had a female share below one half in at least one year, and how
# many such regions there are. which() drops rows whose ratio is NA, which
# plain logical subsetting would otherwise turn into all-NA rows.
regions.below.half <- unique(
  gender.diffs$Region[which(gender.diffs$FemaleRatio < 0.5)]
)
regions.below.half
length(regions.below.half)

# The single Year/Region row with the lowest female share.
gender.diffs[which.min(gender.diffs$FemaleRatio), ]
# Enrollees growth through the years by Region
series.sum.by.region <- aggregate(Count ~ Year + Region, data = num.enrollees, FUN = sum)

# Sorted region names used to index the colour vectors below. Going through
# factor() works whether read.csv() produced a factor or a character column
# (the default since R 4.0.0); levels() on a character column returns NULL,
# which would leave the palette empty.
region.levels <- levels(factor(num.enrollees$Region))

# Regions whose 2005 total lies outside [lower, upper] are drawn in colour;
# the limits depend on the dataset. Fail early on an unsupported dataset
# instead of hitting an undefined variable further down.
if (which.data == "secondary") {
  color.upper.limit <- 400000
  color.lower.limit <- 100000
  other.of.interest <- integer(0)
} else if (which.data == "elementary") {
  color.upper.limit <- 1000000
  color.lower.limit <- 400000
  # ARMM is always highlighted for the elementary data, whatever its 2005 total.
  armm.index <- match("ARMM", region.levels)
  other.of.interest <- c(armm.index)
} else {
  stop("which.data must be 'elementary' or 'secondary', got: ", which.data)
}

in.base.year <- series.sum.by.region$Year == 2005
outside.limits <- series.sum.by.region$Count > color.upper.limit |
  series.sum.by.region$Count < color.lower.limit
regions.to.color <- series.sum.by.region$Region[in.base.year & outside.limits]

# Every region starts grey; highlighted regions get their rainbow colour.
region.colors <- rainbow(length(region.levels))
line.colors <- rep("#7f7f7f", length(region.levels))
region.indexes.to.color <- c(
  match(regions.to.color, region.levels),
  other.of.interest
)
# An unmatched name (e.g. no ARMM rows in the data) yields NA, which is not
# allowed in a subscripted assignment, so drop it.
region.indexes.to.color <- region.indexes.to.color[!is.na(region.indexes.to.color)]
line.colors[region.indexes.to.color] <- region.colors[region.indexes.to.color]
# Name the colours so the manual scale matches them to regions by name rather
# than by position.
names(line.colors) <- region.levels

ggplot(series.sum.by.region, aes(x = Year, y = Count, colour = Region)) +
  geom_point(shape = 1) +
  geom_line() +
  ylab("Number of Enrollees") +
  scale_y_continuous(labels = comma) +
  scale_color_manual(values = line.colors)

# Total number of enrollees across all rows for 2012.
sum(num.enrollees$Count[num.enrollees$Year == 2012])
# Work from the project directory so the relative data paths resolve.
setwd("~/Dropbox/Projects/deped-enrollment")

# Dataset to convert; this word is embedded in the raw file's name.
which.data <- "secondary"

# The raw file name is "<head>-<which.data>-<tail>".
input.filename.head <- "data/deped-total-school-enrollment-for-public"
input.filename.tail <- "schools-2005-to-2012.csv"
input.filename <- paste(
  c(input.filename.head, which.data, input.filename.tail),
  collapse = "-"
)

# Skip the first 7 lines of the file before reading the table.
data <- read.csv(input.filename, skip = 7)
# Give the third column the name "Gender".
names(data)[3] <- "Gender"

# Rows come in runs of three. Copy the Region and Division found on the first
# row of each run onto the second and third rows, which are blank for those
# columns.
first.rows <- seq(from = 1, to = nrow(data), by = 3)
for (column in c("Region", "Division")) {
  group.values <- data[[column]][first.rows]
  for (offset in 1:2) {
    data[[column]][seq(from = 1 + offset, to = nrow(data), by = 3)] <- group.values
  }
}

# Drop the subtotal, grand-total and per-group total rows in one pass.
is.total.row <- grepl("Subtotal", data$Region) |
  grepl("Grand total", data$Region) |
  grepl("Total", data$Gender)
data <- data[!is.total.row, ]

# Renumber the rows 1..n now that some have been removed.
row.names(data) <- seq_len(nrow(data))
## Each measurement of enrollees for a given year should be in its own row:
## the wide "Enrollment_<year>" columns become long Year/Count rows.
## Built with whole-vector operations instead of assigning one data-frame cell
## at a time, which also keeps it working when `data` has zero rows.
years <- 2005:2012
n.years <- length(years)
N <- nrow(data) * n.years

count.columns <- paste("Enrollment", years, sep = "_")
missing.columns <- setdiff(count.columns, names(data))
if (length(missing.columns) > 0) {
  stop("Missing enrollment column(s): ", paste(missing.columns, collapse = ", "))
}

# Repeat a label column so each input row contributes one entry per year, with
# a row's years kept adjacent (row 1 for all years, then row 2, ...).
repeat.per.year <- function(values) {
  factor(rep(as.character(values), each = n.years))
}

# One row per input row, one column per year. Transposing and flattening
# column by column gives the counts in the same row-then-year order as the
# labels above.
# NOTE(review): assumes the Enrollment_<year> columns were parsed as numbers;
# a non-numeric column would make as.matrix() return text - confirm.
counts.wide <- as.matrix(data[count.columns])

num.enrollees <- data.frame(
  Region = repeat.per.year(data$Region),
  Division = repeat.per.year(data$Division),
  Gender = repeat.per.year(data$Gender),
  Year = factor(rep(years, times = nrow(data))),
  Count = as.vector(t(counts.wide))
)
stopifnot(nrow(num.enrollees) == N)

# Write the long table where the analysis code expects to find it.
output.filename <- paste0("data/num-enrollees-", which.data, ".csv")
write.csv(num.enrollees, file = output.filename, row.names = FALSE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment