## ko-lem/explore.R

## explore.R
require(ggplot2)
require(reshape)
require(scales)

# Work from the project directory so the relative data paths resolve.
setwd("~/Dropbox/Projects/deped-enrollment")

# Dataset to explore: 'elementary' or 'secondary'.
which.data <- 'elementary'

# Load the per-year enrollment table for the selected dataset.
enrollees.csv <- paste0('data/num-enrollees-', which.data, '.csv')
num.enrollees <- read.csv(enrollees.csv)

# Enrollment growth over the years, split by gender.
# First total the counts for every Year/Gender pair ...
series.sum.by.gender <- aggregate(
  Count ~ Year + Gender,
  data = num.enrollees,
  FUN = sum
)

# ... then draw one line (with open-circle markers) per gender.
ggplot(data = series.sum.by.gender,
       mapping = aes(x = Year, y = Count, colour = Gender)) +
  geom_point(shape = 1) +
  geom_line() +
  labs(y = "Number of Enrollees") +
  scale_y_continuous(labels = comma)

# Gender differences by Region
# Total the counts per Year/Gender/Region, then spread Gender into columns so
# each row carries the Female and Male totals of one Year/Region pair.
counts <- aggregate(Count ~ Year + Gender + Region, data=num.enrollees, FUN=sum)
gender.diffs <- cast(counts, Year + Region ~ Gender, value="Count")
gender.diffs$TotalStudents <- gender.diffs$Female + gender.diffs$Male
gender.diffs$FemaleRatio   <- gender.diffs$Female / gender.diffs$TotalStudents

# Pieces shared by both plots below.
# FemaleRatio is a fraction (Female / Total), so the axis is formatted with
# `percent` to agree with its "Percentage" label. The parity line sits at a
# constant, so yintercept is passed as a parameter rather than through aes().
ratio.points <- ggplot(gender.diffs, aes(x=Year, y=FemaleRatio, colour=Region)) +
  geom_point(shape=1)
ratio.decorations <- list(
  ylab("Percentage of Female Enrollees"),
  scale_y_continuous(labels=percent),
  geom_hline(yintercept=0.5, colour="#990000", linetype="dashed")
)

# Female share per region over time: with connecting lines, then points only.
ratio.points + geom_line() + ratio.decorations
ratio.points + ratio.decorations

# Closer look at ARMM.
gender.diffs[gender.diffs$Region == "ARMM", ]

# Regions where girls were the minority in at least one year, and how many
# such regions there are. which() drops NA ratios (e.g. a Year/Region pair
# missing one gender) so they are not counted as an extra "NA" region.
female.minority.regions <-
  unique(gender.diffs$Region[which(gender.diffs$FemaleRatio < 0.5)])
female.minority.regions
length(female.minority.regions)

# The Year/Region pair with the lowest female share.
gender.diffs[which.min(gender.diffs$FemaleRatio), ]

# Enrollees growth through the years by Region
series.sum.by.region <- aggregate(Count ~ Year + Region, data=num.enrollees, FUN=sum)

# Region names in a fixed order. levels() alone returns NULL when read.csv()
# yields a character column (the default since R 4.0), which would leave the
# colour vectors below empty; factor() handles both column types.
region.levels <- levels(factor(num.enrollees$Region))

# A region is highlighted when its 2005 total lies outside
# [color.lower.limit, color.upper.limit]; the band depends on the dataset.
if (which.data == 'secondary') {
  color.upper.limit <- 400000
  color.lower.limit <- 100000
  other.of.interest <- integer(0)
} else if (which.data == 'elementary') {
  color.upper.limit <- 1000000
  color.lower.limit <- 400000

  # ARMM is always highlighted for the elementary data.
  armm.index <- match("ARMM", region.levels)
  other.of.interest <- c(armm.index)
} else {
  # Without this branch the limits would be undefined and the failure would
  # surface later as an unrelated "object not found" error.
  stop("which.data must be 'elementary' or 'secondary', not '", which.data, "'",
       call.=FALSE)
}

in.2005 <- series.sum.by.region$Year == 2005
outside.band <- series.sum.by.region$Count > color.upper.limit |
                series.sum.by.region$Count < color.lower.limit
regions.to.color <- series.sum.by.region$Region[in.2005 & outside.band]

# Start with every region grey, then give the highlighted ones their own
# rainbow colour.
region.colors <- rainbow(length(region.levels))
line.colors <- rep("#7f7f7f", length(region.levels))
region.indexes.to.color <- match(regions.to.color, region.levels)
region.indexes.to.color <- c(region.indexes.to.color, other.of.interest)
# match() gives NA for a region that is not present, and NA subscripts are
# not allowed in the assignment below.
region.indexes.to.color <- region.indexes.to.color[!is.na(region.indexes.to.color)]
line.colors[region.indexes.to.color] <- region.colors[region.indexes.to.color]
# Name the colours so the manual scale pairs them with regions by name
# instead of relying on positional order.
names(line.colors) <- region.levels

ggplot(series.sum.by.region, aes(x=Year, y=Count, colour=Region)) +
  geom_point(shape=1) +
  geom_line() +
  ylab("Number of Enrollees") +
  scale_y_continuous(labels=comma) +
  scale_color_manual(values=line.colors)


# Total number of enrollees recorded for 2012.
is.2012 <- num.enrollees$Year == 2012
sum(num.enrollees$Count[is.2012])

## process.R
# Work from the project directory so the relative data paths resolve.
setwd("~/Dropbox/Projects/deped-enrollment")

# Dataset to process; it is spliced into the file names built below.
which.data <- 'secondary'

# The raw file is named "<head>-<which.data>-<tail>".
input.filename.head <- 'data/deped-total-school-enrollment-for-public'
input.filename.tail <- 'schools-2005-to-2012.csv'
input.filename <- paste(
  c(input.filename.head, which.data, input.filename.tail),
  collapse='-'
)

# The first 7 lines of the file are skipped before the CSV is parsed.
data <- read.csv(input.filename, skip=7)

# Give the third column a usable name.
colnames(data)[3] <- "Gender"

# Rows are handled in groups of three: Region and Division are read from the
# first row of each group and copied onto the second and third rows, where
# the raw file leaves them blank.
group.starts <- seq(from=1, to=nrow(data), by=3)
regions   <- data$Region[group.starts]
divisions <- data$Division[group.starts]
for (offset in 2:3) {
  blank.rows <- seq(from=offset, to=nrow(data), by=3)
  data$Region[blank.rows]   <- regions
  data$Division[blank.rows] <- divisions
}

# Drop every aggregate row: subtotals and the grand total (flagged in Region)
# and the "Total" line (flagged in Gender).
is.total.row <- grepl('Subtotal', data$Region) |
                grepl('Grand total', data$Region) |
                grepl('Total', data$Gender)
data <- data[!is.total.row, ]

# Renumber the rows 1..n now that some have been removed.
rownames(data) <- seq_len(nrow(data))

## Reshape from wide to long: each yearly measurement of enrollees gets its
## own row. (reshape's melt could do this as well.)
years <- 2005:2012
year.columns <- paste("Enrollment", years, sep="_")

# Built in one vectorised step instead of assigning cell by cell: the
# element-wise loop re-copied the data frame on every assignment and, being
# driven by 1:nrow(data), still ran (for i = 1 and i = 0) on an empty table.
num.enrollees <- data.frame(
  # Every source row is repeated once per year. as.character() keeps the
  # resulting factors limited to the values that are actually present.
  Region   = as.factor(rep(as.character(data$Region),   each=length(years))),
  Division = as.factor(rep(as.character(data$Division), each=length(years))),
  Gender   = as.factor(rep(as.character(data$Gender),   each=length(years))),
  # The years cycle fastest: 2005..2012 for row 1, then again for row 2, ...
  Year     = as.factor(rep(years, times=nrow(data))),
  # t() turns each source row into a column, so flattening walks through the
  # yearly counts of row 1, then row 2, ... in step with Year above.
  Count    = as.vector(t(as.matrix(data[, year.columns, drop=FALSE])))
)

# Write the reshaped table to a CSV named after the dataset, without the
# row-name column.
output.filename <- sprintf('data/num-enrollees-%s.csv', which.data)
write.csv(num.enrollees, file=output.filename, row.names=FALSE)
	require(ggplot2)
	require(reshape)
	require(scales)

	# Work from the project directory so the relative data paths resolve.
	setwd("~/Dropbox/Projects/deped-enrollment")

	# Dataset to explore: 'elementary' or 'secondary'.
	which.data <- 'elementary'

	# Load the per-year enrollment table for the selected dataset.
	enrollees.csv <- paste0('data/num-enrollees-', which.data, '.csv')
	num.enrollees <- read.csv(enrollees.csv)

	# Enrollment growth over the years, split by gender.
	# First total the counts for every Year/Gender pair ...
	series.sum.by.gender <- aggregate(
	  Count ~ Year + Gender,
	  data = num.enrollees,
	  FUN = sum
	)

	# ... then draw one line (with open-circle markers) per gender.
	ggplot(data = series.sum.by.gender,
	       mapping = aes(x = Year, y = Count, colour = Gender)) +
	  geom_point(shape = 1) +
	  geom_line() +
	  labs(y = "Number of Enrollees") +
	  scale_y_continuous(labels = comma)

	# Gender differences by Region
	# Total the counts per Year/Gender/Region, then spread Gender into columns so
	# each row carries the Female and Male totals of one Year/Region pair.
	counts <- aggregate(Count ~ Year + Gender + Region, data=num.enrollees, FUN=sum)
	gender.diffs <- cast(counts, Year + Region ~ Gender, value="Count")
	gender.diffs$TotalStudents <- gender.diffs$Female + gender.diffs$Male
	gender.diffs$FemaleRatio <- gender.diffs$Female / gender.diffs$TotalStudents

	# Pieces shared by both plots below.
	# FemaleRatio is a fraction (Female / Total), so the axis is formatted with
	# `percent` to agree with its "Percentage" label. The parity line sits at a
	# constant, so yintercept is passed as a parameter rather than through aes().
	ratio.points <- ggplot(gender.diffs, aes(x=Year, y=FemaleRatio, colour=Region)) +
	  geom_point(shape=1)
	ratio.decorations <- list(
	  ylab("Percentage of Female Enrollees"),
	  scale_y_continuous(labels=percent),
	  geom_hline(yintercept=0.5, colour="#990000", linetype="dashed")
	)

	# Female share per region over time: with connecting lines, then points only.
	ratio.points + geom_line() + ratio.decorations
	ratio.points + ratio.decorations

	# Closer look at ARMM.
	gender.diffs[gender.diffs$Region == "ARMM", ]

	# Regions where girls were the minority in at least one year, and how many
	# such regions there are. which() drops NA ratios (e.g. a Year/Region pair
	# missing one gender) so they are not counted as an extra "NA" region.
	female.minority.regions <-
	  unique(gender.diffs$Region[which(gender.diffs$FemaleRatio < 0.5)])
	female.minority.regions
	length(female.minority.regions)

	# The Year/Region pair with the lowest female share.
	gender.diffs[which.min(gender.diffs$FemaleRatio), ]

	# Enrollees growth through the years by Region
	series.sum.by.region <- aggregate(Count ~ Year + Region, data=num.enrollees, FUN=sum)

	# Region names in a fixed order. levels() alone returns NULL when read.csv()
	# yields a character column (the default since R 4.0), which would leave the
	# colour vectors below empty; factor() handles both column types.
	region.levels <- levels(factor(num.enrollees$Region))

	# A region is highlighted when its 2005 total lies outside
	# [color.lower.limit, color.upper.limit]; the band depends on the dataset.
	if (which.data == 'secondary') {
	  color.upper.limit <- 400000
	  color.lower.limit <- 100000
	  other.of.interest <- integer(0)
	} else if (which.data == 'elementary') {
	  color.upper.limit <- 1000000
	  color.lower.limit <- 400000

	  # ARMM is always highlighted for the elementary data.
	  armm.index <- match("ARMM", region.levels)
	  other.of.interest <- c(armm.index)
	} else {
	  # Without this branch the limits would be undefined and the failure would
	  # surface later as an unrelated "object not found" error.
	  stop("which.data must be 'elementary' or 'secondary', not '", which.data, "'",
	       call.=FALSE)
	}

	# The two limit tests are combined with a plain `|`; the backslash that
	# preceded it was a stray escape and not valid R.
	in.2005 <- series.sum.by.region$Year == 2005
	outside.band <- series.sum.by.region$Count > color.upper.limit |
	                series.sum.by.region$Count < color.lower.limit
	regions.to.color <- series.sum.by.region$Region[in.2005 & outside.band]

	# Start with every region grey, then give the highlighted ones their own
	# rainbow colour.
	region.colors <- rainbow(length(region.levels))
	line.colors <- rep("#7f7f7f", length(region.levels))
	region.indexes.to.color <- match(regions.to.color, region.levels)
	region.indexes.to.color <- c(region.indexes.to.color, other.of.interest)
	# match() gives NA for a region that is not present, and NA subscripts are
	# not allowed in the assignment below.
	region.indexes.to.color <- region.indexes.to.color[!is.na(region.indexes.to.color)]
	line.colors[region.indexes.to.color] <- region.colors[region.indexes.to.color]
	# Name the colours so the manual scale pairs them with regions by name
	# instead of relying on positional order.
	names(line.colors) <- region.levels

	ggplot(series.sum.by.region, aes(x=Year, y=Count, colour=Region)) +
	  geom_point(shape=1) +
	  geom_line() +
	  ylab("Number of Enrollees") +
	  scale_y_continuous(labels=comma) +
	  scale_color_manual(values=line.colors)


	# Total number of enrollees recorded for 2012.
	is.2012 <- num.enrollees$Year == 2012
	sum(num.enrollees$Count[is.2012])
	# Work from the project directory so the relative data paths resolve.
	setwd("~/Dropbox/Projects/deped-enrollment")

	# Dataset to process; it is spliced into the file names built below.
	which.data <- 'secondary'

	# The raw file is named "<head>-<which.data>-<tail>".
	input.filename.head <- 'data/deped-total-school-enrollment-for-public'
	input.filename.tail <- 'schools-2005-to-2012.csv'
	input.filename <- paste(
	  c(input.filename.head, which.data, input.filename.tail),
	  collapse='-'
	)

	# The first 7 lines of the file are skipped before the CSV is parsed.
	data <- read.csv(input.filename, skip=7)

	# Give the third column a usable name.
	colnames(data)[3] <- "Gender"

	# Rows are handled in groups of three: Region and Division are read from the
	# first row of each group and copied onto the second and third rows, where
	# the raw file leaves them blank.
	group.starts <- seq(from=1, to=nrow(data), by=3)
	regions <- data$Region[group.starts]
	divisions <- data$Division[group.starts]
	for (offset in 2:3) {
	  blank.rows <- seq(from=offset, to=nrow(data), by=3)
	  data$Region[blank.rows] <- regions
	  data$Division[blank.rows] <- divisions
	}

	# Drop every aggregate row: subtotals and the grand total (flagged in Region)
	# and the "Total" line (flagged in Gender).
	is.total.row <- grepl('Subtotal', data$Region) |
	                grepl('Grand total', data$Region) |
	                grepl('Total', data$Gender)
	data <- data[!is.total.row, ]

	# Renumber the rows 1..n now that some have been removed.
	rownames(data) <- seq_len(nrow(data))

	## Reshape from wide to long: each yearly measurement of enrollees gets its
	## own row. (reshape's melt could do this as well.)
	years <- 2005:2012
	year.columns <- paste("Enrollment", years, sep="_")

	# Built in one vectorised step instead of assigning cell by cell: the
	# element-wise loop re-copied the data frame on every assignment and, being
	# driven by 1:nrow(data), still ran (for i = 1 and i = 0) on an empty table.
	num.enrollees <- data.frame(
	  # Every source row is repeated once per year. as.character() keeps the
	  # resulting factors limited to the values that are actually present.
	  Region   = as.factor(rep(as.character(data$Region),   each=length(years))),
	  Division = as.factor(rep(as.character(data$Division), each=length(years))),
	  Gender   = as.factor(rep(as.character(data$Gender),   each=length(years))),
	  # The years cycle fastest: 2005..2012 for row 1, then again for row 2, ...
	  Year     = as.factor(rep(years, times=nrow(data))),
	  # t() turns each source row into a column, so flattening walks through the
	  # yearly counts of row 1, then row 2, ... in step with Year above.
	  Count    = as.vector(t(as.matrix(data[, year.columns, drop=FALSE])))
	)

	# Write the reshaped table to a CSV named after the dataset, without the
	# row-name column.
	output.filename <- sprintf('data/num-enrollees-%s.csv', which.data)
	write.csv(num.enrollees, file=output.filename, row.names=FALSE)