Skip to content

Instantly share code, notes, and snippets.

@dmpe
Created November 1, 2014 23:18
Show Gist options
  • Save dmpe/b8c3f74d1e937d90b20f to your computer and use it in GitHub Desktop.
Save dmpe/b8c3f74d1e937d90b20f to your computer and use it in GitHub Desktop.
Analysis of Singapore's Education using three datasets
library("ggplot2")
source("PupilsPerTeacherRatio.R") # load our data
source("ExpenditurePerStudent_Pupils.R") # load the data
data = matrix(c(cleanedData.pupils$Years, cleanedData.pupils$ExpednInSD,
cleanedData.ratio$PupilsPerTeacher), ncol=3)
df = data.frame(data)
colnames(df) <- c("Year", "MoneySpend", "Ratio")
#TODO ggplot
pl = plot(x=df$Year, y=log(df$MoneySpend), ylim=c(2,10), main ="Red points are the ratio, which should be declining.
Whereas dark points are money spend on each student.")
points(x=df$Year, y=log(df$Ratio), col=2)
# let's load some data from Excel :_
suppressMessages(library("XLConnect"))
library("plyr")
# Number of Classes and Class Size by Level
if(!file.exists("NumberOfClassesSizeLevel.xlsx")) {
#download.file("http://data.gov.sg/Metadata/SGMatadata.aspx?id=0304010000000005778F",
# "NumberOfClassesSizeLevel.xlsx.xlsx")
}
loadedXLSofData.class = loadWorkbook("NumberOfClassesSizeLevel.xlsx")
rawTable.class = readWorksheet(loadedXLSofData.class, 1)
#Begin with just 2 columns
#select only primary "all" and secondary "all"
rawTable.class.twoCol <- rawTable.class[c("Year","Description",
"CLASSES.AND.CLASS.SIZE.BY.LEVEL...PUBLIC.SCHOOLS",
"CLASSES.AND.CLASS.SIZE.BY.LEVEL...PUBLIC.SCHOOLS.9")]
#rename column names to make it more clear for next steps
colnames(rawTable.class.twoCol) <- c("Year", "Description",
"AllNumbersForPrimarySchools", "AllNumbersForSecondarySchools")
#select row by its description and devide them into two tables
numberOfClases = subset(rawTable.class.twoCol,rawTable.class.twoCol$Description=="NO OF CLASSES")
avarageClassSize = subset(rawTable.class.twoCol,rawTable.class.twoCol$Description=="AVE CLASS SIZE")
#great; both are chars. We need to convert them
class(numberOfClases$AllNumbersForPrimarySchools)
class(numberOfClases$Year)
class(avarageClassSize$AllNumbersForSecondarySchools)
cleanedData.numberOfClases = suppressWarnings(data.frame(llply(numberOfClases, as.numeric)))
# because we apply as.numeric to chars we introduce NA (and warnings [sic!])
cleanedData.numberOfClases$Description <- NULL
cleanedData.avarageClassSize = suppressWarnings(data.frame(llply(avarageClassSize, as.numeric)))
cleanedData.avarageClassSize$Description <- NULL
# let's load some data from Excel :_
suppressMessages(library("XLConnect"))
library("plyr")
if(!file.exists("ExpenditurePerStudent_PS.xls")) {
download.file("http://data.gov.sg/Agency_Data/MOE/0303010000000005772Z.xls",
"ExpenditurePerStudent_PS.xls")
}
loadedXLSofData.pupils = loadWorkbook("ExpenditurePerStudent_PS.xls")
rawTable.pupils = readWorksheet(loadedXLSofData.pupils, 1)
rawData.pupils = rename(rawTable.pupils[3:30,0:2], c("X2.0" = "Years", "SGDATA" = "ExpednInSD"))
# summary(rawData) # DF which is a list
rawData.pupils$Years <- with(rawData.pupils,
replace(rawData.pupils$Years,
rawData.pupils$Years == "2013*", "2013"))
# NUMERIC all
cleanedData.pupils = data.frame(llply(rawData.pupils, as.numeric))
cleanedData.pupils = cleanedData.pupils[cleanedData.pupils$Years<=2012,]
summary(lm(cleanedData.pupils$ExpednInSD ~ cleanedData.pupils$Years))
cor(cleanedData.pupils$Years, cleanedData.pupils$ExpednInSD)
library("XLConnect")
library("plyr")
loadedXLSofData.ratio = loadWorkbook("PupilsPerTeacherPrimarySchools.xlsx")
rawTable.ratio = readWorksheet(loadedXLSofData.ratio, 1)
rawData.ratio = rename(rawTable.ratio[2:nrow(rawTable.ratio),], c("X0301050000000005757K" = "Years",
"Col2" = "PupilsPerTeacher"))
cleanedData.ratio = data.frame(llply(rawData.ratio, as.numeric))
cleanedData.ratio = cleanedData.ratio[cleanedData.ratio$Years>=1986,]
summary( lm(cleanedData.ratio$PupilsPerTeacher ~ cleanedData.ratio$Years))