## JoFrhwld/phila.schools.R

## phila.schools.R
library(rgdal) #This may be a pain to get installed. The OGR functions come from it.
## install gdal, available here: http://www.gdal.org/
## install proj.4, available here: http://trac.osgeo.org/proj/
## For some reason, R did not expect libproj to be installed where it was,
## so I had to tell it to look in /usr/local/bin

library(ggplot2)
library(reshape2)
library(plyr)
gpclibPermit()

## School locations shapefile. Data source:
## http://www.opendataphilly.org/opendata/resource/38/schools/
setwd("~/PhiladelphiaSchools201201/Philadelphia Schools/")
#ogrListLayers(dsn = ".")
# "PhiladelphiaSchools201201"

## Read the layer from the working directory and reproject it to
## longitude/latitude in a single expression.
schools.points <- spTransform(
  readOGR(dsn = ".", layer = "PhiladelphiaSchools201201"),
  CRS("+proj=longlat")
)

## Flatten to a data frame: coordinate columns first, attribute table after.
## The first two columns are renamed so the plots can map them as long/lat.
schools.df <- cbind(schools.points@coords, schools.points@data)
names(schools.df)[1:2] <- c("long", "lat")

## Narrow the schools down before the name-based merge with the graduation
## data, which keeps the string matching manageable. A row is kept when it is
## a District or Charter institution, is active, has facility type "School",
## and either is labelled "High School" or has "-12" in its grade organisation.
hschools.df <- subset(
  schools.df,
  INSTIT_TYP %in% c("District", "Charter") &
    ACTIVE == "y" &
    FACIL_TYPE == "School" &
    (GRADE_LEVE == "High School" | grepl("-12", GRADE_ORG))
)


## Graduation data; the code below uses its School, Graduates and
## Postsecondary columns. Data source:
## http://www.pde.state.pa.us/portal/server.pt/community/graduates/7426
## stringsAsFactors is set explicitly because the lookup table built below
## calls levels(grad$School), which needs a factor. Since R 4.0.0 read.delim()
## keeps strings as character by default, and levels() would then return NULL.
grad <- read.delim("~/graduation.txt", stringsAsFactors = TRUE)

## Hand-built lookup table pairing one grep pattern (matched against
## FACIL_NAME further down) with each school name in the graduation data.
## The patterns were created and adjusted by hand because the two data sets
## share no ID.
## NOTE: the pairing is purely positional. The patterns must be listed in the
## same order as the sorted levels of grad$School, one pattern per level.
## levels(factor(...)) is used instead of levels(...) so the School column is
## still produced when grad$School is a character vector (the read.delim()
## default since R 4.0.0); levels() alone returns NULL there and data.frame()
## would silently drop the column.
school.search <- data.frame(
  search = c(
    "Palumbo", "ARISE",
    "Bartram.*John", "Bodine", "Bok",
    "Carroll", "Carver", "Central", "Architecture",
    "Communications Tech", "Community Academy",
    "Constitution", "Creative", "Delaware",
    "Dobbins", "Douglas", "Edison",
    "Esperanza", "Fels", "FitzSimons",
    "Frankford", "Benjamin", "Franklin.*L", "Towne",
    "Freire", "Furness", "Girard Academic", "Germantown", "Girls",
    "Gratz", "Hope", "Imhotep", "Kensington C[^u]",
    "Kensington Cu", "Kensington I",
    "Martin Luther", "Lamberton", "Lankenau",
    "Lincoln", "Bracetti", "Maritime",
    "Math, Science.*Community", "Mastbaum", "Masterman",
    "Mastery Charter", "Thomas.*Mastery",
    "Civics", "Motivation", "Multi-Cultural",
    "New Media", "Northeast", "Olney", "Olney",
    "Overbrook", "Parkway Northwest", "Parkway West", "Parkway.*Center City",
    "Robeson", "Penn.*William", "Philadelphia Academy",
    "Philadelphia Electrical", "Phila.*Business",
    "Learning.*N", "Learning.*S",
    "Military.*Elverson", "Military.*Leeds",
    "Prep.*Math.*Science", "Randolph",
    "Rhodes", "Roxborough", "Saul",
    "Sayre.*W", "Future", "Science.*Leader",
    "South Phila", "Strawberry Mansion", "Swenson",
    "University City", "Vaux", "Washington.*G",
    "West Philadelphia", "Widener", "World Communications",
    "YouthBuild"
  ),
  School = levels(factor(grad$School))
)

## Rows of df2 whose FACIL_NAME matches the regular expression in df1$search.
## df1 is one (search, School) piece of school.search, df2 the schools table.
find_facilities <- function(df1, df2) {
  df2[grep(df1$search, df2$FACIL_NAME), ]
}

## Sanity check of the string matching: one list element of matched rows per
## (search, School) pair, followed by the number of rows in each element.
found.list <- dlply(school.search, .(search, School), find_facilities,
                    df2 = hschools.df)
found.n <- ldply(found.list, nrow)


## The real merged data: the matched rows of every pair combined into one
## data frame.
found.df <- ddply(school.search, .(search, School), find_facilities,
                  df2 = hschools.df)

## Attach the graduation data. No `by` is given, so join() uses the columns
## the two data frames have in common.
found.df <- join(found.df, grad)

## Load city limits shapefile
## http://www.opendataphilly.org/opendata/resource/8/city-limits/
setwd("~/phila-city_limits_shp/")
city_limits.shp <- spTransform(
  readOGR(dsn = ".", layer = "city_limits"),
  CRS("+proj=longlat")
)
## fortify() is asked to use "id" as the region column, so store each
## feature's row name under that name first.
city_limits.shp@data$id <- rownames(city_limits.shp@data)
city_limits.df <- fortify(city_limits.shp, region = "id")

## Load zip code shapefile
## http://www.opendataphilly.org/opendata/resource/44/zip-codes/
setwd("~/phila-zipcodes_shp/")
zips.shp <- spTransform(
  readOGR(dsn = ".", layer = "zipcodes"),
  CRS("+proj=longlat")
)
## fortify() is asked to use "id" as the region column, so store each
## feature's row name under that name first.
zips.shp@data$id <- rownames(zips.shp@data)
## Polygon outlines as a plain data frame, with the shapefile's attribute
## table joined back on (join() uses the columns both frames share).
zips.df <- join(fortify(zips.shp, region = "id"), zips.shp@data)

## Generate map of all data: matched schools drawn over the zip code outlines
## (thin grey) and the city limits (black). Point size is mapped to Graduates,
## fill to Postsecondary / Graduates, and shape to INSTIT_TYP.
ggplot(found.df, aes(long, lat)) +
  geom_polygon(data = zips.df, aes(group = group),
               fill = NA, color = "grey70", size = 0.2) +
  geom_polygon(data = city_limits.df, aes(group = group),
               fill = NA, color = "black") +
  geom_point(aes(size = Graduates, fill = Postsecondary / Graduates,
                 shape = INSTIT_TYP)) +
  ## 24 and 21 are fillable shapes, so the fill scale below applies to them.
  scale_shape_manual(name = "School Type", values = c(24, 21)) +
  scale_fill_gradient2(high = "darkred", low = "darkblue", mid = "grey90",
                       midpoint = 0.5, name = "Postsecondary") +
  guides(fill = "colorbar") +
  ## scale_area() was deprecated and later removed from ggplot2;
  ## scale_size_area() is the replacement named in its deprecation message.
  ## It maps a value of 0 to zero area, so sizes can differ slightly.
  scale_size_area() +
  theme_bw() +
  coord_map() +
  ## opts() and theme_blank() no longer exist in current ggplot2;
  ## theme() and element_blank() are their direct equivalents.
  theme(panel.grid.major = element_blank())

## Map of the higher mode of data: same layers as the map of all data, but
## restricted to schools with Postsecondary / Graduates above 0.05 and with
## the fill scale limited to 0.5-1 and centred on 0.75.
## NOTE(review): the row filter uses 0.05 while the fill limits start at 0.5,
## so ratios in between fall outside the fill scale. Confirm 0.05 is intended.
ggplot(subset(found.df, Postsecondary / Graduates > 0.05), aes(long, lat)) +
  geom_polygon(data = zips.df, aes(group = group),
               fill = NA, color = "grey70", size = 0.2) +
  geom_polygon(data = city_limits.df, aes(group = group),
               fill = NA, color = "black") +
  geom_point(aes(size = Graduates, fill = Postsecondary / Graduates,
                 shape = INSTIT_TYP)) +
  ## 24 and 21 are fillable shapes, so the fill scale below applies to them.
  scale_shape_manual(name = "School Type", values = c(24, 21)) +
  scale_fill_gradient2(high = "darkred", low = "darkblue", mid = "grey90",
                       midpoint = 0.75, name = "Postsecondary",
                       limits = c(0.5, 1)) +
  guides(fill = "colorbar") +
  ## scale_area() was deprecated and later removed from ggplot2;
  ## scale_size_area() is the replacement named in its deprecation message.
  ## It maps a value of 0 to zero area, so sizes can differ slightly.
  scale_size_area() +
  theme_bw() +
  coord_map() +
  ## opts() and theme_blank() no longer exist in current ggplot2;
  ## theme() and element_blank() are their direct equivalents.
  theme(panel.grid.major = element_blank())


## Dotplot of Postsecondary / Graduates with a count-scaled density curve
## underneath, one panel per INSTIT_TYP. The y axis gets an empty title and
## no breaks.
ggplot(data = found.df, mapping = aes(x = Postsecondary / Graduates)) +
  geom_density(mapping = aes(y = ..count..)) +
  geom_dotplot() +
  scale_y_continuous(name = "", breaks = NULL) +
  facet_wrap(~ INSTIT_TYP) +
  theme_bw()

## Dotplot, excluding Postsecondary == 0 data: the same plot as the full
## dotplot, built only from rows with Postsecondary > 0.
ggplot(
  data = subset(found.df, Postsecondary > 0),
  mapping = aes(x = Postsecondary / Graduates)
) +
  geom_density(mapping = aes(y = ..count..)) +
  geom_dotplot() +
  scale_y_continuous(name = "", breaks = NULL) +
  facet_wrap(~ INSTIT_TYP) +
  theme_bw()
	library(rgdal) #This may be a pain to get installed. The OGR functions come from it.
	## install gdal, available here: http://www.gdal.org/
	## install proj.4, available here: http://trac.osgeo.org/proj/
	## For some reason, R did not expect libproj to be installed where it was,
	## so I had to tell it to look in /usr/local/bin

	library(ggplot2)
	library(reshape2)
	library(plyr)
	gpclibPermit()

	## School locations shapefile. Data source:
	## http://www.opendataphilly.org/opendata/resource/38/schools/
	setwd("~/PhiladelphiaSchools201201/Philadelphia Schools/")
	#ogrListLayers(dsn = ".")
	# "PhiladelphiaSchools201201"

	## Read the layer from the working directory and reproject it to
	## longitude/latitude in a single expression.
	schools.points <- spTransform(
	  readOGR(dsn = ".", layer = "PhiladelphiaSchools201201"),
	  CRS("+proj=longlat")
	)

	## Flatten to a data frame: coordinate columns first, attribute table after.
	## The first two columns are renamed so the plots can map them as long/lat.
	schools.df <- cbind(schools.points@coords, schools.points@data)
	names(schools.df)[1:2] <- c("long", "lat")

	## Narrow the schools down before the name-based merge with the graduation
	## data, which keeps the string matching manageable. A row is kept when it
	## either has "-12" in its grade organisation or is labelled "High School",
	## has facility type "School", is active, and is a District or Charter
	## institution.
	## The OR between the two grade conditions must be a plain `|`; an escaped
	## `\|` is not valid R and prevents the whole file from parsing.
	hschools.df <- subset(
	  schools.df,
	  (grepl("-12", GRADE_ORG) | GRADE_LEVE == "High School") &
	    FACIL_TYPE == "School" &
	    ACTIVE == "y" &
	    INSTIT_TYP %in% c("District", "Charter")
	)


	## Graduation data; the code below uses its School, Graduates and
	## Postsecondary columns. Data source:
	## http://www.pde.state.pa.us/portal/server.pt/community/graduates/7426
	## stringsAsFactors is set explicitly because the lookup table built below
	## calls levels(grad$School), which needs a factor. Since R 4.0.0
	## read.delim() keeps strings as character by default, and levels() would
	## then return NULL.
	grad <- read.delim("~/graduation.txt", stringsAsFactors = TRUE)

	## Hand-built lookup table pairing one grep pattern (matched against
	## FACIL_NAME further down) with each school name in the graduation data.
	## The patterns were created and adjusted by hand because the two data sets
	## share no ID.
	## NOTE: the pairing is purely positional. The patterns must be listed in
	## the same order as the sorted levels of grad$School, one per level.
	## The "Learning", "Military", "Prep", "Sayre" and "Science.*Leader"
	## patterns use `.*` (any run of characters), matching the same list earlier
	## in this file; a bare `.` would match exactly one character.
	## levels(factor(...)) is used instead of levels(...) so the School column
	## is still produced when grad$School is a character vector (the
	## read.delim() default since R 4.0.0); levels() alone returns NULL there
	## and data.frame() would silently drop the column.
	school.search <- data.frame(
	  search = c(
	    "Palumbo", "ARISE",
	    "Bartram.*John", "Bodine", "Bok",
	    "Carroll", "Carver", "Central", "Architecture",
	    "Communications Tech", "Community Academy",
	    "Constitution", "Creative", "Delaware",
	    "Dobbins", "Douglas", "Edison",
	    "Esperanza", "Fels", "FitzSimons",
	    "Frankford", "Benjamin", "Franklin.*L", "Towne",
	    "Freire", "Furness", "Girard Academic", "Germantown", "Girls",
	    "Gratz", "Hope", "Imhotep", "Kensington C[^u]",
	    "Kensington Cu", "Kensington I",
	    "Martin Luther", "Lamberton", "Lankenau",
	    "Lincoln", "Bracetti", "Maritime",
	    "Math, Science.*Community", "Mastbaum", "Masterman",
	    "Mastery Charter", "Thomas.*Mastery",
	    "Civics", "Motivation", "Multi-Cultural",
	    "New Media", "Northeast", "Olney", "Olney",
	    "Overbrook", "Parkway Northwest", "Parkway West", "Parkway.*Center City",
	    "Robeson", "Penn.*William", "Philadelphia Academy",
	    "Philadelphia Electrical", "Phila.*Business",
	    "Learning.*N", "Learning.*S",
	    "Military.*Elverson", "Military.*Leeds",
	    "Prep.*Math.*Science", "Randolph",
	    "Rhodes", "Roxborough", "Saul",
	    "Sayre.*W", "Future", "Science.*Leader",
	    "South Phila", "Strawberry Mansion", "Swenson",
	    "University City", "Vaux", "Washington.*G",
	    "West Philadelphia", "Widener", "World Communications",
	    "YouthBuild"
	  ),
	  School = levels(factor(grad$School))
	)

	## Rows of df2 whose FACIL_NAME matches the regular expression in
	## df1$search. df1 is one (search, School) piece of school.search, df2 the
	## schools table.
	find_facilities <- function(df1, df2) {
	  df2[grep(df1$search, df2$FACIL_NAME), ]
	}

	## Sanity check of the string matching: one list element of matched rows
	## per (search, School) pair, followed by the number of rows in each element.
	found.list <- dlply(school.search, .(search, School), find_facilities,
	                    df2 = hschools.df)
	found.n <- ldply(found.list, nrow)


	## The real merged data: the matched rows of every pair combined into one
	## data frame.
	found.df <- ddply(school.search, .(search, School), find_facilities,
	                  df2 = hschools.df)

	## Attach the graduation data. No `by` is given, so join() uses the columns
	## the two data frames have in common.
	found.df <- join(found.df, grad)

	## Load city limits shapefile
	## http://www.opendataphilly.org/opendata/resource/8/city-limits/
	setwd("~/phila-city_limits_shp/")
	city_limits.shp <- spTransform(
	  readOGR(dsn = ".", layer = "city_limits"),
	  CRS("+proj=longlat")
	)
	## fortify() is asked to use "id" as the region column, so store each
	## feature's row name under that name first.
	city_limits.shp@data$id <- rownames(city_limits.shp@data)
	city_limits.df <- fortify(city_limits.shp, region = "id")

	## Load zip code shapefile
	## http://www.opendataphilly.org/opendata/resource/44/zip-codes/
	setwd("~/phila-zipcodes_shp/")
	zips.shp <- spTransform(
	  readOGR(dsn = ".", layer = "zipcodes"),
	  CRS("+proj=longlat")
	)
	## fortify() is asked to use "id" as the region column, so store each
	## feature's row name under that name first.
	zips.shp@data$id <- rownames(zips.shp@data)
	## Polygon outlines as a plain data frame, with the shapefile's attribute
	## table joined back on (join() uses the columns both frames share).
	zips.df <- join(fortify(zips.shp, region = "id"), zips.shp@data)

	## Generate map of all data: matched schools drawn over the zip code
	## outlines (thin grey) and the city limits (black). Point size is mapped to
	## Graduates, fill to Postsecondary / Graduates, and shape to INSTIT_TYP.
	ggplot(found.df, aes(long, lat)) +
	  geom_polygon(data = zips.df, aes(group = group),
	               fill = NA, color = "grey70", size = 0.2) +
	  geom_polygon(data = city_limits.df, aes(group = group),
	               fill = NA, color = "black") +
	  geom_point(aes(size = Graduates, fill = Postsecondary / Graduates,
	                 shape = INSTIT_TYP)) +
	  ## 24 and 21 are fillable shapes, so the fill scale below applies to them.
	  scale_shape_manual(name = "School Type", values = c(24, 21)) +
	  scale_fill_gradient2(high = "darkred", low = "darkblue", mid = "grey90",
	                       midpoint = 0.5, name = "Postsecondary") +
	  guides(fill = "colorbar") +
	  ## scale_area() was deprecated and later removed from ggplot2;
	  ## scale_size_area() is the replacement named in its deprecation message.
	  ## It maps a value of 0 to zero area, so sizes can differ slightly.
	  scale_size_area() +
	  theme_bw() +
	  coord_map() +
	  ## opts() and theme_blank() no longer exist in current ggplot2;
	  ## theme() and element_blank() are their direct equivalents.
	  theme(panel.grid.major = element_blank())

	## Map of the higher mode of data: same layers as the map of all data, but
	## restricted to schools with Postsecondary / Graduates above 0.05 and with
	## the fill scale limited to 0.5-1 and centred on 0.75.
	## NOTE(review): the row filter uses 0.05 while the fill limits start at
	## 0.5, so ratios in between fall outside the fill scale. Confirm 0.05 is
	## intended.
	ggplot(subset(found.df, Postsecondary / Graduates > 0.05), aes(long, lat)) +
	  geom_polygon(data = zips.df, aes(group = group),
	               fill = NA, color = "grey70", size = 0.2) +
	  geom_polygon(data = city_limits.df, aes(group = group),
	               fill = NA, color = "black") +
	  geom_point(aes(size = Graduates, fill = Postsecondary / Graduates,
	                 shape = INSTIT_TYP)) +
	  ## 24 and 21 are fillable shapes, so the fill scale below applies to them.
	  scale_shape_manual(name = "School Type", values = c(24, 21)) +
	  scale_fill_gradient2(high = "darkred", low = "darkblue", mid = "grey90",
	                       midpoint = 0.75, name = "Postsecondary",
	                       limits = c(0.5, 1)) +
	  guides(fill = "colorbar") +
	  ## scale_area() was deprecated and later removed from ggplot2;
	  ## scale_size_area() is the replacement named in its deprecation message.
	  ## It maps a value of 0 to zero area, so sizes can differ slightly.
	  scale_size_area() +
	  theme_bw() +
	  coord_map() +
	  ## opts() and theme_blank() no longer exist in current ggplot2;
	  ## theme() and element_blank() are their direct equivalents.
	  theme(panel.grid.major = element_blank())


	## Dotplot of Postsecondary / Graduates with a count-scaled density curve
	## underneath, one panel per INSTIT_TYP. The y axis gets an empty title and
	## no breaks.
	ggplot(data = found.df, mapping = aes(x = Postsecondary / Graduates)) +
	  geom_density(mapping = aes(y = ..count..)) +
	  geom_dotplot() +
	  scale_y_continuous(name = "", breaks = NULL) +
	  facet_wrap(~ INSTIT_TYP) +
	  theme_bw()

	## Dotplot, excluding Postsecondary == 0 data: the same plot as the full
	## dotplot, built only from rows with Postsecondary > 0.
	ggplot(
	  data = subset(found.df, Postsecondary > 0),
	  mapping = aes(x = Postsecondary / Graduates)
	) +
	  geom_density(mapping = aes(y = ..count..)) +
	  geom_dotplot() +
	  scale_y_continuous(name = "", breaks = NULL) +
	  facet_wrap(~ INSTIT_TYP) +
	  theme_bw()