Btibert3/NEAIR 2010.R

## gistfile2.txt

      
    Raw
  

              gistfile2.txt
            
          
## NEAIR 2010.R
#######################################################################
# Create social network from NCES 2008 migration data
# Helpful resources:
#                   R in a Nutshell -- Adler
#                   R for SAS and SPSS Users
#                   statmethods.net -- Quick R
#                   igraph.sourceforge.net
#######################################################################

# Load libraries
# if they aren't installed, simply use install.packages("igraph") for whatever package
library(igraph)
library(ggplot2)
library(gdata)
library(sqldf)
library(maps)


# Set the working directory for the data
# Notice the forward slash on windows
setwd("C:/Documents and Settings/BTIBERT/Desktop/Bentley Ideas/NEAIR/Presentation")


# Read in the datasets -- assumes they are in your working directory
mig.temp08 <- read.csv("ef2008c.csv", header=TRUE)
dir.temp08 <- read.csv("hd2008.csv", header=TRUE)
load("LatLong.Rdata") # dataset name = latlong


# use only the variables we want
# mig = unitid, efcstate (same as fips), HS <12 months enrollment
# dir = unitid, name, stateabbrev, fips, region, sector, degree granting, NCES tables, Carnegie, landgrant
mig <- mig.temp08[, c(1,2,7)]
dir <- dir.temp08[, c(1:2, 5, 7, 8, 19, 27, 41, 45, 53 )]


# select only continental US states incl. DC
mig <- mig[mig$efcstate %in% c(1, 4:6, 8:13, 16:42, 44:51, 53:56), ]
dir <- dir[dir$fips %in% c(1, 4:6, 8:13, 16:42, 44:51, 53:56), ]
latlong <- latlong[latlong$state != "AS", ]
latlong <- latlong[latlong$state != "MP", ]
latlong <- latlong[latlong$state != "PR", ]
latlong <- latlong[latlong$state != "VI", ]
latlong <- latlong[latlong$state != "HI", ]
latlong <- latlong[latlong$state != "AK", ]


# drop unused factor levels with drop.levels function in package gdata
mig <- drop.levels(mig, reorder=FALSE)
dir <- drop.levels(dir, reorder=FALSE)
latlong <- drop.levels(latlong, reorder=FALSE)


# create a lookup table to hold fips and state abbreviations
# use this to recode efcstate to labels
# uses sqldf package to perform SQL-like queries on the data sets
lookup <- sqldf("SELECT fips, stabbr FROM dir GROUP BY fips, stabbr")
names(lookup)[2] <- "res_state"
lookup <- drop.levels(lookup, reorder=FALSE)


# put all data together
# mig = 52200 observations
ds <- merge(mig, dir)
ds <- merge(ds, lookup, by.x="efcstate", by.y="fips")


# filter on Public 4-year and Private Non-Profit 4-year
ds <- ds[ds$sector %in% c(1,2), ]

# filter out NCES stat tables
ds <- ds[ds$pset4flg==1, ]

# filter out degree granting
ds <- ds[ds$deggrant ==1, ]

# filter out students who stay in-state
ds <- ds[ds$efcstate != ds$fips, ]

# filter out rows that region = 0,9
ds <- ds[ds$obereg %in% c(1:8), ]


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Build the data frames needed to build the network graph
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# create the edges (relationships) data frame
rel <- sqldf("SELECT res_state, stabbr, sum(efres02) FROM ds GROUP BY res_state, stabbr")
names(rel)[2] <- "school_state"
str(rel)
summary(rel)


# create the vertices (states) traits data frame
traits <- sqldf("SELECT stabbr, obereg FROM ds GROUP BY stabbr, obereg")
traits$obereg <- as.factor(as.character(traits$obereg))
traits <- merge(latlong, traits, by.x="state", by.y="stabbr")
traits <- drop.levels(traits, reorder=FALSE)
levels(traits$obereg) <- c("New England", "Mid-East", "Great Lakes", "Plains", "Southeast",
		"Southwest", "Rocky Mtns", "Far West")
str(traits)


# save out the data frames to datasets for later use
# this will create an R data file in your working directory
save(rel, traits, file="Network.Rdata")

# load the file (if you started a new session)
# Uncomment the code if you want to re-load the data from a previous session
#load("Network.Rdata")
#ls()


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Create basic choropleth map with red/blue if import/export more students
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
names(rel)[3] <- "total"
export <- sqldf("SELECT res_state AS state, sum(total) as export FROM rel GROUP BY res_state")
import <- sqldf("SELECT school_state AS state, sum(total) as import FROM rel GROUP BY school_state")
temp <- merge(export, import)
temp$net <- temp$import - temp$export


#net migration
net.mig <- temp[order(temp$net, decreasing=F), ]
View(net.mig)
# las rotates labels (https://stat.ethz.ch/pipermail/r-help/2008-April/158652.html)
barplot(net.mig$net, horiz=T, names.arg=(net.mig$state), cex.names=.6, las=1,
		xlim=c(-30000,30000), main="Freshmen Net Migration 2008")


#load the state "map" data using a ggplot2 function
state_df <- map_data("state")
state.outline <- map_data("state")
str(state_df)

#need to use match to find the index
#NOTE:  can't just merge because will be out of order, need to use match
#adding abbrevations to the "map" dataset
#http://gist.github.com/233134
#http://www.thisisthegreenroom.com/2009/choropleths-in-r/
state_df$state <- state.abb[match(state_df$region, tolower(state.name))]
str(state_df)

#merge the data -- will lose DC....not part of state.name or state.abb
choro <- merge(state_df, temp )
str(choro)

# use the colors we want for each state depending on the test value
choro$colind <- ifelse(choro$import > choro$export,"Import More","Export More")

# not sure why I need to order, but it works!
choro <-choro[order(choro$order), ]

# create map using ggplot2
# need a second map layer to outline the states nicely
map.1 <- ggplot(choro, aes(long, lat, group=group)) + geom_polygon(aes(fill=colind))
map.1 <- map.1 + geom_path(data = state.outline, colour = "white", size = .75)
map.1


## create a map that is points and segments

# create a temp data frame that is the segments to be drawn (need two sets of lat/long)
lines <- merge(rel, latlong, by.x="res_state", by.y="state")
names(lines)[4] <- "res_lat"
names(lines)[5] <- "res_long"
lines <- merge(lines, latlong, by.x="school_state", by.y="state")
names(lines)[6] <- "schl_lat"
names(lines)[7] <- "schl_long"

# plot segments with 100 or more
lines <- lines[lines$total >= 100, ]

# create a basic flag  if 300 or more
lines$flag <- ifelse(lines$total >=300, "red", "grey")

# basic spider map
map("state")
points(traits$longitude, traits$latitude)
segments(lines$res_long, lines$res_lat, lines$schl_long, lines$schl_lat)

# spider map with segments colored on flag
map("state")
points(traits$longitude, traits$latitude)
segments(lines$res_long, lines$res_lat, lines$schl_long, lines$schl_lat, col=lines$flag)


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Explore the relationships
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# set the rel data frame to remove all connections with zero (from sqldf I presume)
rel <- rel[rel$total >0, ]

# histogram
ggplot(rel, aes(total)) + geom_histogram()

# summary stats
length(rel$total)
summary(rel$total)
quantile(rel$total, c(.05, .25, .5, .75, .95))


str(rel)


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Build the graph
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
g.1 <- graph.data.frame(rel, directed=TRUE, vertices=traits)
str(g.1)
summary(g.1)
plot(g.1, vertex.label=traits$state)
tkplot(g.1, vertex.label=traits$state,
		canvas.width=800,
		canvas.height=800)

V(g.1)$color="grey"
V(g.1)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
plot(g.1, vertex.label=traits$state,
		layout=layout.kamada.kawai)
tkplot(g.1, vertex.label=traits$state)

tkplot(g.1, vertex.label=traits$state,
		layout=layout.fruchterman.reingold)


# only keep 100 or more
rel.2 <- rel[rel[,3] >= 100, ]
str(rel.2)

# % of rows kept after filter
nrow(rel.2) / nrow(rel)


g.2 <- graph.data.frame(rel.2, directed=TRUE, vertices=traits)

V(g.2)$color="grey"
V(g.2)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
V(g.2)[name %in% c('NY', 'NJ', 'PA', 'MD', 'DC', 'DE')]$color <- "yellow"

# be careful - just save out as eps with generic plot name
tkplot(g.2, vertex.label=traits$state,
		canvas.width=700,
		canvas.height=700,
		layout=layout.fruchterman.reingold)

# get the coordinates from first graph so we can always reproduce the graph
# with the nodes in the same plot
lay <- tkplot.getcoords(2)

# example to reproduce the plot
g.2$layout <- lay  # sets the layout of the nodes
tkplot(g.2, vertex.label=traits$state,
		canvas.width=700,
		canvas.height=700)


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Analysis
# http://www.drewconway.com/zia/wp-content/uploads/2009/08/sna_in_R.pdf
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

cent <- data.frame(bet=betweenness(g.2), eig=evcent(g.2)$vector, name=V(g.2)$name)
res <- lm(eig~bet, data=cent)$residuals
cent <- transform(cent, res=res)

# We use ggplot2 to make things a
# bit prettier
p<-ggplot(cent,aes(x=bet,y=eig,
						label=name,colour=res))+
		xlab("Betweenness Centrality")+
		ylab("Eigenvector Centrality")

p <- p+geom_text()+opts(title="Key Actor Analysis for 2008 Migration Dataset")
p

# Key actor plot
nodes<-V(g.2)$name
nodes[which(abs(res) < .2 )] <- NA

# List the "key actors" by the defintion of .25
key.actors <- nodes[!is.na(nodes)]
key.actors
write.csv(key.actors, "key actors.csv", row.names=F)

V(g.2)$size<-abs(res)*65
#V(g.2)$name[which(abs(res) < .2 )] <- NA
V(g.2)$color="grey"
V(g.2)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
V(g.2)[name %in% c('NY', 'NJ', 'PA', 'MD', 'DC', 'DE')]$color <- "yellow"
tkplot(g.2,vertex.label=nodes,
		canvas.width=700,
		canvas.height=700)


#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Cleanup the session
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
rm(list=ls())
q()
n
	#######################################################################
	# Create social network from NCES 2008 migration data
	# Helpful resources:
	# R in a Nutshell -- Adler
	# R for SAS and SPSS Users
	# statmethods.net -- Quick R
	# igraph.sourceforge.net
	#######################################################################

	# Load libraries
	# if they aren't installed, simply use install.packages("igraph") for whatever package
	library(igraph)
	library(ggplot2)
	library(gdata)
	library(sqldf)
	library(maps)


	# Set the working directory for the data
	# Notice the forward slash on windows
	setwd("C:/Documents and Settings/BTIBERT/Desktop/Bentley Ideas/NEAIR/Presentation")


	# Read in the datasets -- assumes they are in your working directory
	mig.temp08 <- read.csv("ef2008c.csv", header=TRUE)
	dir.temp08 <- read.csv("hd2008.csv", header=TRUE)
	load("LatLong.Rdata") # dataset name = latlong


	# use only the variables we want
	# mig = unitid, efcstate (same as fips), HS <12 months enrollment
	# dir = unitid, name, stateabbrev, fips, region, sector, degree granting, NCES tables, Carnegie, landgrant
	mig <- mig.temp08[, c(1,2,7)]
	dir <- dir.temp08[, c(1:2, 5, 7, 8, 19, 27, 41, 45, 53 )]


	# select only continental US states incl. DC
	mig <- mig[mig$efcstate %in% c(1, 4:6, 8:13, 16:42, 44:51, 53:56), ]
	dir <- dir[dir$fips %in% c(1, 4:6, 8:13, 16:42, 44:51, 53:56), ]
	latlong <- latlong[latlong$state != "AS", ]
	latlong <- latlong[latlong$state != "MP", ]
	latlong <- latlong[latlong$state != "PR", ]
	latlong <- latlong[latlong$state != "VI", ]
	latlong <- latlong[latlong$state != "HI", ]
	latlong <- latlong[latlong$state != "AK", ]



	# drop unused factor levels with drop.levels function in package gdata
	mig <- drop.levels(mig, reorder=FALSE)
	dir <- drop.levels(dir, reorder=FALSE)
	latlong <- drop.levels(latlong, reorder=FALSE)


	# create a lookup table to hold fips and state abbreviations
	# use this to recode efcstate to labels
	# uses sqldf package to perform SQL-like queries on the data sets
	lookup <- sqldf("SELECT fips, stabbr FROM dir GROUP BY fips, stabbr")
	names(lookup)[2] <- "res_state"
	lookup <- drop.levels(lookup, reorder=FALSE)


	# put all data together
	# mig = 52200 observations
	ds <- merge(mig, dir)
	ds <- merge(ds, lookup, by.x="efcstate", by.y="fips")


	# filter on Public 4-year and Private Non-Profit 4-year
	ds <- ds[ds$sector %in% c(1,2), ]

	# filter out NCES stat tables
	ds <- ds[ds$pset4flg==1, ]

	# filter out degree granting
	ds <- ds[ds$deggrant ==1, ]

	# filter out students who stay in-state
	ds <- ds[ds$efcstate != ds$fips, ]

	# filter out rows that region = 0,9
	ds <- ds[ds$obereg %in% c(1:8), ]


	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Build the data frames needed to build the network graph
	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	# create the edges (relationships) data frame
	rel <- sqldf("SELECT res_state, stabbr, sum(efres02) FROM ds GROUP BY res_state, stabbr")
	names(rel)[2] <- "school_state"
	str(rel)
	summary(rel)


	# create the vertices (states) traits data frame
	traits <- sqldf("SELECT stabbr, obereg FROM ds GROUP BY stabbr, obereg")
	traits$obereg <- as.factor(as.character(traits$obereg))
	traits <- merge(latlong, traits, by.x="state", by.y="stabbr")
	traits <- drop.levels(traits, reorder=FALSE)
	levels(traits$obereg) <- c("New England", "Mid-East", "Great Lakes", "Plains", "Southeast",
	"Southwest", "Rocky Mtns", "Far West")
	str(traits)


	# save out the data frames to datasets for later use
	# this will create an R data file in your working directory
	save(rel, traits, file="Network.Rdata")

	# load the file (if you started a new session)
	# Uncomment the code if you want to re-load the data from a previous session
	#load("Network.Rdata")
	#ls()



	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Create basic choropleth map with red/blue if import/export more students
	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	names(rel)[3] <- "total"
	export <- sqldf("SELECT res_state AS state, sum(total) as export FROM rel GROUP BY res_state")
	import <- sqldf("SELECT school_state AS state, sum(total) as import FROM rel GROUP BY school_state")
	temp <- merge(export, import)
	temp$net <- temp$import - temp$export


	#net migration
	net.mig <- temp[order(temp$net, decreasing=F), ]
	View(net.mig)
	# las rotates labels (https://stat.ethz.ch/pipermail/r-help/2008-April/158652.html)
	barplot(net.mig$net, horiz=T, names.arg=(net.mig$state), cex.names=.6, las=1,
	xlim=c(-30000,30000), main="Freshmen Net Migration 2008")


	#load the state "map" data using a ggplot2 function
	state_df <- map_data("state")
	state.outline <- map_data("state")
	str(state_df)

	#need to use match to find the index
	#NOTE: can't just merge because will be out of order, need to use match
	#adding abbrevations to the "map" dataset
	#http://gist.github.com/233134
	#http://www.thisisthegreenroom.com/2009/choropleths-in-r/
	state_df$state <- state.abb[match(state_df$region, tolower(state.name))]
	str(state_df)

	#merge the data -- will lose DC....not part of state.name or state.abb
	choro <- merge(state_df, temp )
	str(choro)

	# use the colors we want for each state depending on the test value
	choro$colind <- ifelse(choro$import > choro$export,"Import More","Export More")

	# not sure why I need to order, but it works!
	choro <-choro[order(choro$order), ]

	# create map using ggplot2
	# need a second map layer to outline the states nicely
	map.1 <- ggplot(choro, aes(long, lat, group=group)) + geom_polygon(aes(fill=colind))
	map.1 <- map.1 + geom_path(data = state.outline, colour = "white", size = .75)
	map.1


	## create a map that is points and segments

	# create a temp data frame that is the segments to be drawn (need two sets of lat/long)
	lines <- merge(rel, latlong, by.x="res_state", by.y="state")
	names(lines)[4] <- "res_lat"
	names(lines)[5] <- "res_long"
	lines <- merge(lines, latlong, by.x="school_state", by.y="state")
	names(lines)[6] <- "schl_lat"
	names(lines)[7] <- "schl_long"

	# plot segments with 100 or more
	lines <- lines[lines$total >= 100, ]

	# create a basic flag if 300 or more
	lines$flag <- ifelse(lines$total >=300, "red", "grey")

	# basic spider map
	map("state")
	points(traits$longitude, traits$latitude)
	segments(lines$res_long, lines$res_lat, lines$schl_long, lines$schl_lat)

	# spider map with segments colored on flag
	map("state")
	points(traits$longitude, traits$latitude)
	segments(lines$res_long, lines$res_lat, lines$schl_long, lines$schl_lat, col=lines$flag)




	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Explore the relationships
	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# set the rel data frame to remove all connections with zero (from sqldf I presume)
	rel <- rel[rel$total >0, ]

	# histogram
	ggplot(rel, aes(total)) + geom_histogram()

	# summary stats
	length(rel$total)
	summary(rel$total)
	quantile(rel$total, c(.05, .25, .5, .75, .95))


	str(rel)


	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Build the graph
	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	g.1 <- graph.data.frame(rel, directed=TRUE, vertices=traits)
	str(g.1)
	summary(g.1)
	plot(g.1, vertex.label=traits$state)
	tkplot(g.1, vertex.label=traits$state,
	canvas.width=800,
	canvas.height=800)

	V(g.1)$color="grey"
	V(g.1)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
	plot(g.1, vertex.label=traits$state,
	layout=layout.kamada.kawai)
	tkplot(g.1, vertex.label=traits$state)

	tkplot(g.1, vertex.label=traits$state,
	layout=layout.fruchterman.reingold)


	# only keep 100 or more
	rel.2 <- rel[rel[,3] >= 100, ]
	str(rel.2)

	# % of rows kept after filter
	nrow(rel.2) / nrow(rel)


	g.2 <- graph.data.frame(rel.2, directed=TRUE, vertices=traits)

	V(g.2)$color="grey"
	V(g.2)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
	V(g.2)[name %in% c('NY', 'NJ', 'PA', 'MD', 'DC', 'DE')]$color <- "yellow"

	# be careful - just save out as eps with generic plot name
	tkplot(g.2, vertex.label=traits$state,
	canvas.width=700,
	canvas.height=700,
	layout=layout.fruchterman.reingold)

	# get the coordinates from first graph so we can always reproduce the graph
	# with the nodes in the same plot
	lay <- tkplot.getcoords(2)

	# example to reproduce the plot
	g.2$layout <- lay # sets the layout of the nodes
	tkplot(g.2, vertex.label=traits$state,
	canvas.width=700,
	canvas.height=700)


	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Analysis
	# http://www.drewconway.com/zia/wp-content/uploads/2009/08/sna_in_R.pdf
	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	cent <- data.frame(bet=betweenness(g.2), eig=evcent(g.2)$vector, name=V(g.2)$name)
	res <- lm(eig~bet, data=cent)$residuals
	cent <- transform(cent, res=res)

	# We use ggplot2 to make things a
	# bit prettier
	p<-ggplot(cent,aes(x=bet,y=eig,
	label=name,colour=res))+
	xlab("Betweenness Centrality")+
	ylab("Eigenvector Centrality")

	p <- p+geom_text()+opts(title="Key Actor Analysis for 2008 Migration Dataset")
	p

	# Key actor plot
	nodes<-V(g.2)$name
	nodes[which(abs(res) < .2 )] <- NA

	# List the "key actors" by the defintion of .25
	key.actors <- nodes[!is.na(nodes)]
	key.actors
	write.csv(key.actors, "key actors.csv", row.names=F)

	V(g.2)$size<-abs(res)*65
	#V(g.2)$name[which(abs(res) < .2 )] <- NA
	V(g.2)$color="grey"
	V(g.2)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
	V(g.2)[name %in% c('NY', 'NJ', 'PA', 'MD', 'DC', 'DE')]$color <- "yellow"
	tkplot(g.2,vertex.label=nodes,
	canvas.width=700,
	canvas.height=700)


	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# Cleanup the session
	#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	rm(list=ls())
	q()
	n