Created
November 22, 2010 20:19
-
-
Save Btibert3/710595 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
####################################################################### | |
# Create social network from NCES 2008 migration data | |
# Helpful resources: | |
# R in a Nutshell -- Adler | |
# R for SAS and SPSS Users | |
# statmethods.net -- Quick R | |
# igraph.sourceforge.net | |
####################################################################### | |
# Load libraries | |
# if they aren't installed, simply use install.packages("igraph") for whatever package | |
library(igraph) | |
library(ggplot2) | |
library(gdata) | |
library(sqldf) | |
library(maps) | |
# Set the working directory for the data | |
# Notice the forward slash on windows | |
setwd("C:/Documents and Settings/BTIBERT/Desktop/Bentley Ideas/NEAIR/Presentation") | |
# Read in the datasets -- assumes they are in your working directory | |
mig.temp08 <- read.csv("ef2008c.csv", header=TRUE) | |
dir.temp08 <- read.csv("hd2008.csv", header=TRUE) | |
load("LatLong.Rdata") # dataset name = latlong | |
# use only the variables we want | |
# mig = unitid, efcstate (same as fips), HS <12 months enrollment | |
# dir = unitid, name, stateabbrev, fips, region, sector, degree granting, NCES tables, Carnegie, landgrant | |
mig <- mig.temp08[, c(1,2,7)] | |
dir <- dir.temp08[, c(1:2, 5, 7, 8, 19, 27, 41, 45, 53 )] | |
# select only continental US states incl. DC | |
mig <- mig[mig$efcstate %in% c(1, 4:6, 8:13, 16:42, 44:51, 53:56), ] | |
dir <- dir[dir$fips %in% c(1, 4:6, 8:13, 16:42, 44:51, 53:56), ] | |
latlong <- latlong[latlong$state != "AS", ] | |
latlong <- latlong[latlong$state != "MP", ] | |
latlong <- latlong[latlong$state != "PR", ] | |
latlong <- latlong[latlong$state != "VI", ] | |
latlong <- latlong[latlong$state != "HI", ] | |
latlong <- latlong[latlong$state != "AK", ] | |
# drop unused factor levels with drop.levels function in package gdata | |
mig <- drop.levels(mig, reorder=FALSE) | |
dir <- drop.levels(dir, reorder=FALSE) | |
latlong <- drop.levels(latlong, reorder=FALSE) | |
# create a lookup table to hold fips and state abbreviations | |
# use this to recode efcstate to labels | |
# uses sqldf package to perform SQL-like queries on the data sets | |
lookup <- sqldf("SELECT fips, stabbr FROM dir GROUP BY fips, stabbr") | |
names(lookup)[2] <- "res_state" | |
lookup <- drop.levels(lookup, reorder=FALSE) | |
# put all data together | |
# mig = 52200 observations | |
ds <- merge(mig, dir) | |
ds <- merge(ds, lookup, by.x="efcstate", by.y="fips") | |
# filter on Public 4-year and Private Non-Profit 4-year | |
ds <- ds[ds$sector %in% c(1,2), ] | |
# filter out NCES stat tables | |
ds <- ds[ds$pset4flg==1, ] | |
# filter out degree granting | |
ds <- ds[ds$deggrant ==1, ] | |
# filter out students who stay in-state | |
ds <- ds[ds$efcstate != ds$fips, ] | |
# filter out rows that region = 0,9 | |
ds <- ds[ds$obereg %in% c(1:8), ] | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# Build the data frames needed to build the network graph | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# create the edges (relationships) data frame | |
rel <- sqldf("SELECT res_state, stabbr, sum(efres02) FROM ds GROUP BY res_state, stabbr") | |
names(rel)[2] <- "school_state" | |
str(rel) | |
summary(rel) | |
# create the vertices (states) traits data frame | |
traits <- sqldf("SELECT stabbr, obereg FROM ds GROUP BY stabbr, obereg") | |
traits$obereg <- as.factor(as.character(traits$obereg)) | |
traits <- merge(latlong, traits, by.x="state", by.y="stabbr") | |
traits <- drop.levels(traits, reorder=FALSE) | |
levels(traits$obereg) <- c("New England", "Mid-East", "Great Lakes", "Plains", "Southeast", | |
"Southwest", "Rocky Mtns", "Far West") | |
str(traits) | |
# save out the data frames to datasets for later use | |
# this will create an R data file in your working directory | |
save(rel, traits, file="Network.Rdata") | |
# load the file (if you started a new session) | |
# Uncomment the code if you want to re-load the data from a previous session | |
#load("Network.Rdata") | |
#ls() | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# Create basic choropleth map with red/blue if import/export more students | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
names(rel)[3] <- "total" | |
export <- sqldf("SELECT res_state AS state, sum(total) as export FROM rel GROUP BY res_state") | |
import <- sqldf("SELECT school_state AS state, sum(total) as import FROM rel GROUP BY school_state") | |
temp <- merge(export, import) | |
temp$net <- temp$import - temp$export | |
#net migration | |
net.mig <- temp[order(temp$net, decreasing=F), ] | |
View(net.mig) | |
# las rotates labels (https://stat.ethz.ch/pipermail/r-help/2008-April/158652.html) | |
barplot(net.mig$net, horiz=T, names.arg=(net.mig$state), cex.names=.6, las=1, | |
xlim=c(-30000,30000), main="Freshmen Net Migration 2008") | |
#load the state "map" data using a ggplot2 function | |
state_df <- map_data("state") | |
state.outline <- map_data("state") | |
str(state_df) | |
#need to use match to find the index | |
#NOTE: can't just merge because will be out of order, need to use match | |
#adding abbrevations to the "map" dataset | |
#http://gist.github.com/233134 | |
#http://www.thisisthegreenroom.com/2009/choropleths-in-r/ | |
state_df$state <- state.abb[match(state_df$region, tolower(state.name))] | |
str(state_df) | |
#merge the data -- will lose DC....not part of state.name or state.abb | |
choro <- merge(state_df, temp ) | |
str(choro) | |
# use the colors we want for each state depending on the test value | |
choro$colind <- ifelse(choro$import > choro$export,"Import More","Export More") | |
# not sure why I need to order, but it works! | |
choro <-choro[order(choro$order), ] | |
# create map using ggplot2 | |
# need a second map layer to outline the states nicely | |
map.1 <- ggplot(choro, aes(long, lat, group=group)) + geom_polygon(aes(fill=colind)) | |
map.1 <- map.1 + geom_path(data = state.outline, colour = "white", size = .75) | |
map.1 | |
## create a map that is points and segments | |
# create a temp data frame that is the segments to be drawn (need two sets of lat/long) | |
lines <- merge(rel, latlong, by.x="res_state", by.y="state") | |
names(lines)[4] <- "res_lat" | |
names(lines)[5] <- "res_long" | |
lines <- merge(lines, latlong, by.x="school_state", by.y="state") | |
names(lines)[6] <- "schl_lat" | |
names(lines)[7] <- "schl_long" | |
# plot segments with 100 or more | |
lines <- lines[lines$total >= 100, ] | |
# create a basic flag if 300 or more | |
lines$flag <- ifelse(lines$total >=300, "red", "grey") | |
# basic spider map | |
map("state") | |
points(traits$longitude, traits$latitude) | |
segments(lines$res_long, lines$res_lat, lines$schl_long, lines$schl_lat) | |
# spider map with segments colored on flag | |
map("state") | |
points(traits$longitude, traits$latitude) | |
segments(lines$res_long, lines$res_lat, lines$schl_long, lines$schl_lat, col=lines$flag) | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# Explore the relationships | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# set the rel data frame to remove all connections with zero (from sqldf I presume) | |
rel <- rel[rel$total >0, ] | |
# histogram | |
ggplot(rel, aes(total)) + geom_histogram() | |
# summary stats | |
length(rel$total) | |
summary(rel$total) | |
quantile(rel$total, c(.05, .25, .5, .75, .95)) | |
str(rel) | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# Build the graph | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
g.1 <- graph.data.frame(rel, directed=TRUE, vertices=traits) | |
str(g.1) | |
summary(g.1) | |
plot(g.1, vertex.label=traits$state) | |
tkplot(g.1, vertex.label=traits$state, | |
canvas.width=800, | |
canvas.height=800) | |
V(g.1)$color="grey" | |
V(g.1)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red" | |
plot(g.1, vertex.label=traits$state, | |
layout=layout.kamada.kawai) | |
tkplot(g.1, vertex.label=traits$state) | |
tkplot(g.1, vertex.label=traits$state, | |
layout=layout.fruchterman.reingold) | |
# only keep 100 or more | |
rel.2 <- rel[rel[,3] >= 100, ] | |
str(rel.2) | |
# % of rows kept after filter | |
nrow(rel.2) / nrow(rel) | |
g.2 <- graph.data.frame(rel.2, directed=TRUE, vertices=traits) | |
V(g.2)$color="grey" | |
V(g.2)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red" | |
V(g.2)[name %in% c('NY', 'NJ', 'PA', 'MD', 'DC', 'DE')]$color <- "yellow" | |
# be careful - just save out as eps with generic plot name | |
tkplot(g.2, vertex.label=traits$state, | |
canvas.width=700, | |
canvas.height=700, | |
layout=layout.fruchterman.reingold) | |
# get the coordinates from first graph so we can always reproduce the graph | |
# with the nodes in the same plot | |
lay <- tkplot.getcoords(2) | |
# example to reproduce the plot | |
g.2$layout <- lay # sets the layout of the nodes | |
tkplot(g.2, vertex.label=traits$state, | |
canvas.width=700, | |
canvas.height=700) | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# Analysis | |
# http://www.drewconway.com/zia/wp-content/uploads/2009/08/sna_in_R.pdf | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
cent <- data.frame(bet=betweenness(g.2), eig=evcent(g.2)$vector, name=V(g.2)$name) | |
res <- lm(eig~bet, data=cent)$residuals | |
cent <- transform(cent, res=res) | |
# We use ggplot2 to make things a | |
# bit prettier | |
p<-ggplot(cent,aes(x=bet,y=eig, | |
label=name,colour=res))+ | |
xlab("Betweenness Centrality")+ | |
ylab("Eigenvector Centrality") | |
p <- p+geom_text()+opts(title="Key Actor Analysis for 2008 Migration Dataset") | |
p | |
# Key actor plot | |
nodes<-V(g.2)$name | |
nodes[which(abs(res) < .2 )] <- NA | |
# List the "key actors" by the defintion of .25 | |
key.actors <- nodes[!is.na(nodes)] | |
key.actors | |
write.csv(key.actors, "key actors.csv", row.names=F) | |
V(g.2)$size<-abs(res)*65 | |
#V(g.2)$name[which(abs(res) < .2 )] <- NA | |
V(g.2)$color="grey" | |
V(g.2)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red" | |
V(g.2)[name %in% c('NY', 'NJ', 'PA', 'MD', 'DC', 'DE')]$color <- "yellow" | |
tkplot(g.2,vertex.label=nodes, | |
canvas.width=700, | |
canvas.height=700) | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
# Cleanup the session | |
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ | |
rm(list=ls()) | |
q() | |
n |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment