Skip to content

Instantly share code, notes, and snippets.

@Btibert3
Created November 22, 2010 20:19
Show Gist options
  • Save Btibert3/710595 to your computer and use it in GitHub Desktop.
Save Btibert3/710595 to your computer and use it in GitHub Desktop.
#######################################################################
# Create social network from NCES 2008 migration data
# Helpful resources:
# R in a Nutshell -- Adler
# R for SAS and SPSS Users
# statmethods.net -- Quick R
# igraph.sourceforge.net
#######################################################################
# Load libraries
# if they aren't installed, simply use install.packages("igraph") for whatever package
library(igraph)
library(ggplot2)
library(gdata)
library(sqldf)
library(maps)
# Set the working directory for the data
# Notice the forward slash on windows
setwd("C:/Documents and Settings/BTIBERT/Desktop/Bentley Ideas/NEAIR/Presentation")
# Read in the datasets -- assumes they are in your working directory
mig.temp08 <- read.csv("ef2008c.csv", header=TRUE)
dir.temp08 <- read.csv("hd2008.csv", header=TRUE)
load("LatLong.Rdata") # dataset name = latlong
# use only the variables we want
# mig = unitid, efcstate (same as fips), HS <12 months enrollment
# dir = unitid, name, stateabbrev, fips, region, sector, degree granting, NCES tables, Carnegie, landgrant
mig <- mig.temp08[, c(1,2,7)]
dir <- dir.temp08[, c(1:2, 5, 7, 8, 19, 27, 41, 45, 53 )]
# select only continental US states incl. DC
mig <- mig[mig$efcstate %in% c(1, 4:6, 8:13, 16:42, 44:51, 53:56), ]
dir <- dir[dir$fips %in% c(1, 4:6, 8:13, 16:42, 44:51, 53:56), ]
latlong <- latlong[latlong$state != "AS", ]
latlong <- latlong[latlong$state != "MP", ]
latlong <- latlong[latlong$state != "PR", ]
latlong <- latlong[latlong$state != "VI", ]
latlong <- latlong[latlong$state != "HI", ]
latlong <- latlong[latlong$state != "AK", ]
# drop unused factor levels with drop.levels function in package gdata
mig <- drop.levels(mig, reorder=FALSE)
dir <- drop.levels(dir, reorder=FALSE)
latlong <- drop.levels(latlong, reorder=FALSE)
# create a lookup table to hold fips and state abbreviations
# use this to recode efcstate to labels
# uses sqldf package to perform SQL-like queries on the data sets
lookup <- sqldf("SELECT fips, stabbr FROM dir GROUP BY fips, stabbr")
names(lookup)[2] <- "res_state"
lookup <- drop.levels(lookup, reorder=FALSE)
# put all data together
# mig = 52200 observations
ds <- merge(mig, dir)
ds <- merge(ds, lookup, by.x="efcstate", by.y="fips")
# filter on Public 4-year and Private Non-Profit 4-year
ds <- ds[ds$sector %in% c(1,2), ]
# filter out NCES stat tables
ds <- ds[ds$pset4flg==1, ]
# filter out degree granting
ds <- ds[ds$deggrant ==1, ]
# filter out students who stay in-state
ds <- ds[ds$efcstate != ds$fips, ]
# filter out rows that region = 0,9
ds <- ds[ds$obereg %in% c(1:8), ]
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Build the data frames needed to build the network graph
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# create the edges (relationships) data frame
rel <- sqldf("SELECT res_state, stabbr, sum(efres02) FROM ds GROUP BY res_state, stabbr")
names(rel)[2] <- "school_state"
str(rel)
summary(rel)
# create the vertices (states) traits data frame
traits <- sqldf("SELECT stabbr, obereg FROM ds GROUP BY stabbr, obereg")
traits$obereg <- as.factor(as.character(traits$obereg))
traits <- merge(latlong, traits, by.x="state", by.y="stabbr")
traits <- drop.levels(traits, reorder=FALSE)
levels(traits$obereg) <- c("New England", "Mid-East", "Great Lakes", "Plains", "Southeast",
"Southwest", "Rocky Mtns", "Far West")
str(traits)
# save out the data frames to datasets for later use
# this will create an R data file in your working directory
save(rel, traits, file="Network.Rdata")
# load the file (if you started a new session)
# Uncomment the code if you want to re-load the data from a previous session
#load("Network.Rdata")
#ls()
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Create basic choropleth map with red/blue if import/export more students
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
names(rel)[3] <- "total"
export <- sqldf("SELECT res_state AS state, sum(total) as export FROM rel GROUP BY res_state")
import <- sqldf("SELECT school_state AS state, sum(total) as import FROM rel GROUP BY school_state")
temp <- merge(export, import)
temp$net <- temp$import - temp$export
#net migration
net.mig <- temp[order(temp$net, decreasing=F), ]
View(net.mig)
# las rotates labels (https://stat.ethz.ch/pipermail/r-help/2008-April/158652.html)
barplot(net.mig$net, horiz=T, names.arg=(net.mig$state), cex.names=.6, las=1,
xlim=c(-30000,30000), main="Freshmen Net Migration 2008")
#load the state "map" data using a ggplot2 function
state_df <- map_data("state")
state.outline <- map_data("state")
str(state_df)
#need to use match to find the index
#NOTE: can't just merge because will be out of order, need to use match
#adding abbrevations to the "map" dataset
#http://gist.github.com/233134
#http://www.thisisthegreenroom.com/2009/choropleths-in-r/
state_df$state <- state.abb[match(state_df$region, tolower(state.name))]
str(state_df)
#merge the data -- will lose DC....not part of state.name or state.abb
choro <- merge(state_df, temp )
str(choro)
# use the colors we want for each state depending on the test value
choro$colind <- ifelse(choro$import > choro$export,"Import More","Export More")
# not sure why I need to order, but it works!
choro <-choro[order(choro$order), ]
# create map using ggplot2
# need a second map layer to outline the states nicely
map.1 <- ggplot(choro, aes(long, lat, group=group)) + geom_polygon(aes(fill=colind))
map.1 <- map.1 + geom_path(data = state.outline, colour = "white", size = .75)
map.1
## create a map that is points and segments
# create a temp data frame that is the segments to be drawn (need two sets of lat/long)
lines <- merge(rel, latlong, by.x="res_state", by.y="state")
names(lines)[4] <- "res_lat"
names(lines)[5] <- "res_long"
lines <- merge(lines, latlong, by.x="school_state", by.y="state")
names(lines)[6] <- "schl_lat"
names(lines)[7] <- "schl_long"
# plot segments with 100 or more
lines <- lines[lines$total >= 100, ]
# create a basic flag if 300 or more
lines$flag <- ifelse(lines$total >=300, "red", "grey")
# basic spider map
map("state")
points(traits$longitude, traits$latitude)
segments(lines$res_long, lines$res_lat, lines$schl_long, lines$schl_lat)
# spider map with segments colored on flag
map("state")
points(traits$longitude, traits$latitude)
segments(lines$res_long, lines$res_lat, lines$schl_long, lines$schl_lat, col=lines$flag)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Explore the relationships
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# set the rel data frame to remove all connections with zero (from sqldf I presume)
rel <- rel[rel$total >0, ]
# histogram
ggplot(rel, aes(total)) + geom_histogram()
# summary stats
length(rel$total)
summary(rel$total)
quantile(rel$total, c(.05, .25, .5, .75, .95))
str(rel)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Build the graph
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
g.1 <- graph.data.frame(rel, directed=TRUE, vertices=traits)
str(g.1)
summary(g.1)
plot(g.1, vertex.label=traits$state)
tkplot(g.1, vertex.label=traits$state,
canvas.width=800,
canvas.height=800)
V(g.1)$color="grey"
V(g.1)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
plot(g.1, vertex.label=traits$state,
layout=layout.kamada.kawai)
tkplot(g.1, vertex.label=traits$state)
tkplot(g.1, vertex.label=traits$state,
layout=layout.fruchterman.reingold)
# only keep 100 or more
rel.2 <- rel[rel[,3] >= 100, ]
str(rel.2)
# % of rows kept after filter
nrow(rel.2) / nrow(rel)
g.2 <- graph.data.frame(rel.2, directed=TRUE, vertices=traits)
V(g.2)$color="grey"
V(g.2)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
V(g.2)[name %in% c('NY', 'NJ', 'PA', 'MD', 'DC', 'DE')]$color <- "yellow"
# be careful - just save out as eps with generic plot name
tkplot(g.2, vertex.label=traits$state,
canvas.width=700,
canvas.height=700,
layout=layout.fruchterman.reingold)
# get the coordinates from first graph so we can always reproduce the graph
# with the nodes in the same plot
lay <- tkplot.getcoords(2)
# example to reproduce the plot
g.2$layout <- lay # sets the layout of the nodes
tkplot(g.2, vertex.label=traits$state,
canvas.width=700,
canvas.height=700)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Analysis
# http://www.drewconway.com/zia/wp-content/uploads/2009/08/sna_in_R.pdf
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
cent <- data.frame(bet=betweenness(g.2), eig=evcent(g.2)$vector, name=V(g.2)$name)
res <- lm(eig~bet, data=cent)$residuals
cent <- transform(cent, res=res)
# We use ggplot2 to make things a
# bit prettier
p<-ggplot(cent,aes(x=bet,y=eig,
label=name,colour=res))+
xlab("Betweenness Centrality")+
ylab("Eigenvector Centrality")
p <- p+geom_text()+opts(title="Key Actor Analysis for 2008 Migration Dataset")
p
# Key actor plot
nodes<-V(g.2)$name
nodes[which(abs(res) < .2 )] <- NA
# List the "key actors" by the defintion of .25
key.actors <- nodes[!is.na(nodes)]
key.actors
write.csv(key.actors, "key actors.csv", row.names=F)
V(g.2)$size<-abs(res)*65
#V(g.2)$name[which(abs(res) < .2 )] <- NA
V(g.2)$color="grey"
V(g.2)[name %in% c('MA', 'CT', 'RI', 'VT', 'ME', 'NH')]$color <- "red"
V(g.2)[name %in% c('NY', 'NJ', 'PA', 'MD', 'DC', 'DE')]$color <- "yellow"
tkplot(g.2,vertex.label=nodes,
canvas.width=700,
canvas.height=700)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Cleanup the session
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
rm(list=ls())
q()
n
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment