Skip to content

Instantly share code, notes, and snippets.

@shreyaskarnik
Created July 31, 2011 00:49
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shreyaskarnik/1116203 to your computer and use it in GitHub Desktop.
Save shreyaskarnik/1116203 to your computer and use it in GitHub Desktop.
RCode courtesy @HarlanH (twitter)
#This is my own interpretation of the USA.gov PubSub feed, with some tips and code from HarlanH on Twitter.
#I am interested in finding out which agencies' links are shared from which parts of the US.
library(stringr)
library(plyr)
library(ggplot2)
library(scrapeR)
library(RJSONIO)
library(colorspace)
library(RColorBrewer)
library(maps)
data(us.cities)
###getting the data
#cbgColourPalette <- scale_colour_manual(values=c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7","#CC24A7","#C679A7"))
#
# Keep strings as plain character vectors in every data frame built below.
options(stringsAsFactors = FALSE)

# Fetch the HTML directory listing of the archive.
# NOTE(review): "?C=M;O=D" presumably asks the server to sort by modification
# time, newest first - confirm against the server's index options.
index <- getURL('http://bitly.measuredvoice.com/bitly_archive/?C=M;O=D')

# Pull every link target out of the listing and keep the ones whose name
# contains "bitly".
files <- local({
  href_attrs <- str_extract_all(index, 'href="(.+?)"')[[1]]
  # Drop the leading `href="` (6 characters), then the first remaining quote.
  link_targets <- str_sub(href_attrs, start = 7)
  link_targets <- str_replace(link_targets, '"', '')
  link_targets[str_detect(link_targets, 'bitly')]
})
# Return `b` when `a` is present, or NA when `a` is NULL.
# `b` is only evaluated when `a` is not NULL (R arguments are lazy), so callers
# may pass an expression such as a[[1]] that would fail on a NULL `a`.
naifnull <- function(a, b) {
  if (!is.null(a)) {
    return(b)
  }
  NA
}
# Download the first `n.files` archive files from the listing and flatten
# their click records into a single data frame (one row per click).
n.files <- 10
n.top <- 10

# Convert one line of an archive file into a one-row data frame.
# Returns NULL for lines that are not JSON objects, that cannot be parsed, or
# that hold at most one field, so ldply() simply skips them.
parse_click <- function(jj) {
  if (is.na(jj) || str_sub(jj, 1, 1) != "{") {
    return(NULL)
  }
  # A single malformed line should not abort the whole download: warn and
  # skip it. as.list() also covers fromJSON() returning an atomic vector.
  ll <- tryCatch(
    as.list(fromJSON(jj)),
    error = function(e) {
      warning("Skipping unparsable record: ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
  if (length(ll) <= 1) {
    return(NULL)
  }

  # Exact-name lookup: `$` partially matches list names, so ll$c would
  # silently return the city ("cy") whenever the country ("c") is absent.
  # Missing fields become NA so every row carries the same columns.
  field <- function(name) naifnull(ll[[name]], ll[[name]])

  # "ll" holds the coordinates; the first element is used as lat and the
  # second as lon. Guard against it being missing or too short.
  coords <- ll[["ll"]]
  has_coords <- length(coords) >= 2

  data.frame(
    known_user = field("nk"),
    country = field("c"),
    geo_city_name = field("cy"),
    lat = if (has_coords) coords[[1]] else NA,
    lon = if (has_coords) coords[[2]] else NA,
    # as.numeric() turns a missing timestamp into NA_real_, which converts
    # to an NA POSIXct instead of raising an error.
    timestamp = as.POSIXct(as.numeric(field("t")),
                           origin = "1970-01-01", tz = "GMT"),
    hash_timestamp = as.POSIXct(as.numeric(field("hc")),
                                origin = "1970-01-01", tz = "GMT"),
    long_url = field("u"),
    referring_url = field("r"),
    stringsAsFactors = FALSE
  )
}

# head() never pads with NA when fewer than n.files files are listed, unlike
# files[1:n.files]. The original shuffled these same files with sample(),
# which only changed the row order of the result.
recent_files <- head(files, n.files)

dat.samp <- ldply(recent_files, function(ff) {
  archive_url <- paste0("http://bitly.measuredvoice.com/bitly_archive/", ff)
  dat.txt <- str_split(getURL(archive_url), "\n")[[1]]
  ldply(dat.txt, parse_click)
}, .progress = "text")
# Keep only clicks recorded with country "US". which() drops rows whose
# country is NA instead of turning them into all-NA rows.
idx_us <- which(dat.samp$country == "US")
dat.samp <- dat.samp[idx_us, ]

# Agency = the alphabetic label directly before a literal ".gov" in the long
# URL (e.g. "nasa.gov"). The dot is escaped; unescaped it matches any
# character, so text such as "e-gov" was picked up as an agency.
dat.samp$agency <- str_extract(dat.samp$long_url, "[[:alpha:]]+\\.gov")

# Rows lacking an agency or a city name are excluded from the clean set.
na_agency_index <- which(is.na(dat.samp$agency))
na_city_index <- which(is.na(dat.samp$geo_city_name))
na_full <- union(na_agency_index, na_city_index)
# Select the rows to keep rather than negating the rows to drop:
# dat.samp[-na_full, ] returns zero rows when na_full is empty.
keep_rows <- setdiff(seq_len(nrow(dat.samp)), na_full)
dat.samp_clean <- dat.samp[keep_rows, ]

# The n.top most frequent agencies, counted over all US rows (including the
# rows without a city), and the clean rows belonging to them.
common.agencies <- names(
  head(sort(table(dat.samp$agency), decreasing = TRUE), n.top)
)
dat.common.agency <- dat.samp_clean[
  dat.samp_clean$agency %in% common.agencies,
]
# Same list under its second name; previously recomputed from scratch.
top_n_agencies <- common.agencies
#### Map of the top agencies by click location
# Alternative hand-picked palette:
#colours<-c("#2f4c3d","#d741bb","#0c96c8","#a982ff","#585bed","#7b135e","#8d0a30","#d38205","#d1003d","#ac132e")
colours <- brewer.pal(n.top, "Paired")
cbgColourPalette <- scale_color_manual(values = colours)

# Black-and-white base theme with a gray panel. theme()/element_rect() replace
# the removed theme_rect() and direct assignment into the theme object.
th <- theme_bw() +
  theme(panel.background = element_rect(fill = "gray", colour = NA))
theme_set(th)

# Plot the click data itself and map columns by name. The original passed
# dat.common.agency$... vectors to aes() while using us.cities as the plot
# data, which mismatches lengths.
g <- ggplot(data = dat.common.agency)
g <- g +
  geom_point(aes(x = lon, y = lat, colour = agency), size = 3) +
  borders("state", size = 0.5)
# Restrict the view to the given longitude/latitude window and hide the axis
# breaks; breaks = NULL is the supported spelling (NA is rejected).
g <- g + scale_x_continuous(limits = c(-125, -66), breaks = NULL)
g <- g + scale_y_continuous(limits = c(25, 50), breaks = NULL)
g <- g + cbgColourPalette
g <- g + labs(x = NULL, y = NULL)
# Title follows n.top instead of a hard-coded "10".
g <- g + ggtitle(paste("Top", n.top, "Agencies by Location"))
# `size` is kept for line widths so older ggplot2 releases still work; newer
# releases prefer `linewidth` and may emit a deprecation warning.
g <- g + theme(
  plot.title = element_text(colour = "black", size = 12, hjust = 0.5,
                            vjust = 0.5, face = "bold"),
  legend.key = element_rect(colour = "gray", fill = "black", size = 0.1)
)
print(g)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment