public
Created

Data Science DC Titles Visualization

  • Download Gist
DSDC-Titles.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
# Data Science DC Titles Visualization
 
# How this works: a main loop calls a parameterized visualization function
# every N seconds. Each call downloads the source spreadsheet afresh and
# generates one visual from it.
 
# aspects of this code borrowed from Drew Conway:
# https://raw.github.com/drewconway/ZIA/master/R/better_word_cloud/better_word_cloud.R
 
library(plyr)
library(ggplot2)
library(tm)
 
# Keep strings as character vectors (not factors) in the data frames built below.
options(stringsAsFactors=FALSE)
 
# Seconds the main loop sleeps between successive plots.
loop.time <- 15
 
# CSV export of the source spreadsheet; plot.function downloads it on every call.
source.data.url <- 'https://docs.google.com/spreadsheet/pub?hl=en_US&hl=en_US&key=0AnaXKp9bt6OXdEhYWmFocmgwU1RBa01qX0ttZ0JZaVE&single=true&gid=0&output=csv'
 
# Vertical offsets for `spaces` labels that share one x position.
#
# Args:
#   spaces: number of labels to place (a single number).
#
# Returns a numeric vector of `spaces` offsets symmetric about 0:
#   - 0 when `spaces` is 1 or less;
#   - odd counts step by 1/spaces and include 0;
#   - even counts step by 2/spaces and straddle 0.
optimal.spacing <- function(spaces) {
  # Nothing to spread out: a lone label sits on the centre line.
  if (!(spaces > 1)) {
    return(0)
  }

  step <- 1 / spaces
  is.odd <- spaces %% 2 > 0

  if (is.odd) {
    half.width <- step * floor(spaces / 2)
    seq(-half.width, half.width, step)
  } else {
    half.width <- step * (spaces - 1)
    seq(-half.width, half.width, step * 2)
  }
}
 
# Draw a comparative word cloud of the words used in respondents' job titles.
#
# The spreadsheet at `source.data.url` is downloaded on every call. Respondents
# are split into those whose `column` answer matches `col.value` and those whose
# answer does not; each title word is placed on the x axis by how much more
# (or less) often it occurs in the matching group, and sized by total frequency.
#
# Args:
#   column:    name of the survey column to split on; one of the names assigned
#              to `dat` below (e.g. 'Sector').
#   col.value: pattern handed to grepl() and matched against that column.
#   title:     plot title.
#
# Side effects: downloads the CSV with curl and prints the plot to the current
# graphics device. Returns the value of print(wc), or invisible NULL (with a
# warning) when the titles contain no words to plot.
plot.function <- function(column, col.value, title) {
  temporaryFile <- tempfile()
  # This function is called in an endless loop; remove the download on exit so
  # temp files do not accumulate.
  on.exit(unlink(temporaryFile), add=TRUE)
  download.file(url=source.data.url, destfile=temporaryFile, method="curl")
  dat <- read.csv(temporaryFile)
  names(dat) <- c('Timestamp', 'Title', 'DataScientist', 'Sector', 'Education', 'Training')

  # Term-document matrix: one row per word, one column per respondent's title.
  # VectorSource treats each title as one document and does not depend on the
  # data-frame layout expected by DataframeSource.
  titles.corpus <- Corpus(VectorSource(as.character(dat$Title)))
  titles.matrix <- TermDocumentMatrix(titles.corpus, control=list(stopwords=stopwords(), removeNumbers=TRUE, removePunctuation=TRUE))
  # as.matrix() converts directly; inspect() is a display helper that prints
  # the whole matrix on every call.
  term.counts <- as.matrix(titles.matrix)

  if (nrow(term.counts) == 0) {
    warning("No title words available to plot for '", title, "'", call.=FALSE)
    return(invisible(NULL))
  }

  yes.cols <- grepl(col.value, dat[[column]])
  # drop=FALSE keeps a matrix even when exactly one (or no) respondent falls in
  # a group; without it the subset collapses to a vector and rowSums() fails.
  words.yes <- rowSums(term.counts[, yes.cols, drop=FALSE])
  words.no <- rowSums(term.counts[, !yes.cols, drop=FALSE])
  words.diff <- data.frame(words=names(words.yes), freq=words.yes+words.no, count.diff=words.yes-words.no)

  # Words sharing a count.diff would overlap at the same x position, so spread
  # each group vertically. Spacing is computed per group from its own size
  # rather than looked up in an sapply() result, which turns into a matrix
  # (and breaks the by-name lookup) when all groups happen to be equally large.
  words.df <- ddply(words.diff, .(count.diff), function(cw) {
    cbind(cw, ypos=optimal.spacing(nrow(cw)))
  })

  # Keep the axis at least +/- 0.1 wide so the 'Less'/'Same'/'More' breaks stay
  # distinct even when every word has a difference of 0.
  min.count <- min(-.1, words.df$count.diff)
  max.count <- max(.1, words.df$count.diff)

  # NOTE(review): scale_size(to=), legend=FALSE, opts() and theme_blank() are
  # legacy ggplot2 spellings; a current ggplot2 would need range=, guide=,
  # theme() and element_blank() instead -- confirm the installed version
  # before porting.
  wc <- ggplot(words.df, aes(count.diff, ypos, label=words, size=freq, colour=count.diff)) +
    geom_text() +
    scale_size(to=c(3,11), name='Word Frequency') +
    scale_colour_gradient2(low='darkred', mid='black', high='darkblue', midpoint=0, legend=FALSE) +
    scale_x_continuous('', breaks=c(min.count, 0, max.count),
                       labels=c('Less', 'Same', 'More')) +
    scale_y_continuous('', breaks=c(0), labels='') +
    coord_cartesian(xlim=c(min.count*1.2, max.count*1.2)) +
    theme_bw() +
    opts(panel.grid.major=theme_blank(), panel.grid.minor=theme_blank(),
         title=title)

  print(wc)
}
 
# Table of plots to cycle through, one row per plot. The three columns are the
# arguments of plot.function: the survey column to split on, the value to match
# in that column, and the plot title.
plots <- local({
  specs <- rbind(
    c("DataScientist", "Yes",              "Data Scientist = Yes"),
    c("Sector",        "Private",          "Private Sector"),
    c("Sector",        "Public",           "Public Sector"),
    c("Sector",        "Academic",         "Academia"),
    c("Education",     "Masters",          "Masters Degree"),
    c("Education",     "Doctoral",         "PhD"),
    c("Training",      "Statistics",       "Statistics Training"),
    c("Training",      "Machine Learning", "ML Training"),
    c("Training",      "Sciences",         "Science Training"),
    c("Training",      "Business",         "Business Training")
  )
  data.frame(column=specs[, 1], col.value=specs[, 2], title=specs[, 3])
})
 
# Main loop: show each row of `plots` in turn, forever, pausing `loop.time`
# seconds between plots and wrapping back to the first row after the last.
row <- 1
repeat {
  # Every plot re-downloads the spreadsheet, so a transient failure is likely
  # sooner or later; report it and carry on instead of ending the display.
  tryCatch(
    do.call(plot.function, as.list(plots[row, ])),
    error=function(e) {
      message("Skipping plot '", plots$title[row], "': ", conditionMessage(e))
    }
  )
  Sys.sleep(loop.time)
  # Advance to the next row, wrapping from the last row back to 1.
  row <- row %% nrow(plots) + 1
}

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.