# HarlanH/DSDC-Titles.R

## DSDC-Titles.R
# Data Science DC Titles Visualization

# Here's how this will work. In a main loop, a parameterized visualization function
# is called every N seconds. Each function gets the source spreadsheet fresh, and
# generates a visual.

# aspects of this code borrowed from Drew Conway:
# https://raw.github.com/drewconway/ZIA/master/R/better_word_cloud/better_word_cloud.R

library(plyr)
library(ggplot2)
library(tm)

# Keep strings as character vectors rather than factors.
options(stringsAsFactors = FALSE)

# Seconds to pause between successive plots in the main loop.
loop.time <- 15

# Published CSV export of the Google spreadsheet that holds the responses.
source.data.url <- 'https://docs.google.com/spreadsheet/pub?hl=en_US&hl=en_US&key=0AnaXKp9bt6OXdEhYWmFocmgwU1RBa01qX0ttZ0JZaVE&single=true&gid=0&output=csv'

# Vertical offsets, symmetric about zero, for `spaces` labels that share the
# same x position.
#
# Args:
#   spaces: number of labels to spread out.
#
# Returns:
#   0 when `spaces` is not greater than 1; otherwise a numeric vector of
#   offsets centred on 0 (an odd count includes 0 itself, an even count
#   straddles it).
optimal.spacing <- function(spaces) {
  # Nothing to spread out: a lone label sits on the centre line.
  if (!(spaces > 1)) {
    return(0)
  }

  step <- 1 / spaces
  odd.count <- spaces %% 2 > 0

  if (odd.count) {
    # Odd: single steps outwards from 0 on both sides.
    reach <- step * floor(spaces / 2)
    seq(-reach, reach, step)
  } else {
    # Even: double steps so that the offsets skip over 0.
    reach <- step * (spaces - 1)
    seq(-reach, reach, step * 2)
  }
}

# Download the response spreadsheet and draw a comparative word cloud of the
# job titles.
#
# Each word from the 'Title' column is placed on the x axis by the difference
# between its count in the responses whose `column` value matches `col.value`
# and its count in all other responses. Text size reflects the word's total
# count.
#
# Args:
#   column:    name of the response column used to split the responses.
#   col.value: pattern handed to grepl() to pick the responses of interest.
#   title:     plot title.
#
# Side effects:
#   Downloads source.data.url into a temporary file (removed on exit) and
#   prints the plot to the active graphics device.
plot.function <- function(column, col.value, title) {
  temporaryFile <- tempfile()
  # This function is called over and over; remove the download on exit (even
  # after an error) so temporary files do not pile up.
  on.exit(unlink(temporaryFile), add = TRUE)
  download.file(url=source.data.url, destfile=temporaryFile, method="curl")
  dat <- read.csv(temporaryFile)

  names(dat) <- c('Timestamp', 'Title', 'DataScientist', 'Sector', 'Education', 'Training')

  # make a term-document matrix (rows are words, columns are responses)
  # NOTE(review): newer tm releases may expect different DataframeSource input
  # (doc_id/text columns) -- confirm against the installed tm version.
  titles.corpus <- Corpus(DataframeSource(subset(dat, select=c('Title'))))
  titles.matrix <- TermDocumentMatrix(titles.corpus, control=list(stopwords=stopwords(), removeNumbers=TRUE, removePunctuation=TRUE))
  # as.matrix() yields the dense counts; inspect() is a display function and
  # would also dump the matrix to the console on every refresh.
  titles.counts <- as.matrix(titles.matrix)

  yes.cols <- grepl(col.value, dat[[column]])

  # drop=FALSE keeps a matrix even when a group holds a single response;
  # otherwise the subset collapses to a vector and rowSums() fails.
  words.yes <- rowSums(titles.counts[, yes.cols, drop=FALSE])
  words.no <- rowSums(titles.counts[, !yes.cols, drop=FALSE])

  words.diff <- data.frame(words=names(words.yes), freq=words.yes+words.no, count.diff=words.yes-words.no)

  # One vector of y offsets per distinct count.diff value. lapply() always
  # returns a named list; sapply() would simplify to a matrix whenever every
  # group has the same size, which breaks the lookup by name below.
  spacing <- lapply(table(words.diff$count.diff), optimal.spacing)

  words.df <- ddply(words.diff, .(count.diff), function(cw) {
    cbind(cw, ypos=spacing[[as.character(cw$count.diff[[1]])]])
  })

  # Keep the axis limits away from zero so the x range never collapses.
  min.count <- min(-.1, words.df$count.diff)
  max.count <- max(.1, words.df$count.diff)

  # NOTE(review): opts(), theme_blank(), scale_size(to=) and legend=FALSE
  # belong to an older ggplot2 API -- confirm they exist in the installed
  # ggplot2 version.
  wc <- ggplot(words.df, aes(count.diff, ypos, label=words, size=freq, colour=count.diff)) +
    geom_text() +
    scale_size(to=c(3,11), name='Word Frequency') +
    scale_colour_gradient2(low='darkred', mid='black', high='darkblue', midpoint=0, legend=FALSE) +
    scale_x_continuous('', breaks=c(min.count, 0, max.count),
                       labels=c('Less', 'Same', 'More')) +
    scale_y_continuous('', breaks=c(0), labels='') +
    coord_cartesian(xlim=c(min.count*1.2, max.count*1.2)) +
    theme_bw() +
    opts(panel.grid.major=theme_blank(),panel.grid.minor=theme_blank(),
         title=title)

  print(wc)
}

# The plot rotation, one row per plot: the response column to split on, the
# pattern that selects the group of interest, and the title to display.
plots <- data.frame(
  column = c(
    "DataScientist",
    "Sector", "Sector", "Sector",
    "Education", "Education",
    "Training", "Training", "Training", "Training"
  ),
  col.value = c(
    "Yes",
    "Private", "Public", "Academic",
    "Masters", "Doctoral",
    "Statistics", "Machine Learning", "Sciences", "Business"
  ),
  title = c(
    "Data Scientist = Yes",
    "Private Sector", "Public Sector", "Academia",
    "Masters Degree", "PhD",
    "Statistics Training", "ML Training", "Science Training", "Business Training"
  )
)

# Main display loop: cycle through the rows of `plots` forever, drawing one
# plot and then pausing for `loop.time` seconds.
row <- 1
repeat {
  # Every refresh downloads the spreadsheet again. Report a failed refresh and
  # move on, so one bad cycle does not end the whole display.
  tryCatch(
    do.call(plot.function, as.list(plots[row, ])),
    error = function(e) {
      message("Skipping plot '", plots$title[row], "': ", conditionMessage(e))
    }
  )

  Sys.sleep(loop.time)

  # Advance to the next plot, wrapping back to the first after the last.
  row <- row %% nrow(plots) + 1
}
	# Data Science DC Titles Visualization

	# Here's how this will work. In a main loop, a parameterized visualization function
	# is called every N seconds. Each function gets the source spreadsheet fresh, and
	# generates a visual.

	# aspects of this code borrowed from Drew Conway:
	# https://raw.github.com/drewconway/ZIA/master/R/better_word_cloud/better_word_cloud.R

	library(plyr)
	library(ggplot2)
	library(tm)

	# Keep strings as character vectors rather than factors.
	options(stringsAsFactors = FALSE)

	# Seconds to pause between successive plots in the main loop.
	loop.time <- 15

	# Published CSV export of the Google spreadsheet that holds the responses.
	source.data.url <- 'https://docs.google.com/spreadsheet/pub?hl=en_US&hl=en_US&key=0AnaXKp9bt6OXdEhYWmFocmgwU1RBa01qX0ttZ0JZaVE&single=true&gid=0&output=csv'

	# Vertical offsets, symmetric about zero, for `spaces` labels that share the
	# same x position.
	#
	# Args:
	#   spaces: number of labels to spread out.
	#
	# Returns:
	#   0 when `spaces` is not greater than 1; otherwise a numeric vector of
	#   offsets centred on 0 (an odd count includes 0 itself, an even count
	#   straddles it).
	optimal.spacing <- function(spaces) {
	  # Nothing to spread out: a lone label sits on the centre line.
	  if (!(spaces > 1)) {
	    return(0)
	  }

	  step <- 1 / spaces
	  odd.count <- spaces %% 2 > 0

	  if (odd.count) {
	    # Odd: single steps outwards from 0 on both sides.
	    reach <- step * floor(spaces / 2)
	    seq(-reach, reach, step)
	  } else {
	    # Even: double steps so that the offsets skip over 0.
	    reach <- step * (spaces - 1)
	    seq(-reach, reach, step * 2)
	  }
	}

	# Download the response spreadsheet and draw a comparative word cloud of the
	# job titles.
	#
	# Each word from the 'Title' column is placed on the x axis by the difference
	# between its count in the responses whose `column` value matches `col.value`
	# and its count in all other responses. Text size reflects the word's total
	# count.
	#
	# Args:
	#   column:    name of the response column used to split the responses.
	#   col.value: pattern handed to grepl() to pick the responses of interest.
	#   title:     plot title.
	#
	# Side effects:
	#   Downloads source.data.url into a temporary file (removed on exit) and
	#   prints the plot to the active graphics device.
	plot.function <- function(column, col.value, title) {
	  temporaryFile <- tempfile()
	  # This function is called over and over; remove the download on exit (even
	  # after an error) so temporary files do not pile up.
	  on.exit(unlink(temporaryFile), add = TRUE)
	  download.file(url=source.data.url, destfile=temporaryFile, method="curl")
	  dat <- read.csv(temporaryFile)

	  names(dat) <- c('Timestamp', 'Title', 'DataScientist', 'Sector', 'Education', 'Training')

	  # make a term-document matrix (rows are words, columns are responses)
	  # NOTE(review): newer tm releases may expect different DataframeSource input
	  # (doc_id/text columns) -- confirm against the installed tm version.
	  titles.corpus <- Corpus(DataframeSource(subset(dat, select=c('Title'))))
	  titles.matrix <- TermDocumentMatrix(titles.corpus, control=list(stopwords=stopwords(), removeNumbers=TRUE, removePunctuation=TRUE))
	  # as.matrix() yields the dense counts; inspect() is a display function and
	  # would also dump the matrix to the console on every refresh.
	  titles.counts <- as.matrix(titles.matrix)

	  yes.cols <- grepl(col.value, dat[[column]])

	  # drop=FALSE keeps a matrix even when a group holds a single response;
	  # otherwise the subset collapses to a vector and rowSums() fails.
	  words.yes <- rowSums(titles.counts[, yes.cols, drop=FALSE])
	  words.no <- rowSums(titles.counts[, !yes.cols, drop=FALSE])

	  words.diff <- data.frame(words=names(words.yes), freq=words.yes+words.no, count.diff=words.yes-words.no)

	  # One vector of y offsets per distinct count.diff value. lapply() always
	  # returns a named list; sapply() would simplify to a matrix whenever every
	  # group has the same size, which breaks the lookup by name below.
	  spacing <- lapply(table(words.diff$count.diff), optimal.spacing)

	  words.df <- ddply(words.diff, .(count.diff), function(cw) {
	    cbind(cw, ypos=spacing[[as.character(cw$count.diff[[1]])]])
	  })

	  # Keep the axis limits away from zero so the x range never collapses.
	  min.count <- min(-.1, words.df$count.diff)
	  max.count <- max(.1, words.df$count.diff)

	  # NOTE(review): opts(), theme_blank(), scale_size(to=) and legend=FALSE
	  # belong to an older ggplot2 API -- confirm they exist in the installed
	  # ggplot2 version.
	  wc <- ggplot(words.df, aes(count.diff, ypos, label=words, size=freq, colour=count.diff)) +
	    geom_text() +
	    scale_size(to=c(3,11), name='Word Frequency') +
	    scale_colour_gradient2(low='darkred', mid='black', high='darkblue', midpoint=0, legend=FALSE) +
	    scale_x_continuous('', breaks=c(min.count, 0, max.count),
	                       labels=c('Less', 'Same', 'More')) +
	    scale_y_continuous('', breaks=c(0), labels='') +
	    # Pad the visible x range by 20% on each side (the multiplication signs
	    # were missing here, leaving undefined names).
	    coord_cartesian(xlim=c(min.count*1.2, max.count*1.2)) +
	    theme_bw() +
	    opts(panel.grid.major=theme_blank(),panel.grid.minor=theme_blank(),
	         title=title)

	  print(wc)
	}

	# The plot rotation, one row per plot: the response column to split on, the
	# pattern that selects the group of interest, and the title to display.
	plots <- data.frame(
	  column = c(
	    "DataScientist",
	    "Sector", "Sector", "Sector",
	    "Education", "Education",
	    "Training", "Training", "Training", "Training"
	  ),
	  col.value = c(
	    "Yes",
	    "Private", "Public", "Academic",
	    "Masters", "Doctoral",
	    "Statistics", "Machine Learning", "Sciences", "Business"
	  ),
	  title = c(
	    "Data Scientist = Yes",
	    "Private Sector", "Public Sector", "Academia",
	    "Masters Degree", "PhD",
	    "Statistics Training", "ML Training", "Science Training", "Business Training"
	  )
	)

	# Main display loop: cycle through the rows of `plots` forever, drawing one
	# plot and then pausing for `loop.time` seconds.
	row <- 1
	repeat {
	  # Every refresh downloads the spreadsheet again. Report a failed refresh and
	  # move on, so one bad cycle does not end the whole display.
	  tryCatch(
	    do.call(plot.function, as.list(plots[row, ])),
	    error = function(e) {
	      message("Skipping plot '", plots$title[row], "': ", conditionMessage(e))
	    }
	  )

	  Sys.sleep(loop.time)

	  # Advance to the next plot, wrapping back to the first after the last.
	  row <- row %% nrow(plots) + 1
	}