Skip to content

Instantly share code, notes, and snippets.

@kiyoto
Created March 10, 2015 05:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kiyoto/93bb63a19c50036aa308 to your computer and use it in GitHub Desktop.
Save kiyoto/93bb63a19c50036aa308 to your computer and use it in GitHub Desktop.
Script for Strata Hadoop 2015 Reviews Data Collection + Analysis
library(ggplot2)
library(dplyr)
library(scrapeR)
# Speakers index page; it is scanned for links to the session detail pages.
strata_speakers <- "http://strataconf.com/big-data-conference-ca-2015/public/schedule/speakers"

# The scrape() result is indexed by the requested URL below.
speaker_page <- scrape(url = strata_speakers, parse = TRUE, headers = TRUE)

# Collect every link target on the page, dropping duplicates.
href <- xpathSApply(speaker_page[[strata_speakers]], "//a/@href")
href <- unique(href)

# Keep only the relative links that point at a session detail page.
href <- href[grepl("^/big-data-conference-ca-2015/public/schedule/detail/\\d+", href)]

# Turn the relative links into absolute URLs. paste0() on a zero-length vector
# would yield the bare host name as a single bogus URL, so the empty case is
# handled explicitly.
if (length(href) > 0) {
  href <- paste0("http://strataconf.com", href)
} else {
  href <- character(0)
}
# Helper to scrape the metadata of a single talk from its detail page.
#
# Args:
#   url: URL of a session detail page.
#
# Returns:
#   A list with the elements avg_point, num_reviews, time, date, title,
#   location and category. When the page carries no parsable rating,
#   avg_point is NA and num_reviews is 0.
get_talk_metadata <- function(url) {
  html <- scrape(url = url, parse = TRUE, headers = TRUE)
  page <- html[[url]]

  # Session time text with tabs and line breaks stripped.
  time <- xpathSApply(page, "//div[@class='session_time']/text()", xmlValue)[[1]]
  time <- gsub("[\\t\\r\\n]*", "", time, perl = TRUE)
  # The date is the second piece after splitting on ", ";
  # the time is whatever precedes the first "m" in the text.
  date <- strsplit(time, ", ")[[1]][[2]]
  time <- strsplit(time, "m")[[1]][[1]]

  talk_title <- xpathSApply(page, '//h1[@class="summary"]/text()', xmlValue)[[1]]
  talk_location <- xpathSApply(page, '//span[@class="location"]/text()', xmlValue)[[1]]
  category <- xpathSApply(page, '//span[@class="en_session_topics category"]/text()', xmlValue)[[1]]
  category <- gsub("[\\t\\r\\n]*", "", category, perl = TRUE)

  rating_string <- xpathSApply(page, "//div[@class='en_grade_average_detail']/text()", xmlValue)

  # Defaults for talks without a rating.
  avg_point <- NA
  num_reviews <- 0

  # length() covers both NULL and a zero-length result, either of which
  # means no rating node was found on the page.
  if (length(rating_string) > 0) {
    rating_text <- rating_string[[1]]
    # Extract the "<number>, <count>" fragment from the rating text.
    rating_match <- regmatches(
      rating_text,
      regexpr("[\\d\\.]+, \\d+", rating_text, perl = TRUE)
    )
    # regmatches() yields character(0) when the pattern is absent; keep the
    # defaults in that case instead of failing on the subscript below.
    if (length(rating_match) > 0) {
      s <- as.numeric(strsplit(rating_match, ", ")[[1]])
      avg_point <- s[[1]]
      num_reviews <- s[[2]]
    }
  }

  list(
    avg_point = avg_point,
    num_reviews = num_reviews,
    time = time,
    date = date,
    title = talk_title,
    location = talk_location,
    category = category
  )
}
# Get all the data: scrape every talk page once, pausing one second after
# each request. The results are collected in a list rather than by growing
# vectors with c() on every iteration.
talk_metadata <- lapply(href, function(link) {
  metadata <- get_talk_metadata(link)
  Sys.sleep(1)
  metadata
})

# Extract one field of every talk as a typed vector. vapply() returns a
# zero-length vector of the right type when no talks were scraped.
numeric_field <- function(field) {
  vapply(talk_metadata, function(m) as.numeric(m[[field]]), numeric(1))
}
character_field <- function(field) {
  vapply(talk_metadata, function(m) as.character(m[[field]]), character(1))
}

# Data frame columns, kept as top-level vectors as before.
# Note that the scraped list element is named avg_point (singular).
avg_points <- numeric_field("avg_point")
num_reviews <- numeric_field("num_reviews")
time <- character_field("time")
date <- character_field("date")
title <- character_field("title")
location <- character_field("location")
category <- character_field("category")

strata2015_talks <- data.frame(
  avg_points = avg_points,
  num_reviews = num_reviews,
  time = time,
  date = date,
  title = title,
  location = location,
  category = category
)
# Data is ready!

# Scatter plot of review count against average points, one point per talk.
# The colour is a fixed constant; the former aes(color = clusters$cluster)
# mapping referred to an object that is never defined and was overridden by
# the constant anyway, so it is dropped.
ggplot(strata2015_talks, aes(x = avg_points, y = num_reviews)) +
  geom_point(size = 4, color = "#b11113") +
  ggtitle("# of Reviews v. Average Points") +
  theme(plot.title = element_text(size = 24, vjust = 1.8)) +
  annotate("rect", xmin = 4, xmax = 5, ymin = 30, ymax = 45, alpha = 0.3) +
  annotate("text", label = "Many good reviews", x = 4.5, y = 40, size = 8) +
  annotate("rect", xmin = 2, xmax = 2.7, ymin = 30, ymax = 40, alpha = 0.3) +
  annotate("text", label = "Many bad reviews", x = 2.35, y = 37, size = 8)

# Per-category summary: review-weighted average points and total reviews,
# computed over the talks that have a rating.
p <- strata2015_talks %>%
  filter(!is.na(avg_points)) %>%
  group_by(category) %>%
  summarise(
    avg_points = stats::weighted.mean(avg_points, num_reviews),
    num_reviews = sum(num_reviews)
  ) %>%
  arrange(desc(avg_points))

# Mark sponsored categories. This has to run after p exists.
p$sponsored <- grepl("sponsor", tolower(p$category))

# Review-weighted average over all talks, taken from the data frame columns
# rather than from loose global vectors. Unrated talks (NA) are excluded.
p$overall_avg_points <- with(
  strata2015_talks,
  stats::weighted.mean(avg_points, num_reviews, na.rm = TRUE)
)

# Shrink to the average to account for sample size
# c.f. http://stats.stackexchange.com/questions/15979/how-to-find-confidence-intervals-for-ratings/16053#16053
p$adjusted_avg_points <- with(
  p,
  avg_points * num_reviews / (num_reviews + 1) +
    overall_avg_points / (num_reviews + 1)
)

# Horizontal bar chart of the adjusted average per category, filled by the
# sponsored flag.
ggplot(p) +
  geom_bar(
    aes(
      x = reorder(category, adjusted_avg_points),
      y = adjusted_avg_points,
      fill = sponsored
    ),
    stat = "identity"
  ) +
  coord_flip() +
  xlab("category") +
  theme(
    plot.title = element_text(size = 24, vjust = 1.8),
    axis.text = element_text(color = "#000000")
  ) +
  ggtitle("Strata Hadoop 2015 Average Ratings per Category") +
  scale_fill_manual(values = c("#777777", "#B11113")) +
  # NOTE(review): 3.75 is hard-coded; presumably the overall average at the
  # time of writing - confirm, or derive it from overall_avg_points.
  geom_hline(yintercept = 3.75, color = "#005000", size = 2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment