Skip to content

Instantly share code, notes, and snippets.

@MattSandy
Created May 7, 2019 20:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MattSandy/e5b02eeccc0a2ad296e002d180846719 to your computer and use it in GitHub Desktop.
Save MattSandy/e5b02eeccc0a2ad296e002d180846719 to your computer and use it in GitHub Desktop.
IMDb scraping for Avengers: Endgame reviews
library(rvest)
library(magrittr)
library(stringr)
library(lubridate)
library(data.table)
library(ggplot2)
library(patchwork)
library(tidyverse)
# Functions ---------------------------------------------------------------
#' Fetch one page of IMDb user reviews for title tt4154796.
#'
#' @param paginationKey Key of the page to request; "" requests the first
#'   page. `NULL` or `NA` means "no further page" and yields an empty result
#'   without making a request.
#' @return A list with two elements: `paginationKey`, the `data-key` value
#'   found in the response (`NA_character_` when the response has none), and
#'   `df`, a data frame with one row per review and the columns Rating
#'   (-1 when the review shows no rating), Date, Author, Title, Text and ID.
get_reviews <- function(paginationKey) {
  # A missing key means there is nothing left to request.
  if (is.null(paginationKey) || is.na(paginationKey)) {
    return(list(
      paginationKey = NA_character_,
      df = data.frame(
        Rating = numeric(0),
        Date = as.Date(character(0)),
        Author = character(0),
        Title = character(0),
        Text = character(0),
        ID = character(0),
        stringsAsFactors = FALSE
      )
    ))
  }

  url <- paste0(
    "https://www.imdb.com/title/tt4154796/reviews/_ajax?sort=submissionDate&dir=asc&ref_=undefined&paginationKey=",
    paginationKey
  )
  webpage <- read_html(url)
  review_nodes <- webpage %>% html_nodes(".imdb-user-review")

  # html_node() (singular) returns exactly one result per review node and NA
  # when the element is absent, so every column keeps one entry per review
  # even if a review lacks a field.
  field_text <- function(css) {
    review_nodes %>%
      html_node(css) %>%
      html_text()
  }

  # Get Ratings -------------------------------------------------------------
  # The rating is the number in front of the slash, e.g. "10/10" -> 10.
  review_rating <- field_text("span.rating-other-user-rating") %>%
    str_match("([0-9]{1,2})\\/") %>%
    .[, 2] %>%
    as.numeric()
  review_rating[is.na(review_rating)] <- -1

  df <- data.frame(
    Rating = review_rating,
    Date = dmy(field_text("div.display-name-date span.review-date")),
    Author = field_text("span.display-name-link a"),
    Title = field_text("div.lister-item-content a.title"),
    Text = field_text("div.content div.text"),
    ID = html_attr(review_nodes, "data-review-id"),
    stringsAsFactors = FALSE
  )

  # Look the key up by attribute rather than by position in the tree, so a
  # response without it produces NA instead of an error.
  next_key <- webpage %>%
    html_node("[data-key]") %>%
    html_attr("data-key")

  list(paginationKey = next_key, df = df)
}
# Loop --------------------------------------------------------------------
# Walk the review pages, starting from the first page (empty key) and feeding
# each response's key into the next request.
reviews <- list()
paginationKey <- ""
max_pages <- 1000  # upper bound on requests
for (i in seq_len(max_pages)) {
  print(i)
  # An error ends the crawl but keeps the pages collected so far.
  result <- tryCatch(
    get_reviews(paginationKey),
    error = function(e) {
      message("Stopping at page ", i, ": ", conditionMessage(e))
      NULL
    }
  )
  if (is.null(result)) {
    break
  }
  reviews[[i]] <- result$df
  paginationKey <- result$paginationKey
  # Without a usable key there is no further page to request.
  if (length(paginationKey) != 1L || is.na(paginationKey) ||
      !nzchar(paginationKey)) {
    break
  }
}
# Combine all pages, keep each review once and drop reviews without a
# positive rating (get_reviews() marks a missing rating as -1).
df <- rbindlist(reviews)
df <- df[which(!duplicated(df$ID)), ]
df <- df[which(df$Rating > 0), ]

# One row per (Date, Rating) combination; `cumulative` is the number of
# reviews with that rating submitted on or before that date.
df.counts <- expand.grid(Date = unique(df$Date), Rating = unique(df$Rating))
df.counts$cumulative <- vapply(
  seq_len(nrow(df.counts)),
  function(i) {
    # Index the grid columns directly so Date and Rating keep their types.
    sum(
      df$Rating == df.counts$Rating[i] & df$Date <= df.counts$Date[i],
      na.rm = TRUE
    )
  },
  integer(1)
)

# Order the ratings from 10 down to 1 in both tables.
df.counts$Rating <- factor(df.counts$Rating, levels = 10:1)
df$Rating <- factor(df$Rating, levels = 10:1)
# Vertical position of the release-date label in the filled (0-1) chart: the
# share of cumulative ratings below 10 on that date plus half the share of
# 10s.
release_date <- as.Date("2019-04-26")
on_release <- df.counts$Date == release_date
rating_value <- as.numeric(as.character(df.counts$Rating))
pos1.A <- sum(df.counts$cumulative[which(on_release & rating_value < 10)])
pos1.B <- sum(df.counts$cumulative[which(on_release & rating_value == 10)])
# With no ratings on the release date the shares are undefined; centre the
# label instead of producing NaN.
pos1 <- if (pos1.A + pos1.B > 0) {
  (pos1.A + pos1.B / 2) / (pos1.A + pos1.B)
} else {
  0.5
}

# Cumulative share of each rating by review date.
p1 <- ggplot(df.counts, aes(fill = Rating)) +
  geom_area(
    aes(x = Date, y = cumulative),
    position = "fill",
    stat = "identity"
  ) +
  scale_fill_viridis_d(name = "Rating", direction = -1) +
  theme_minimal() +
  scale_y_continuous(
    breaks = c(0, .25, .5, .75, 1),
    labels = c("0%", "25%", "50%", "75%", "100%")
  ) +
  guides(fill = guide_legend(ncol = 1)) +
  scale_x_date(date_breaks = "2 days") +
  theme(legend.position = "right", legend.direction = "vertical") +
  ggtitle("Avengers: Endgame User Ratings by Date") +
  labs(subtitle = "Data Source: https://www.imdb.com/title/tt4154796/") +
  ylab("Cumulative Rating Percent by Date") +
  xlab("") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_vline(xintercept = release_date, linetype = "dashed", color = "red") +
  # annotate() draws the label once; a geom_text() layer would repeat it for
  # every row of the plot data.
  annotate(
    "text",
    x = release_date,
    y = pos1,
    label = "Release Date\n",
    colour = "black",
    angle = 90
  )
# Vertical position of the release-date label in the count chart: the number
# of release-date reviews rated below 10 plus half the number rated 10.
release_date <- as.Date("2019-04-26")
is_release <- df$Date == release_date
rating_num <- as.numeric(as.character(df$Rating))
pos2 <- sum(is_release & rating_num < 10, na.rm = TRUE) +
  sum(is_release & rating_num == 10, na.rm = TRUE) / 2

# Number of reviews per date, stacked by rating.
p2 <- ggplot(df, aes(fill = Rating)) +
  geom_bar(aes(x = Date)) +
  scale_fill_viridis_d(name = "Rating", direction = -1) +
  theme_minimal() +
  guides(fill = guide_legend(ncol = 1)) +
  theme(legend.position = "none") +
  ggtitle("") +
  labs(subtitle = "") +
  ylab("Rating Count by Date") +
  xlab("") +
  scale_x_date(date_breaks = "2 days") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_vline(xintercept = release_date, linetype = "dashed", color = "red") +
  # annotate() draws the label once; a geom_text() layer would repeat it for
  # every row of the plot data.
  annotate(
    "text",
    x = release_date,
    y = pos2,
    label = "Release Date\n",
    colour = "black",
    angle = 90
  )

# Write both charts side by side (patchwork `+`) to plot.png.
ggsave(filename = "plot.png", plot = p1 + p2, width = 11, units = "in")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment