Skip to content

Instantly share code, notes, and snippets.

@saasindustries
Created February 3, 2021 09:21
Show Gist options
  • Save saasindustries/a2a1272b7e4d9e935a0d79c68c9b138d to your computer and use it in GitHub Desktop.
Save saasindustries/a2a1272b7e4d9e935a0d79c68c9b138d to your computer and use it in GitHub Desktop.
library(rvest)
library(dplyr)
#' Collect the cast names listed on a single movie page.
#'
#' @param movie_link URL (or any other source `read_html()` accepts) of the
#'   movie page to scrape.
#' @return A length-one character vector holding the text of every node
#'   matching ".primary_photo+ td a", joined with ",". If the page cannot be
#'   read, a warning is raised and `NA_character_` is returned so that one
#'   bad link does not abort a long scraping run.
get_cast <- function(movie_link) {
  movie_page <- tryCatch(
    read_html(movie_link),
    error = function(e) {
      warning(
        "Failed to read ", movie_link, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  # Early exit: nothing to parse when the fetch failed.
  if (is.null(movie_page)) {
    return(NA_character_)
  }

  movie_page %>%
    html_nodes(".primary_photo+ td a") %>%
    html_text() %>%
    paste(collapse = ",")
}
# Scrape the IMDb advanced-search listing of feature films for 2020, one
# result page at a time, and write the combined table to a CSV file.
base_url <- "https://www.imdb.com/search/title/?title_type=feature&year=2020-01-01,2020-12-31&start="
page_starts <- seq(from = 1, to = 101, by = 50)

# Preallocate one slot per result page; the pieces are combined once after
# the loop instead of growing a data frame with rbind() on every iteration.
page_tables <- vector("list", length(page_starts))

for (i in seq_along(page_starts)) {
  page_result <- page_starts[i]
  link <- paste0(base_url, page_result, "&ref_=adv_nxt")
  page <- read_html(link)

  # Select one container node per film, then pull each field with
  # html_node() (singular). html_node() returns exactly one result per
  # container and NA where the field is absent, so every column has the same
  # length and stays aligned with its film even when e.g. a rating or
  # runtime is missing. Page-wide html_nodes() calls would instead return
  # vectors of differing lengths and break data.frame().
  # NOTE(review): assumes every selected field lives inside
  # ".lister-item-content" -- confirm against the live page markup.
  items <- page %>% html_nodes(".lister-item-content")
  item_text <- function(css) {
    items %>%
      html_node(css) %>%
      html_text()
  }

  name <- item_text(".lister-item-header a")
  movie_links <- paste0(
    "https://www.imdb.com",
    items %>%
      html_node(".lister-item-header a") %>%
      html_attr("href")
  )
  year <- item_text(".text-muted.unbold")
  runtime <- item_text(".runtime")
  genre <- item_text(".genre")
  synopsis <- item_text(".ratings-bar+ .text-muted")
  rating <- item_text(".ratings-imdb-rating strong")
  votes <- item_text(".sort-num_votes-visible span:nth-child(2)")

  # vapply() guarantees a character vector even when the page has no items
  # (sapply() would return an empty list in that case).
  cast <- vapply(movie_links, get_cast, character(1), USE.NAMES = FALSE)

  page_tables[[i]] <- data.frame(
    name, year, runtime, genre, synopsis, rating, votes, cast,
    stringsAsFactors = FALSE
  )
  print(paste("Page : ", page_result))
}

movie_list <- bind_rows(page_tables)
write.csv(movie_list, "Feature Films(2020-01-01 and 2020-12-31).csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment