Skip to content

Instantly share code, notes, and snippets.

@franvillamil
Created February 15, 2019 13:15
Show Gist options
  • Save franvillamil/8667ddb92a411ef1858184b05e168921 to your computer and use it in GitHub Desktop.
Save franvillamil/8667ddb92a411ef1858184b05e168921 to your computer and use it in GitHub Desktop.
R code to scrape review information from TripAdvisor for a number of Spanish cities
# --- Session setup ---------------------------------------------------------
# NOTE(review): `rm(list = ls())` and `setwd()` are discouraged in shared
# scripts (they wipe the caller's workspace and hard-code a local path);
# kept as-is since this is a personal one-off scraping script.
rm(list = ls())
setwd("~/Documents/Academic/courses/SocDataScience_feb19/project")
# rvest: HTML scraping; stringr: string helpers; dplyr: %>% pipe / bind_rows
library(rvest)
library(stringr)
library(dplyr)
# Keep character columns as character (pre-R 4.0 default coerced to factor)
options(stringsAsFactors = FALSE)
# TripAdvisor restaurant-listing landing pages, one URL per city
url_list <- c(
  "https://www.tripadvisor.com/Restaurants-g187514-Madrid.html",
  "https://www.tripadvisor.com/Restaurants-g187497-Barcelona_Catalonia.html",
  "https://www.tripadvisor.com/Restaurants-g187454-Bilbao_Province_of_Vizcaya_Basque_Country.html",
  "https://www.tripadvisor.com/Restaurants-g187529-Valencia_Province_of_Valencia_Valencian_Country.html",
  "https://www.tripadvisor.com/Restaurants-g187438-Malaga_Costa_del_Sol_Province_of_Malaga_Andalucia.html",
  "https://www.tripadvisor.com/Restaurants-g187451-Gijon_Asturias.html",
  "https://www.tripadvisor.com/Restaurants-g187452-Oviedo_Asturias.html"
)
### Function: scrape one restaurant page
#
# Extract the summary fields from a single TripAdvisor restaurant page.
#
# Args:
#   rest_page: an xml2/rvest HTML document for one restaurant page
#              (as returned by read_html()).
#   local:     language name whose review count goes in the reviews_spa
#              column (default "Spanish").
#
# Returns: a one-row data.frame with columns name, rank, address, type,
#   price, reviews_raw, reviews_all, reviews_eng, reviews_spa, reviews_fra.
#   Missing nodes yield NA fields instead of errors.
read_restaurant <- function(rest_page, local = "Spanish") {
  name <- rest_page %>%
    html_node(".h1") %>%
    html_text()
  # Rank is shown as e.g. "#12" and may contain thousands separators
  rank <- as.integer(gsub("\\#|,", "", rest_page %>%
    html_node(".popIndexValidation span") %>%
    html_text()))
  address <- rest_page %>%
    html_node(".address") %>%
    html_text()
  # Header links hold "PRICE, TYPE1, TYPE2, ..."; split on the first comma.
  tags <- rest_page %>%
    html_node(".header_links") %>%
    html_text()
  first_comma <- if (is.na(tags)) NA_integer_ else str_locate(tags, ",")[1, 1]
  if (!is.na(first_comma)) {
    type <- str_sub(tags, first_comma + 2, -1L)
    price <- str_sub(tags, 1, first_comma - 1)
  } else {
    # BUG FIX: the original indexed a 0-row match matrix and crashed when
    # the header had no comma; fall back to the whole string as the type.
    type <- tags
    price <- NA_character_
  }
  # Raw text of the per-language review filter, commas stripped,
  # e.g. "All languages (1234) English (567) Spanish (89)"
  reviews <- gsub(",", "", rest_page %>%
    html_node(".is-3") %>%
    html_text())
  reviews_all <- gsub("\\(|\\)|,", "", rest_page %>%
    html_node(".reviews_header_count") %>%
    html_text()) %>% as.integer()
  # Per-language counts: pull the number inside "Language (N)";
  # NA when the filter block is missing from the page.
  if (!is.na(reviews)) {
    reviews_eng <- str_match(reviews, "English\\s\\((\\d+)\\)")[, 2]
    reviews_local <- str_match(reviews, paste0(local, "\\s\\((\\d+)\\)"))[, 2]
    reviews_fra <- str_match(reviews, "French\\s\\((\\d+)\\)")[, 2]
  } else {
    reviews_eng <- NA
    reviews_local <- NA
    reviews_fra <- NA
  }
  # Commas were already removed from `reviews` above, so a plain
  # as.integer() is enough here.
  reviews_eng <- as.integer(reviews_eng)
  reviews_local <- as.integer(reviews_local)
  reviews_fra <- as.integer(reviews_fra)
  data.frame(
    name = name, rank = rank, address = address,
    type = type, price = price, reviews_raw = reviews,
    reviews_all = reviews_all, # keywords = keywords,
    reviews_eng = reviews_eng, reviews_spa = reviews_local,
    reviews_fra = reviews_fra
  )
}
### SCRAPING ### -------------------------------------------
# Scrape every city landing page in url_list and write one CSV per city.
# BUG FIX: the original only ever processed url_list[1], although a
# seven-city list was built; this loop covers all cities.
for (url in url_list) {
  # City slug = everything after the LAST hyphen in the URL, minus ".html".
  # (max() gives a scalar even for URLs with more than two hyphens, where
  # the original `[-1L]` indexing returned a vector.)
  city <- str_sub(url, max(str_locate_all(url, "-")[[1]][, 2]) + 1, -1L)
  city <- gsub(".html", "", city, fixed = TRUE)  # fixed: "." is not a wildcard
  print(paste0("### CITY: ", city, " ###"))
  # Accumulate one-row data.frames per restaurant; bound together at the
  # end (avoids quadratic rbind() growth inside the loop).
  rest_list <- list()
  # Open an HTML session on the city's first listing page
  home <- html_session(url)
  # Max number of pages: last page number shown in the paginator.
  # NOTE(review): a city with a single listing page may have no paginator
  # nodes, leaving `pages` NA -- TODO confirm against a live page.
  pages <- gsub("\\\n", "", home %>%
    html_nodes(".pageNum.taLnk") %>% html_text())
  pages <- as.integer(pages[length(pages)])
  # LOOP 1: scrape listing page by listing page
  for (i in seq_len(pages)) {
    # status info
    print(paste0("SCRAPING PAGE ", i, "/", pages))
    # Random waiting time, to be polite to the server
    Sys.sleep(runif(1, 1, 6))
    # Follow the "next" link when past page 1
    if (i != 1) {
      next_page <- paste0("https://www.tripadvisor.com",
        home %>%
          html_nodes(".nav.next") %>%
          html_attr("href"))
      home <- jump_to(home, next_page)
    }
    # Relative links to the restaurants listed on this page
    rest_url <- home %>%
      html_nodes(".property_title") %>%
      html_attr("href")
    # LOOP 2: scrape each restaurant on this page
    for (j in rest_url) {
      Sys.sleep(runif(1, 1, 2))
      rest_page <- read_html(paste0("https://www.tripadvisor.com", j))
      rest_list[[length(rest_list) + 1]] <- read_restaurant(rest_page)
    }
  }
  rest_data <- bind_rows(rest_list)
  # Save this city's data, e.g. "rest_data_Madrid.csv"
  filename <- paste0("rest_data_", city, ".csv")
  write.csv(rest_data, filename, row.names = FALSE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment