# franvillamil/scrap_trip_advisor.R

## scrap_trip_advisor.R
# Start from an empty workspace: removes every object visible to ls().
# NOTE(review): this also discards objects the caller already had loaded.
rm(list = ls())
# All relative paths (including the CSV written at the end of the script)
# resolve against this project folder.
setwd("~/Documents/Academic/courses/SocDataScience_feb19/project")
# HTML session, node selection and text/attribute extraction
library(rvest)
# str_sub(), str_locate_all() and str_match()
library(stringr)
# NOTE(review): no dplyr verb is called in the visible code; presumably
# attached for the %>% pipe — confirm before removing.
library(dplyr)
# Make data.frame() keep strings as character vectors rather than factors
options(stringsAsFactors = FALSE)

# Restaurant listing pages to scrape, one per city. The scraping section
# below picks a single entry from this vector by index.
url_list <- c(
  # Madrid
  "https://www.tripadvisor.com/Restaurants-g187514-Madrid.html",
  # Barcelona
  "https://www.tripadvisor.com/Restaurants-g187497-Barcelona_Catalonia.html",
  # Bilbao
  "https://www.tripadvisor.com/Restaurants-g187454-Bilbao_Province_of_Vizcaya_Basque_Country.html",
  # Valencia
  "https://www.tripadvisor.com/Restaurants-g187529-Valencia_Province_of_Valencia_Valencian_Country.html",
  # Malaga
  "https://www.tripadvisor.com/Restaurants-g187438-Malaga_Costa_del_Sol_Province_of_Malaga_Andalucia.html",
  # Gijon
  "https://www.tripadvisor.com/Restaurants-g187451-Gijon_Asturias.html",
  # Oviedo
  "https://www.tripadvisor.com/Restaurants-g187452-Oviedo_Asturias.html"
)

### Function: scraping restaurant

# Extract one row of summary data from a parsed restaurant page.
#
# Arguments:
#   rest_page  Parsed HTML document of a single restaurant page (the scraping
#              loop passes the result of read_html()).
#   local      Name of the local language as written in the page's language
#              filter, e.g. "Spanish". It is pasted into a regular expression
#              to look up the review count for that language.
#
# Returns a one-row data.frame with the columns name, rank, address, type,
# price, reviews_raw, reviews_all, reviews_eng, reviews_spa and reviews_fra.
# The local-language count is always stored in `reviews_spa`, whatever value
# `local` has. A field is NA when its node is absent (rvest returns NA text
# for a missing node) or when its text cannot be parsed.
read_restaurant <- function(rest_page, local = "Spanish") {

  # Text of the first node matching the CSS selector `css`
  node_text <- function(css) {
    rest_page %>%
      html_node(css) %>%
      html_text()
  }

  # Count written as "<language> (<n>)" inside `txt`; NA when that language
  # is not listed or when `txt` itself is NA.
  language_count <- function(txt, language) {
    as.integer(str_match(txt, paste0(language, "\\s\\((\\d+)\\)"))[, 2])
  }

  name <- node_text(".h1")
  # Strip "#" and "," from the ranking text before converting to integer
  rank <- as.integer(gsub("\\#|,", "", node_text(".popIndexValidation span")))
  address <- node_text(".address")

  # The header text is split at its first comma: what precedes it is stored
  # as price, what follows the comma and one more character as type.
  tags <- node_text(".header_links")
  comma <- str_locate(tags, ",")[1, 1]
  if (is.na(comma)) {
    # No comma (or no header node at all): the text cannot be split, so both
    # fields stay missing instead of failing on an empty match matrix.
    type <- NA_character_
    price <- NA_character_
  } else {
    type <- str_sub(tags, comma + 2, -1L)
    price <- str_sub(tags, 1, comma - 1)
  }

  # Commas are removed first so that a count such as "1,234" is matched by
  # the \\d+ pattern used in language_count().
  reviews <- gsub(",", "", node_text(".is-3"))
  reviews_all <- as.integer(
    gsub("\\(|\\)|,", "", node_text(".reviews_header_count"))
  )

  # keywords: not extracted yet (the CSS selector was still left blank)
  # keywords = node_text("")

  # stringsAsFactors is passed explicitly so the result does not depend on
  # the global option being set by the calling script.
  data.frame(
    name = name, rank = rank, address = address,
    type = type, price = price, reviews_raw = reviews,
    reviews_all = reviews_all, # keywords = keywords,
    reviews_eng = language_count(reviews, "English"),
    reviews_spa = language_count(reviews, local),
    reviews_fra = language_count(reviews, "French"),
    stringsAsFactors = FALSE
  )
}

### SCRAPING ### --------------------------------------------

# DF to fill in
rest_data <- data.frame()
# One-row data frames are collected here and bound once after the loops;
# rbind() on a growing data frame would copy it on every iteration.
rest_rows <- list()

# URLs & HTML session
# Only the first city of url_list is scraped; change the index for another.
url <- url_list[1]
# NOTE(review): html_session()/jump_to() were renamed session()/
# session_jump_to() in rvest 1.0.0 - confirm which rvest version is targeted.
home <- html_session(url)

# Max number of pages: label of the last pagination link
page_labels <- home %>%
  html_nodes(".pageNum.taLnk") %>%
  html_text()
page_labels <- gsub("\n", "", page_labels, fixed = TRUE)
pages <- as.integer(page_labels[length(page_labels)])
# Without pagination links (or with an unparsable label) only the page that
# is already loaded gets scraped; 1:pages would fail on an empty or NA value.
if (length(pages) == 0L || is.na(pages) || pages < 1L) {
  pages <- 1L
}

# LOOP 1: Scrape page by page
for (i in seq_len(pages)) {
  # status info
  print(paste0("SCRAPING PAGE ", i, "/", pages))
  # random waiting time
  Sys.sleep(runif(1, 1, 6))

  # go to next page if in page 2+
  if (i != 1) {
    # html_node() keeps only the first "next" link, so the URL is always a
    # single string even if the selector matches more than one element.
    next_href <- home %>%
      html_node(".nav.next") %>%
      html_attr("href")
    if (is.na(next_href)) {
      warning("No next-page link before page ", i, "; stopping early.",
              call. = FALSE)
      break
    }
    home <- jump_to(home, paste0("https://www.tripadvisor.com", next_href))
  }

  # Get restaurants in this page
  rest_url <- home %>%
    html_nodes(".property_title") %>%
    html_attr("href")

  # LOOP 2: Scrape restaurants in a single page
  for (j in rest_url) {
    # random waiting time
    Sys.sleep(runif(1, 1, 2))
    # A single failing page is reported and skipped so that the rows
    # collected so far are not lost.
    rest_row <- tryCatch(
      read_restaurant(read_html(paste0("https://www.tripadvisor.com", j))),
      error = function(e) {
        warning("Skipping ", j, ": ", conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    if (!is.null(rest_row)) {
      rest_rows[[length(rest_rows) + 1L]] <- rest_row
    }
  }

}

if (length(rest_rows) > 0L) {
  rest_data <- do.call(rbind, rest_rows)
}

# City label for the output file: the page name without its first two
# dash-separated fields ("Restaurants" and the location id) and without
# the literal ".html" suffix.
city <- sub("^[^-]*-[^-]*-", "", basename(url))
city <- sub("\\.html$", "", city)
filename <- paste0("rest_data_", city, ".csv")
write.csv(rest_data, filename, row.names = FALSE)
	# NOTE(review): from this line on the file repeats the script above (same
	# statements, tab-indented). Sourcing the whole file therefore clears the
	# objects of the first run here and scrapes a second time.
	# Start from an empty workspace: removes every object visible to ls().
	rm(list = ls())
	# All relative paths (including the CSV written at the end of the script)
	# resolve against this project folder.
	setwd("~/Documents/Academic/courses/SocDataScience_feb19/project")
	# HTML session, node selection and text/attribute extraction
	library(rvest)
	# str_sub(), str_locate_all() and str_match()
	library(stringr)
	# NOTE(review): no dplyr verb is called in the visible code; presumably
	# attached for the %>% pipe — confirm before removing.
	library(dplyr)
	# Make data.frame() keep strings as character vectors rather than factors
	options(stringsAsFactors = FALSE)

	# Restaurant listing pages to scrape, one per city. The scraping section
	# below picks a single entry from this vector by index.
	url_list <- c(
	  # Madrid
	  "https://www.tripadvisor.com/Restaurants-g187514-Madrid.html",
	  # Barcelona
	  "https://www.tripadvisor.com/Restaurants-g187497-Barcelona_Catalonia.html",
	  # Bilbao
	  "https://www.tripadvisor.com/Restaurants-g187454-Bilbao_Province_of_Vizcaya_Basque_Country.html",
	  # Valencia
	  "https://www.tripadvisor.com/Restaurants-g187529-Valencia_Province_of_Valencia_Valencian_Country.html",
	  # Malaga
	  "https://www.tripadvisor.com/Restaurants-g187438-Malaga_Costa_del_Sol_Province_of_Malaga_Andalucia.html",
	  # Gijon
	  "https://www.tripadvisor.com/Restaurants-g187451-Gijon_Asturias.html",
	  # Oviedo
	  "https://www.tripadvisor.com/Restaurants-g187452-Oviedo_Asturias.html"
	)

	### Function: scraping restaurant

	# Extract one row of summary data from a parsed restaurant page.
	#
	# Arguments:
	#   rest_page  Parsed HTML document of a single restaurant page (the
	#              scraping loop passes the result of read_html()).
	#   local      Name of the local language as written in the page's
	#              language filter, e.g. "Spanish". It is pasted into a
	#              regular expression to look up that language's review count.
	#
	# Returns a one-row data.frame with the columns name, rank, address, type,
	# price, reviews_raw, reviews_all, reviews_eng, reviews_spa and reviews_fra.
	# The local-language count is always stored in `reviews_spa`, whatever
	# value `local` has. A field is NA when its node is absent (rvest returns
	# NA text for a missing node) or when its text cannot be parsed.
	read_restaurant <- function(rest_page, local = "Spanish") {

	  # Text of the first node matching the CSS selector `css`
	  node_text <- function(css) {
	    rest_page %>%
	      html_node(css) %>%
	      html_text()
	  }

	  # Count written as "<language> (<n>)" inside `txt`; NA when that
	  # language is not listed or when `txt` itself is NA.
	  language_count <- function(txt, language) {
	    as.integer(str_match(txt, paste0(language, "\\s\\((\\d+)\\)"))[, 2])
	  }

	  name <- node_text(".h1")
	  # Strip "#" and "," from the ranking text before converting to integer.
	  # The alternation uses a plain "|": a backslash before it is not a valid
	  # escape inside an R string and stops the file from parsing.
	  rank <- as.integer(gsub("\\#|,", "", node_text(".popIndexValidation span")))
	  address <- node_text(".address")

	  # The header text is split at its first comma: what precedes it is
	  # stored as price, what follows the comma and one more character as type.
	  tags <- node_text(".header_links")
	  comma <- str_locate(tags, ",")[1, 1]
	  if (is.na(comma)) {
	    # No comma (or no header node at all): the text cannot be split, so
	    # both fields stay missing instead of failing on an empty match matrix.
	    type <- NA_character_
	    price <- NA_character_
	  } else {
	    type <- str_sub(tags, comma + 2, -1L)
	    price <- str_sub(tags, 1, comma - 1)
	  }

	  # Commas are removed first so that a count such as "1,234" is matched
	  # by the \\d+ pattern used in language_count().
	  reviews <- gsub(",", "", node_text(".is-3"))
	  reviews_all <- as.integer(
	    gsub("\\(|\\)|,", "", node_text(".reviews_header_count"))
	  )

	  # keywords: not extracted yet (the CSS selector was still left blank)
	  # keywords = node_text("")

	  # stringsAsFactors is passed explicitly so the result does not depend on
	  # the global option being set by the calling script.
	  data.frame(
	    name = name, rank = rank, address = address,
	    type = type, price = price, reviews_raw = reviews,
	    reviews_all = reviews_all, # keywords = keywords,
	    reviews_eng = language_count(reviews, "English"),
	    reviews_spa = language_count(reviews, local),
	    reviews_fra = language_count(reviews, "French"),
	    stringsAsFactors = FALSE
	  )
	}

	### SCRAPING ### --------------------------------------------

	# DF to fill in
	rest_data <- data.frame()
	# One-row data frames are collected here and bound once after the loops;
	# rbind() on a growing data frame would copy it on every iteration.
	rest_rows <- list()

	# URLs & HTML session
	# Only the first city of url_list is scraped; change the index for another.
	url <- url_list[1]
	# NOTE(review): html_session()/jump_to() were renamed session()/
	# session_jump_to() in rvest 1.0.0 - confirm which rvest version is targeted.
	home <- html_session(url)

	# Max number of pages: label of the last pagination link
	page_labels <- home %>%
	  html_nodes(".pageNum.taLnk") %>%
	  html_text()
	page_labels <- gsub("\n", "", page_labels, fixed = TRUE)
	pages <- as.integer(page_labels[length(page_labels)])
	# Without pagination links (or with an unparsable label) only the page
	# that is already loaded gets scraped; 1:pages would fail on an empty or
	# NA value.
	if (length(pages) == 0L || is.na(pages) || pages < 1L) {
	  pages <- 1L
	}

	# LOOP 1: Scrape page by page
	for (i in seq_len(pages)) {
	  # status info
	  print(paste0("SCRAPING PAGE ", i, "/", pages))
	  # random waiting time
	  Sys.sleep(runif(1, 1, 6))

	  # go to next page if in page 2+
	  if (i != 1) {
	    # html_node() keeps only the first "next" link, so the URL is always
	    # a single string even if the selector matches more than one element.
	    next_href <- home %>%
	      html_node(".nav.next") %>%
	      html_attr("href")
	    if (is.na(next_href)) {
	      warning("No next-page link before page ", i, "; stopping early.",
	              call. = FALSE)
	      break
	    }
	    home <- jump_to(home, paste0("https://www.tripadvisor.com", next_href))
	  }

	  # Get restaurants in this page
	  rest_url <- home %>%
	    html_nodes(".property_title") %>%
	    html_attr("href")

	  # LOOP 2: Scrape restaurants in a single page
	  for (j in rest_url) {
	    # random waiting time
	    Sys.sleep(runif(1, 1, 2))
	    # A single failing page is reported and skipped so that the rows
	    # collected so far are not lost.
	    rest_row <- tryCatch(
	      read_restaurant(read_html(paste0("https://www.tripadvisor.com", j))),
	      error = function(e) {
	        warning("Skipping ", j, ": ", conditionMessage(e), call. = FALSE)
	        NULL
	      }
	    )
	    if (!is.null(rest_row)) {
	      rest_rows[[length(rest_rows) + 1L]] <- rest_row
	    }
	  }

	}

	if (length(rest_rows) > 0L) {
	  rest_data <- do.call(rbind, rest_rows)
	}

	# City label for the output file: the page name without its first two
	# dash-separated fields ("Restaurants" and the location id) and without
	# the literal ".html" suffix.
	city <- sub("^[^-]*-[^-]*-", "", basename(url))
	city <- sub("\\.html$", "", city)
	filename <- paste0("rest_data_", city, ".csv")
	write.csv(rest_data, filename, row.names = FALSE)