Skip to content

Instantly share code, notes, and snippets.

@najibninaba
Last active September 28, 2017 08:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save najibninaba/fd22f805f91f745f980c78ddf739251b to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 28 23:56:40 2017
@author: Dustin Fontaine
"""
# TripAdvisor scraper - use this one to scrape hotels.
# For each listed property it walks the paginated review pages, pulls the
# review fields out with BeautifulSoup, and appends one CSV row per review
# until no "next page" link remains.
from bs4 import BeautifulSoup
import urllib
import os
import urllib.request

# TripAdvisor shows 5 reviews per page; this value is both the per-page loop
# bound and the "-orNN-" paging offset step used to build next-page URLs,
# so it is defined once to keep the two in sync.
REVIEWS_PER_PAGE = 5

# List the first page of the reviews (ends with "#REVIEWS") - separate the websites with ,
WebSites = ["https://www.tripadvisor.com.sg/Hotel_Review-g60763-d93623-Reviews-Trump_International_Hotel_and_Tower_New_York-New_York_City_New_York.html"]

# "with" guarantees the CSV handle is closed even if a request or parse fails
# part-way through (the original left the file open on any exception).
with open(os.path.expanduser(r"TripAdviserReviews.csv"), "wb") as file:
    file.write(
        b"Organization,Address,Reviewer,Review Title,Review,Review Count,Location,Rating Date,Rating" + b"\n")
    # looping through each site until it hits a break
    for theurl in WebSites:
        reviewNumb = 0  # paging offset of the page currently being scraped
        thepage = urllib.request.urlopen(theurl)
        soup = BeautifulSoup(thepage, "html.parser")
        while True:
            # Property-level fields, repeated on every row of this property.
            Organization = soup.find(attrs={"class": "header heading fr"}).text.replace('"', ' ').replace('Review of', ' ').strip()
            Address = soup.findAll(attrs={"class": "street-address"})[0].text.replace(',', '').replace('\n', '').strip()
            # Loop through each review slot on the page.
            # NOTE: the original read "for x in range(5)" with a missing ":".
            for x in range(REVIEWS_PER_PAGE):
                try:
                    Reviewer = soup.findAll(attrs={"class": "username mo"})[x].text
                except IndexError:
                    # Fewer than REVIEWS_PER_PAGE reviews on this (last) page:
                    # no reviewer element means no review; skip this slot.
                    Reviewer = "N/A"
                    continue
                Reviewer = Reviewer.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').strip()
                ReviewCount = soup.findAll(attrs={"class": "badgetext"})[x].text.split(' ', 1)[0].strip()
                try:
                    Location = soup.findAll(attrs={"class": "location"})[x].text.replace(',', ' ').strip()
                except IndexError:
                    # Reviewer did not state a home location.
                    Location = 'Unknown'
                ReviewTitle = soup.findAll(attrs={"class": "quote"})[x].text.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').replace('é', 'e').strip()
                Review = soup.findAll(attrs={"class": "entry"})[x].text.replace(',', ' ').replace('\n', ' ').strip()
                RatingDate = soup.findAll(attrs={"class": "ratingDate"})[x].text.replace('Reviewed', ' ').replace('NEW', ' ').replace(',', ' ').strip()
                # The bubble rating is encoded in the element's last CSS class;
                # keep just that final class token for the CSV.
                Rating = soup.findAll(attrs={"class": "rating reviewItemInline"})[x].find(attrs={'class': 'ui_bubble_rating'})['class']
                Record = Organization + "," + Address + "," + Reviewer + "," + ReviewTitle + "," + Review + "," + ReviewCount + "," + Location + "," + RatingDate + "," + Rating[-1]
                # ASCII with errors='ignore' keeps stray non-ASCII bytes out of the CSV.
                file.write(bytes(Record, encoding="ascii", errors='ignore') + b"\n")
            link = soup.find_all(attrs={"class": "nav next taLnk "})
            print(Organization)
            if len(link) == 0:
                break  # no "next" link: last review page for this property
            # Build the next page URL by injecting the "-orNN-" offset segment
            # after "Reviews-" in the original URL.
            urlparts = theurl.split('Reviews-')
            reviewNumb += REVIEWS_PER_PAGE
            soup = BeautifulSoup(urllib.request.urlopen(urlparts[0] + 'Reviews-or' + str(reviewNumb) + '-' + urlparts[1]), "html.parser")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment