Skip to content

Instantly share code, notes, and snippets.

@najibninaba
Last active September 28, 2017 08:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save najibninaba/fd22f805f91f745f980c78ddf739251b to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 28 23:56:40 2017
@author: Dustin Fontaine
"""
# TripAdvisor scraper - use this one to scrape hotels.
# For each listed property it walks the paginated review pages, pulls the
# review fields out with BeautifulSoup, and appends one CSV row per review
# until no "next page" link remains.
from bs4 import BeautifulSoup
import urllib
import os
import urllib.request

# TripAdvisor shows 5 reviews per page; this value is both the per-page loop
# bound and the "-orNN-" paging offset step used to build next-page URLs,
# so it is defined once to keep the two in sync.
REVIEWS_PER_PAGE = 5

# List the first page of the reviews (ends with "#REVIEWS") - separate the websites with ,
WebSites = ["https://www.tripadvisor.com.sg/Hotel_Review-g60763-d93623-Reviews-Trump_International_Hotel_and_Tower_New_York-New_York_City_New_York.html"]

# "with" guarantees the CSV handle is closed even if a request or parse fails
# part-way through (the original left the file open on any exception).
with open(os.path.expanduser(r"TripAdviserReviews.csv"), "wb") as file:
    file.write(
        b"Organization,Address,Reviewer,Review Title,Review,Review Count,Location,Rating Date,Rating" + b"\n")
    # looping through each site until it hits a break
    for theurl in WebSites:
        reviewNumb = 0  # paging offset of the page currently being scraped
        thepage = urllib.request.urlopen(theurl)
        soup = BeautifulSoup(thepage, "html.parser")
        while True:
            # Property-level fields, repeated on every row of this property.
            Organization = soup.find(attrs={"class": "header heading fr"}).text.replace('"', ' ').replace('Review of', ' ').strip()
            Address = soup.findAll(attrs={"class": "street-address"})[0].text.replace(',', '').replace('\n', '').strip()
            # Loop through each review slot on the page.
            # NOTE: the original read "for x in range(5)" with a missing ":".
            for x in range(REVIEWS_PER_PAGE):
                try:
                    Reviewer = soup.findAll(attrs={"class": "username mo"})[x].text
                except IndexError:
                    # Fewer than REVIEWS_PER_PAGE reviews on this (last) page:
                    # no reviewer element means no review; skip this slot.
                    Reviewer = "N/A"
                    continue
                Reviewer = Reviewer.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').strip()
                ReviewCount = soup.findAll(attrs={"class": "badgetext"})[x].text.split(' ', 1)[0].strip()
                try:
                    Location = soup.findAll(attrs={"class": "location"})[x].text.replace(',', ' ').strip()
                except IndexError:
                    # Reviewer did not state a home location.
                    Location = 'Unknown'
                ReviewTitle = soup.findAll(attrs={"class": "quote"})[x].text.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').replace('é', 'e').strip()
                Review = soup.findAll(attrs={"class": "entry"})[x].text.replace(',', ' ').replace('\n', ' ').strip()
                RatingDate = soup.findAll(attrs={"class": "ratingDate"})[x].text.replace('Reviewed', ' ').replace('NEW', ' ').replace(',', ' ').strip()
                # The bubble rating is encoded in the element's last CSS class;
                # keep just that final class token for the CSV.
                Rating = soup.findAll(attrs={"class": "rating reviewItemInline"})[x].find(attrs={'class': 'ui_bubble_rating'})['class']
                Record = Organization + "," + Address + "," + Reviewer + "," + ReviewTitle + "," + Review + "," + ReviewCount + "," + Location + "," + RatingDate + "," + Rating[-1]
                # ASCII with errors='ignore' keeps stray non-ASCII bytes out of the CSV.
                file.write(bytes(Record, encoding="ascii", errors='ignore') + b"\n")
            link = soup.find_all(attrs={"class": "nav next taLnk "})
            print(Organization)
            if len(link) == 0:
                break  # no "next" link: last review page for this property
            # Build the next page URL by injecting the "-orNN-" offset segment
            # after "Reviews-" in the original URL.
            urlparts = theurl.split('Reviews-')
            reviewNumb += REVIEWS_PER_PAGE
            soup = BeautifulSoup(urllib.request.urlopen(urlparts[0] + 'Reviews-or' + str(reviewNumb) + '-' + urlparts[1]), "html.parser")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment