# TripAdvisor review scraper
# importing libraries
from bs4 import BeautifulSoup
import os, urllib.request
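# NOTE: requires the beautifulsoup4 package (pip install beautifulsoup4); written for Python 3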
# creating the CSV file to be used; if the first open fails (e.g. the file is
# already in use), remove the old copy and try again with the same path
header = (b"Organization" + b"," + b"Address" + b"," + b"Reviewer" + b"," + b"Review Title" + b"," + b"Review" + b"," + b"Review Count" + b"," + b"Help Count"
          + b"," + b"Attraction Count" + b"," + b"Restaurant Count" + b"," + b"Hotel Count" + b"," + b"Location" + b"," + b"Rating Date" + b"," + b"Rating" + b"\n")
csvpath = os.path.expanduser(r"~/Desktop/TripAdvisor Reviews.csv")
try:
    file = open(csvpath, "wb")
except OSError:
    os.remove(csvpath)
    file = open(csvpath, "wb")
file.write(header)
# list the first page of the reviews for each property (the URL ends with "#REVIEWS"); separate the websites with commas
WebSites = ["http://www.tripadvisor.ca/Hotel_Review-g190479-d3587956-Reviews-The_Thief-Oslo_Eastern_Norway.html#REVIEWS"]
# looping through each site until the pagination loop hits a break
for theurl in WebSites:
    thepage = urllib.request.urlopen(theurl)
    soup = BeautifulSoup(thepage, "html.parser")
    while True:
        # extract each reviewer's helpful-vote, attraction, restaurant and hotel review counts
        helpcountarray = []
        attractionarray = []
        restaurantarray = []
        hotelarray = []
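        # each reviewer badge is one text blob; newlines become "|||||" so the count that
        # immediately precedes each keyword can be sliced back out with split()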
        for profile in soup.findAll(attrs={"class": "memberBadging g10n"}):
            image = profile.text.replace("\n", "|||||").strip()
            # the extraction is identical for all four badge types: take the few characters
            # before the keyword as the count, or record "0" when the badge is absent
            for keyword, counts in (("helpful vote", helpcountarray), ("attraction", attractionarray),
                                    ("restaurant", restaurantarray), ("hotel", hotelarray)):
                if image.find(keyword) > 0:
                    counter = image.split(keyword, 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()
                    counts.append(counter)
                else:
                    counts.append("0")
        # extract the star rating for each user review from the image alt text,
        # which reads e.g. "4 of 5 stars"
        altarray = []
        for rating in soup.findAll(attrs={"class": "rating reviewItemInline"}):
            alt = rating.find('img', alt=True)['alt']
            if alt[-5:] == 'stars':
                altarray.append(alt)
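        # page-level fields: the property name and address are the same for every review on the page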
        Organization = soup.find(attrs={"class": "heading_name"}).text.replace('"', ' ').replace('Review of', ' ').strip()
        Address = soup.findAll(attrs={"class": "format_address"})[0].text.replace(',', '').replace('\n', '').strip()
        # loop through each review on the page
        for x in range(0, len(hotelarray)):
            try:
                Reviewer = soup.findAll(attrs={"class": "username mo"})[x].text
            except IndexError:
                # fewer usernames than badges on this page; skip the review
                Reviewer = "N/A"
                continue
            Reviewer = Reviewer.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').strip()
            ReviewCount = soup.findAll(attrs={"class": "reviewerBadge badge"})[x].text.split(' ', 1)[0].strip()
            Location = soup.findAll(attrs={"class": "location"})[x].text.replace(',', ' ').strip()
            ReviewTitle = soup.findAll(attrs={"class": "quote"})[x].text.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').replace('é', 'e').strip()
            Review = soup.findAll(attrs={"class": "entry"})[x].text.replace(',', ' ').replace('\n', ' ').strip()
            RatingDate = soup.findAll(attrs={"class": "ratingDate"})[x].text.replace('Reviewed', ' ').replace('NEW', ' ').replace(',', ' ').strip()
            Rating = altarray[x][:1]  # first character of "N of 5 stars"
            HelpCount = helpcountarray[x]
            AttractionCount = attractionarray[x]
            Restaurant = restaurantarray[x]
            Hotel = hotelarray[x]
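            # write one CSV row; encoding to ASCII with errors='ignore' silently drops
            # any non-ASCII characters so the byte-mode file stays simple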
            file.write(bytes(Organization, encoding="ascii", errors='ignore') + b"," + bytes(Address, encoding="ascii", errors='ignore') + b"," +
                       bytes(Reviewer, encoding="ascii", errors='ignore') + b"," + bytes(ReviewTitle, encoding="ascii", errors='ignore') + b"," +
                       bytes(Review, encoding="ascii", errors='ignore') + b"," + bytes(ReviewCount, encoding="ascii", errors='ignore') + b"," +
                       bytes(HelpCount, encoding="ascii", errors='ignore') + b"," + bytes(AttractionCount, encoding="ascii", errors='ignore') + b"," +
                       bytes(Restaurant, encoding="ascii", errors='ignore') + b"," + bytes(Hotel, encoding="ascii", errors='ignore') + b"," +
                       bytes(Location, encoding="ascii", errors='ignore') + b"," + bytes(RatingDate, encoding="ascii", errors='ignore') + b"," +
                       bytes(Rating, encoding="ascii", errors='ignore') + b"\n")
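        # look for the "next page" button; if it is absent, this was the last page of reviews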
        link = soup.find_all(attrs={"class": "nav next rndBtn ui_button primary taLnk"})
        print(Organization)
        if len(link) == 0:
            break
        else:
            soup = BeautifulSoup(urllib.request.urlopen("http://www.tripadvisor.com" + link[0].get('href')), "html.parser")
            print(link[0].get('href'))
file.close()
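# Usage: run the script directly, e.g. python3 tripadvisor_scraper.py (assuming that file name);
# the CSV lands on the Desktop and each scraped page's URL is printed as progress.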