# TripAdvisor review scraper
# importing libraries
from bs4 import BeautifulSoup
import os, urllib.request
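# NOTE: requires the beautifulsoup4 package (pip install beautifulsoup4); written for Python 3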
# creating the CSV file to be used; if the first open fails (e.g. the file is
# already in use), remove the old copy and try again with the same path
header = (b"Organization" + b"," + b"Address" + b"," + b"Reviewer" + b"," + b"Review Title" + b"," + b"Review" + b"," + b"Review Count" + b"," + b"Help Count"
          + b"," + b"Attraction Count" + b"," + b"Restaurant Count" + b"," + b"Hotel Count" + b"," + b"Location" + b"," + b"Rating Date" + b"," + b"Rating" + b"\n")
csvpath = os.path.expanduser(r"~/Desktop/TripAdvisor Reviews.csv")
try:
    file = open(csvpath, "wb")
except OSError:
    os.remove(csvpath)
    file = open(csvpath, "wb")
file.write(header)
# list the first page of the reviews for each property (the URL ends with "#REVIEWS"); separate the websites with commas
WebSites = ["http://www.tripadvisor.ca/Hotel_Review-g190479-d3587956-Reviews-The_Thief-Oslo_Eastern_Norway.html#REVIEWS"]
# looping through each site until the pagination loop hits a break
for theurl in WebSites:
    thepage = urllib.request.urlopen(theurl)
    soup = BeautifulSoup(thepage, "html.parser")
    while True:
        # extract each reviewer's helpful-vote, attraction, restaurant and hotel review counts
        helpcountarray = []
        attractionarray = []
        restaurantarray = []
        hotelarray = []
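        # each reviewer badge is one text blob; newlines become "|||||" so the count that
        # immediately precedes each keyword can be sliced back out with split()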
        for profile in soup.findAll(attrs={"class": "memberBadging g10n"}):
            image = profile.text.replace("\n", "|||||").strip()
            # the extraction is identical for all four badge types: take the few characters
            # before the keyword as the count, or record "0" when the badge is absent
            for keyword, counts in (("helpful vote", helpcountarray), ("attraction", attractionarray),
                                    ("restaurant", restaurantarray), ("hotel", hotelarray)):
                if image.find(keyword) > 0:
                    counter = image.split(keyword, 1)[0].split("|", 1)[1][-4:].replace("|", "").strip()
                    counts.append(counter)
                else:
                    counts.append("0")
        # extract the star rating for each user review from the image alt text,
        # which reads e.g. "4 of 5 stars"
        altarray = []
        for rating in soup.findAll(attrs={"class": "rating reviewItemInline"}):
            alt = rating.find('img', alt=True)['alt']
            if alt[-5:] == 'stars':
                altarray.append(alt)
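        # page-level fields: the property name and address are the same for every review on the page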
        Organization = soup.find(attrs={"class": "heading_name"}).text.replace('"', ' ').replace('Review of', ' ').strip()
        Address = soup.findAll(attrs={"class": "format_address"})[0].text.replace(',', '').replace('\n', '').strip()
        # loop through each review on the page
        for x in range(0, len(hotelarray)):
            try:
                Reviewer = soup.findAll(attrs={"class": "username mo"})[x].text
            except IndexError:
                # fewer usernames than badges on this page; skip the review
                Reviewer = "N/A"
                continue
            Reviewer = Reviewer.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').strip()
            ReviewCount = soup.findAll(attrs={"class": "reviewerBadge badge"})[x].text.split(' ', 1)[0].strip()
            Location = soup.findAll(attrs={"class": "location"})[x].text.replace(',', ' ').strip()
            ReviewTitle = soup.findAll(attrs={"class": "quote"})[x].text.replace(',', ' ').replace('”', '').replace('“', '').replace('"', '').replace('é', 'e').strip()
            Review = soup.findAll(attrs={"class": "entry"})[x].text.replace(',', ' ').replace('\n', ' ').strip()
            RatingDate = soup.findAll(attrs={"class": "ratingDate"})[x].text.replace('Reviewed', ' ').replace('NEW', ' ').replace(',', ' ').strip()
            Rating = altarray[x][:1]  # first character of "N of 5 stars"
            HelpCount = helpcountarray[x]
            AttractionCount = attractionarray[x]
            Restaurant = restaurantarray[x]
            Hotel = hotelarray[x]
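            # write one CSV row; encoding to ASCII with errors='ignore' silently drops
            # any non-ASCII characters so the byte-mode file stays simple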
            file.write(bytes(Organization, encoding="ascii", errors='ignore') + b"," + bytes(Address, encoding="ascii", errors='ignore') + b"," +
                       bytes(Reviewer, encoding="ascii", errors='ignore') + b"," + bytes(ReviewTitle, encoding="ascii", errors='ignore') + b"," +
                       bytes(Review, encoding="ascii", errors='ignore') + b"," + bytes(ReviewCount, encoding="ascii", errors='ignore') + b"," +
                       bytes(HelpCount, encoding="ascii", errors='ignore') + b"," + bytes(AttractionCount, encoding="ascii", errors='ignore') + b"," +
                       bytes(Restaurant, encoding="ascii", errors='ignore') + b"," + bytes(Hotel, encoding="ascii", errors='ignore') + b"," +
                       bytes(Location, encoding="ascii", errors='ignore') + b"," + bytes(RatingDate, encoding="ascii", errors='ignore') + b"," +
                       bytes(Rating, encoding="ascii", errors='ignore') + b"\n")
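        # look for the "next page" button; if it is absent, this was the last page of reviews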
        link = soup.find_all(attrs={"class": "nav next rndBtn ui_button primary taLnk"})
        print(Organization)
        if len(link) == 0:
            break
        else:
            soup = BeautifulSoup(urllib.request.urlopen("http://www.tripadvisor.com" + link[0].get('href')), "html.parser")
            print(link[0].get('href'))
file.close()
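# Usage: run the script directly, e.g. python3 tripadvisor_scraper.py (assuming that file name);
# the CSV lands on the Desktop and each scraped page's URL is printed as progress.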