Skip to content

Instantly share code, notes, and snippets.

@Chaitali20-gh
Created December 30, 2020 22:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Chaitali20-gh/85ae58ea53ca9aba8ad703d2e5bceccc to your computer and use it in GitHub Desktop.
Save Chaitali20-gh/85ae58ea53ca9aba8ad703d2e5bceccc to your computer and use it in GitHub Desktop.
Trip Advisor Web Scraping - Analysis on Miami Hotels
import scrapy
class TripadvisorItem(scrapy.Item):
    """Container for the values scraped from one hotel detail page.

    Each attribute below declares one exported field; the spider fills
    them in by key (for example ``item['hotel_name']``).
    """

    # Headline facts about the hotel.
    hotel_name = scrapy.Field()
    start_price = scrapy.Field()
    hotel_rating = scrapy.Field()
    hotel_reviews = scrapy.Field()

    # Free-text lists collected from the page.
    hotel_amenities = scrapy.Field()
    pop_review_words = scrapy.Field()

    # Neighbourhood information shown for the hotel.
    nearby_restaurant = scrapy.Field()
    walkable_rating = scrapy.Field()
    loc_attraction = scrapy.Field()
from scrapy.exporters import CsvItemExporter
class WriteItemPipeline:
    """Item pipeline that streams every scraped item into a single CSV file.

    The output file is opened when the spider starts and closed when it
    finishes; each item passing through is appended as one CSV row.
    """

    def __init__(self):
        # Output path, relative to the directory the crawl is started from.
        self.filename = 'tripadvisor_miami_hotels.csv'

    def open_spider(self, spider):
        """Open the CSV file and start the exporter when the spider opens."""
        # Binary mode: CsvItemExporter is handed the raw file object.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and always release the file handle."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Close the file even if finishing the export raises, so the
            # handle is never leaked.
            self.csvfile.close()

    def process_item(self, item, spider):
        """Write ``item`` to the CSV file and pass it on unchanged."""
        self.exporter.export_item(item)
        return item
# Scrapy settings for tripadvisor project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Name of this Scrapy project/bot.
BOT_NAME = 'tripadvisor'
# Module(s) searched for spiders, and the module new spiders are created in.
SPIDER_MODULES = ['tripadvisor.spiders']
NEWSPIDER_MODULE = 'tripadvisor.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tripadvisor (+http://www.yourdomain.com)'
# robots.txt handling: explicitly disabled here, so robots.txt rules are NOT
# obeyed by this crawler.
# NOTE(review): confirm that crawling the target site this way is permitted.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# A non-zero delay is set here to throttle requests to the site.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tripadvisor.middlewares.TripadvisorSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tripadvisor.middlewares.TripadvisorDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Only one pipeline is enabled: WriteItemPipeline from tripadvisor.pipelines.
# The integer is the pipeline's order value (see the Scrapy docs linked above).
ITEM_PIPELINES = {
'tripadvisor.pipelines.WriteItemPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
from scrapy import Spider, Request
from tripadvisor.items import TripadvisorItem
class TripadvisorSpider(Spider):
    """Crawl the TripAdvisor Miami Beach hotel listing and scrape each hotel.

    Flow: ``parse`` schedules every paginated result page,
    ``parse_result_page`` follows each hotel link found on a result page, and
    ``parse_detail_page`` turns one hotel page into a ``TripadvisorItem``.
    """

    name = 'tripadvisor_spider'
    allowed_domains = ['www.tripadvisor.com']
    start_urls = ['https://www.tripadvisor.com/Hotels-g34439-Miami_Beach_Florida-Hotels.html']

    # Paging of the result list: the ``oa<offset>`` URL segment advances by
    # ``results_per_page`` for each of the ``page_count`` pages requested.
    results_per_page = 30
    page_count = 14

    def parse(self, response):
        """Yield one request per paginated search-result page."""
        step = int(self.results_per_page)
        for page_index in range(int(self.page_count)):
            url = f'https://www.tripadvisor.com/Hotels-g34439-oa{page_index * step}-Miami_Beach_Florida-Hotels.html'
            yield Request(url=url, callback=self.parse_result_page)

    def parse_result_page(self, response):
        """Yield one request per hotel detail link on a search-result page."""
        detail_urls = response.xpath('//div[@class="listing_title"]/a/@href').extract()
        for href in detail_urls:
            yield Request(
                url='https://www.tripadvisor.com{}'.format(href),
                callback=self.parse_detail_page,
            )

    @staticmethod
    def _parse_review_count(texts):
        """Return the review count found in ``texts[0]`` as an int, or None.

        Keeps only the decimal digits of the first extracted text, so both
        "1,234 reviews" and the singular "1 review" parse. Returns None
        when nothing was extracted or the text holds no digits.
        """
        if not texts:
            return None
        digits = ''.join(ch for ch in texts[0] if ch.isdecimal())
        return int(digits) if digits else None

    def parse_detail_page(self, response):
        """Extract the hotel fields from a detail page and yield the item."""
        def texts(xpath):
            # All text nodes matched by ``xpath``, as a list of strings.
            return response.xpath(xpath).extract()

        yield TripadvisorItem(
            hotel_name=texts('//h1[@class="_1mTlpMC3"]/text()'),
            # Value of the first ``data-pernight`` attribute on a provider div.
            start_price=response.xpath('//div[@data-provider!=""]/@data-pernight').extract_first(),
            hotel_rating=texts('//span[@class="_3cjYfwwQ"]/text()'),
            # A missing or unparsable review count becomes None instead of
            # aborting the whole item.
            hotel_reviews=self._parse_review_count(texts('//span[@class="_33O9dg0j"]/text()')),
            hotel_amenities=texts('//div[@class="_2rdvbNSg"]/text()'),
            pop_review_words=texts('//button[@class="ui_button secondary small H5_EAgqY"]/text()'),
            nearby_restaurant=texts('//span[@class="oPMurIUj TrfXbt7b"]/text()'),
            walkable_rating=texts('//span[@class="oPMurIUj _1iwDIdby"]/text()'),
            loc_attraction=texts('//span[@class="oPMurIUj _1WE0iyL_"]/text()'),
        )
import pandas as pd
hotel_orig_data = pd.read_csv('tripadvisor_miami_hotels.csv')
hotel_orig_data
%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import numpy as np
np.sum(hotel_orig_data.isnull())
hotel_data = hotel_orig_data.copy()
hotel_data = hotel_data.dropna(subset=['start_price'])
hotel_data.describe
hotel_data['pop_review_words'] = hotel_data['pop_review_words'].fillna("Others")
hotel_data['hotel_amenities'] = hotel_data['hotel_amenities'].fillna("Others")
#Hotel Start_Price Plot
plt.hist(hotel_data['start_price'])
# Data Manipulation
# Derives three categorical columns on hotel_data:
#   Hotel_loc   - 'South Beach' when the hotel name contains 'South', else 'Miami Beach'
#   Beach_front - 'Y' when 'Beach' is one of the comma-separated amenities, else 'N'
#   Hotel_Grp   - 'Boutique Hotel', 'Chain Hotel' or 'Local Hotel'

# Brand names that mark a hotel as part of a chain. None of them contains a
# regex metacharacter, so they can be joined with '|' into one pattern.
CHAIN_BRANDS = [
    'Marriott', 'Hilton', 'Best Western', 'Hyatt', 'Hampton Inn',
    'Holiday Inn', 'Westgate', 'Ritz', 'Crowne Plaza', 'Kimpton',
    'Lexington', 'Pestana', 'Four Points',
]

hotel_names = hotel_data['hotel_name']

# na=False: a missing value counts as "no match" instead of producing a NaN
# mask entry, which cannot be used for boolean indexing.
in_south_beach = hotel_names.str.contains('South', na=False)
hotel_data['Hotel_loc'] = in_south_beach.map({True: 'South Beach', False: 'Miami Beach'})

# Match 'Beach' as a whole list entry, including when it is the first or the
# last amenity (a plain ',Beach,' search misses those two positions).
has_beach = hotel_data['hotel_amenities'].str.contains(r'(?:^|,)Beach(?:,|$)', na=False)
hotel_data['Beach_front'] = has_beach.map({True: 'Y', False: 'N'})

is_chain = hotel_names.str.contains('|'.join(CHAIN_BRANDS), na=False)
is_boutique = hotel_data['pop_review_words'].str.contains('boutique', na=False)

# Default first, then chain, then boutique: a hotel whose popular review
# words mention 'boutique' is classed as boutique even if it is also a chain.
hotel_data['Hotel_Grp'] = 'Local Hotel'
hotel_data.loc[is_chain, 'Hotel_Grp'] = 'Chain Hotel'
hotel_data.loc[is_boutique, 'Hotel_Grp'] = 'Boutique Hotel'
# Box plot: distribution of the starting price within each hotel group.
plt.figure(figsize=(12, 6))
sns.boxplot(data=hotel_data, y='start_price', x='Hotel_Grp')

# Bar charts of medians, largest first. Each pair is
# (column to group by, column whose median is plotted):
#   hotel group -> rating, hotel group -> review count, location -> start price.
median_charts = [
    ('Hotel_Grp', 'hotel_rating'),
    ('Hotel_Grp', 'hotel_reviews'),
    ('Hotel_loc', 'start_price'),
]
for group_col, value_col in median_charts:
    plt.figure(figsize=(12, 6))
    medians = hotel_data.groupby(group_col)[value_col].median()
    medians.sort_values(ascending=False).plot.bar(color='b')
# NLP: sentiment of the popular review words, per hotel group
from textblob import TextBlob

# Only hotels with more reviews than this are analysed.
MIN_REVIEWS = 100


def sentiment_func(x):
    """Return row ``x`` with TextBlob ``polarity`` and ``subjectivity`` added.

    The sentiment is computed from the row's ``pop_review_words`` text.
    """
    sentiment = TextBlob(x['pop_review_words'])
    x['polarity'] = sentiment.polarity
    x['subjectivity'] = sentiment.subjectivity
    return x


def _well_reviewed(group_name):
    """Return the hotels of ``group_name`` having more than MIN_REVIEWS reviews."""
    in_group = hotel_data.Hotel_Grp == group_name
    return hotel_data.loc[in_group & (hotel_data.hotel_reviews > MIN_REVIEWS)]


def _plot_sentiment_sample(hotels, sample_size):
    """Score a random sample of ``hotels`` and scatter reviews vs polarity.

    At most ``sample_size`` rows are drawn; asking for more rows than exist
    would raise, so the request is capped at ``len(hotels)``. Returns the
    scored sample; nothing is plotted when there are no rows.
    """
    drawn = hotels.sample(min(sample_size, len(hotels))).apply(sentiment_func, axis=1)
    if drawn.empty:
        return drawn
    drawn.plot.scatter('hotel_reviews', 'polarity')
    return drawn


# NLP for boutique hotels
hotel_boutique = _well_reviewed("Boutique Hotel")
sample_size = 20
sample = _plot_sentiment_sample(hotel_boutique, sample_size)

# NLP for local hotels
hotel_others = _well_reviewed("Local Hotel")
sample_size = 100
sample = _plot_sentiment_sample(hotel_others, sample_size)

# NLP for Chain hotels
hotel_chains = _well_reviewed("Chain Hotel")
hotel_chains
sample_size = 15
sample = _plot_sentiment_sample(hotel_chains, sample_size)
# Word clouds of the popular review words, one figure per hotel group
from wordcloud import WordCloud

# Drawn in this order: local hotels, chain hotels, boutique hotels.
for group_hotels in (hotel_others, hotel_chains, hotel_boutique):
    # A fresh cloud per group, built from all of the group's review words
    # joined into one space-separated text.
    wc = WordCloud(background_color="white", max_words=2000, width=800, height=400)
    review_text = ' '.join(group_hotels['pop_review_words'])
    wc.generate(review_text)

    plt.figure(figsize=(12, 6))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()
# Starting-price distribution, one panel (and one colour) per hotel group.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version before upgrading.
price_layout = dict(col='Hotel_Grp', hue="Hotel_Grp", palette="Set1", height=4)
PriceGrid = sns.FacetGrid(hotel_data, **price_layout)
PriceGrid.map(sns.distplot, "start_price")

# Rating against starting price with a regression fit: rows are hotel groups,
# columns are locations, colour marks the Beach_front flag.
hotel_layout = dict(row='Hotel_Grp', col='Hotel_loc', hue='Beach_front', palette="Set1", height=5)
hotelGrid = sns.FacetGrid(hotel_data, **hotel_layout)
hotelGrid.map(sns.regplot, 'start_price', 'hotel_rating')
hotelGrid.add_legend()
from wordcloud import WordCloud
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment