-
-
Save Chaitali20-gh/85ae58ea53ca9aba8ad703d2e5bceccc to your computer and use it in GitHub Desktop.
Trip Advisor Web Scraping - Analysis on Miami Hotels
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class TripadvisorItem(scrapy.Item):
    """One scraped TripAdvisor hotel listing.

    Each field is populated by TripadvisorSpider.parse_detail_page and
    exported to CSV by WriteItemPipeline.
    """
    hotel_name = scrapy.Field()         # hotel title text
    start_price = scrapy.Field()        # lowest advertised nightly price
    hotel_rating = scrapy.Field()       # overall bubble rating
    hotel_reviews = scrapy.Field()      # total review count
    hotel_amenities = scrapy.Field()    # list of amenity labels
    pop_review_words = scrapy.Field()   # popular words from reviews
    nearby_restaurant = scrapy.Field()  # nearby-restaurant figure
    walkable_rating = scrapy.Field()    # walkability score
    loc_attraction = scrapy.Field()     # nearby-attraction figure
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.exporters import CsvItemExporter


class WriteItemPipeline(object):
    """Scrapy item pipeline that streams every scraped item into a CSV file."""

    def __init__(self):
        # Output file, created in the working directory when the spider opens.
        self.filename = 'tripadvisor_miami_hotels.csv'

    def open_spider(self, spider):
        # CsvItemExporter writes encoded bytes, so the file is opened in
        # binary mode.
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        # Export each item as soon as it arrives and pass it along unchanged.
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Flush the exporter before releasing the file handle.
        self.exporter.finish_exporting()
        self.csvfile.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapy settings for the tripadvisor project.
#
# Only the settings this project actually changes are kept active below.
# The full catalogue of available settings is documented at:
#   https://docs.scrapy.org/en/latest/topics/settings.html
#   https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#   https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tripadvisor'

SPIDER_MODULES = ['tripadvisor.spiders']
NEWSPIDER_MODULE = 'tripadvisor.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tripadvisor (+http://www.yourdomain.com)'

# Deliberately ignore robots.txt for this scrape.
ROBOTSTXT_OBEY = False

# Wait 3 seconds between requests to the same site (default is 0);
# see also the AutoThrottle settings below.
DOWNLOAD_DELAY = 3

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
#SPIDER_MIDDLEWARES = {
#    'tripadvisor.middlewares.TripadvisorSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
#DOWNLOADER_MIDDLEWARES = {
#    'tripadvisor.middlewares.TripadvisorDownloaderMiddleware': 543,
#}

# Enable or disable extensions
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Route every scraped item through the CSV-writing pipeline.
ITEM_PIPELINES = {
    'tripadvisor.pipelines.WriteItemPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
#AUTOTHROTTLE_ENABLED = True
#AUTOTHROTTLE_START_DELAY = 5
#AUTOTHROTTLE_MAX_DELAY = 60
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy import Spider, Request | |
from tripadvisor.items import TripadvisorItem | |
class TripadvisorSpider(Spider):
    """Crawl Miami Beach hotel listings on TripAdvisor into TripadvisorItem rows."""

    name = 'tripadvisor_spider'
    allowed_domains = ['www.tripadvisor.com']
    start_urls = ['https://www.tripadvisor.com/Hotels-g34439-Miami_Beach_Florida-Hotels.html']

    # Fallback result-page count (the value the original code hard-coded),
    # used when the pagination widget cannot be parsed.
    DEFAULT_NUM_PAGES = 14

    def parse(self, response):
        """Yield one request per search-result page (30 listings per page)."""
        # Read the last page number from the pagination bar. The original
        # code extracted this but ignored it and always crawled 14 pages;
        # here it drives the loop, with the old constant as a fallback.
        last_page = response.xpath('//a[@class="pageNum last "]/text()').extract_first()
        try:
            num_pages = int(last_page)
        except (TypeError, ValueError):
            num_pages = self.DEFAULT_NUM_PAGES
        result_urls = [
            f'https://www.tripadvisor.com/Hotels-g34439-oa{i * 30}-Miami_Beach_Florida-Hotels.html'
            for i in range(num_pages)
        ]
        for url in result_urls:
            yield Request(url=url, callback=self.parse_result_page)

    def parse_result_page(self, response):
        """Parse a search-result page and follow each hotel's detail link."""
        detail_urls = response.xpath('//div[@class="listing_title"]/a/@href').extract()
        for url in ['https://www.tripadvisor.com{}'.format(x) for x in detail_urls]:
            yield Request(url=url, callback=self.parse_detail_page)

    def parse_detail_page(self, response):
        """Extract one TripadvisorItem from a hotel detail page.

        NOTE(review): the class names in these XPaths ("_1mTlpMC3" etc.) are
        obfuscated, build-specific CSS classes and break whenever TripAdvisor
        redeploys its front end — verify before re-running the crawl.
        """
        hotel_name = response.xpath('//h1[@class="_1mTlpMC3"]/text()').extract()
        start_price = response.xpath('//div[@data-provider!=""]/@data-pernight').extract_first()
        hotel_rating = response.xpath('//span[@class="_3cjYfwwQ"]/text()').extract()
        # "1,234 reviews" / "1 review" -> int, or None when the element is
        # missing. The original indexed [0] unconditionally and raised
        # IndexError on hotels without reviews.
        raw_reviews = response.xpath('//span[@class="_33O9dg0j"]/text()').extract_first()
        hotel_reviews = self._parse_review_count(raw_reviews)
        hotel_amenities = response.xpath('//div[@class="_2rdvbNSg"]/text()').extract()
        pop_review_words = response.xpath('//button[@class="ui_button secondary small H5_EAgqY"]/text()').extract()
        nearby_restaurant = response.xpath('//span[@class="oPMurIUj TrfXbt7b"]/text()').extract()
        walkable_rating = response.xpath('//span[@class="oPMurIUj _1iwDIdby"]/text()').extract()
        loc_attraction = response.xpath('//span[@class="oPMurIUj _1WE0iyL_"]/text()').extract()

        item = TripadvisorItem()
        item['hotel_name'] = hotel_name
        item['start_price'] = start_price
        item['hotel_rating'] = hotel_rating
        item['hotel_reviews'] = hotel_reviews
        item['hotel_amenities'] = hotel_amenities
        item['pop_review_words'] = pop_review_words
        item['nearby_restaurant'] = nearby_restaurant
        item['walkable_rating'] = walkable_rating
        item['loc_attraction'] = loc_attraction
        yield item

    @staticmethod
    def _parse_review_count(text):
        """Turn a review-count string like '1,234 reviews' into an int.

        Returns None for a missing/empty string or one with no digits.
        """
        if not text:
            return None
        digits = ''.join(ch for ch in text if ch.isdigit())
        return int(digits) if digits else None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Exploratory analysis of the scraped TripAdvisor Miami hotel data."""
import pandas as pd

hotel_orig_data = pd.read_csv('tripadvisor_miami_hotels.csv')
hotel_orig_data

# %matplotlib inline  # notebook magic — a syntax error in a plain .py file
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import seaborn as sns
import numpy as np

# Missing-value count per column, before cleaning.
np.sum(hotel_orig_data.isnull())

hotel_data = hotel_orig_data.copy()
# Rows without a starting price cannot take part in the price analyses.
hotel_data = hotel_data.dropna(subset=['start_price'])
hotel_data.describe()  # original wrote `.describe` (no call), a silent no-op

hotel_data['pop_review_words'] = hotel_data['pop_review_words'].fillna("Others")
hotel_data['hotel_amenities'] = hotel_data['hotel_amenities'].fillna("Others")

# Distribution of hotel starting prices.
plt.hist(hotel_data['start_price'])

# ---- Feature engineering -------------------------------------------------
# Location: crude split on the substring 'South' in the hotel name.
is_south = hotel_data['hotel_name'].str.contains('South')
hotel_data.loc[is_south, 'Hotel_loc'] = 'South Beach'
hotel_data.loc[~is_south, 'Hotel_loc'] = 'Miami Beach'

# Beach-front flag, inferred from the amenity list.
is_beach = hotel_data['hotel_amenities'].str.contains(',Beach,')
hotel_data.loc[is_beach, 'Beach_front'] = "Y"
hotel_data.loc[~is_beach, 'Beach_front'] = "N"

# Chain-hotel classification: one loop replaces 13 copy-pasted assignments.
CHAIN_BRANDS = [
    'Marriott', 'Hilton', 'Best Western', 'Hyatt', 'Hampton Inn',
    'Holiday Inn', 'Westgate', 'Ritz', 'Crowne Plaza', 'Kimpton',
    'Lexington', 'Pestana', 'Four Points',
]
for brand in CHAIN_BRANDS:
    hotel_data.loc[hotel_data['hotel_name'].str.contains(brand), 'Hotel_Grp'] = 'Chain Hotel'
# Boutique classification runs after the brand loop and may overwrite it,
# matching the original statement order.
hotel_data.loc[hotel_data['pop_review_words'].str.contains('boutique'), 'Hotel_Grp'] = 'Boutique Hotel'
hotel_data['Hotel_Grp'] = hotel_data['Hotel_Grp'].fillna("Local Hotel")

# Hotel group vs starting price.
plt.figure(figsize=(12, 6))
sns.boxplot(x='Hotel_Grp', y='start_price', data=hotel_data)

# Median hotel rating by hotel group.
plt.figure(figsize=(12, 6))
hotel_data.groupby('Hotel_Grp')['hotel_rating'].median().sort_values(ascending=False).plot.bar(color='b')

# Median review count by hotel group.
plt.figure(figsize=(12, 6))
hotel_data.groupby('Hotel_Grp')['hotel_reviews'].median().sort_values(ascending=False).plot.bar(color='b')

# Median starting price by location.
plt.figure(figsize=(12, 6))
hotel_data.groupby('Hotel_loc')['start_price'].median().sort_values(ascending=False).plot.bar(color='b')

# ---- Sentiment analysis --------------------------------------------------
from textblob import TextBlob


def sentiment_func(row):
    """Attach TextBlob polarity/subjectivity of the popular review words to a row.

    Defined once; the original pasted this function three times verbatim.
    """
    sentiment = TextBlob(row['pop_review_words'])
    row['polarity'] = sentiment.polarity
    row['subjectivity'] = sentiment.subjectivity
    return row


def sentiment_sample(frame, sample_size):
    """Sample `sample_size` rows and score their sentiment.

    NOTE(review): sampling is unseeded (as in the original), so each run
    scores a different subset — pass random_state for reproducibility.
    """
    return frame.sample(sample_size).apply(sentiment_func, axis=1)


# Boutique hotels with a meaningful number of reviews.
hotel_boutique = hotel_data.loc[(hotel_data.Hotel_Grp == "Boutique Hotel") & (hotel_data.hotel_reviews > 100)]
sample = sentiment_sample(hotel_boutique, 20)
sample.plot.scatter('hotel_reviews', 'polarity')

# Local hotels.
hotel_others = hotel_data.loc[(hotel_data.Hotel_Grp == "Local Hotel") & (hotel_data.hotel_reviews > 100)]
sample = sentiment_sample(hotel_others, 100)
sample.plot.scatter('hotel_reviews', 'polarity')

# Chain hotels.
hotel_chains = hotel_data.loc[(hotel_data.Hotel_Grp == "Chain Hotel") & (hotel_data.hotel_reviews > 100)]
hotel_chains
sample = sentiment_sample(hotel_chains, 15)
sample.plot.scatter('hotel_reviews', 'polarity')

# ---- Word clouds ---------------------------------------------------------
from wordcloud import WordCloud


def show_wordcloud(words):
    """Render a word cloud from an iterable of review-word strings.

    Defined once; the original repeated this 7-line section three times.
    """
    wc = WordCloud(background_color="white", max_words=2000, width=800, height=400)
    wc.generate(' '.join(words))
    plt.figure(figsize=(12, 6))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.show()


show_wordcloud(hotel_others['pop_review_words'])    # Local hotels
show_wordcloud(hotel_chains['pop_review_words'])    # Chain hotels
show_wordcloud(hotel_boutique['pop_review_words'])  # Boutique hotels

# Price distribution per hotel group.
# NOTE(review): sns.distplot is deprecated in modern seaborn — confirm the
# installed version or switch to histplot/kdeplot.
PriceGrid = sns.FacetGrid(hotel_data, col='Hotel_Grp', hue="Hotel_Grp", palette="Set1", height=4)
PriceGrid.map(sns.distplot, "start_price")

# Price vs rating, faceted by group/location, colored by beach-front flag.
hotelGrid = sns.FacetGrid(hotel_data, row='Hotel_Grp', col='Hotel_loc', hue='Beach_front', palette="Set1", height=5)
hotelGrid.map(sns.regplot, 'start_price', 'hotel_rating')
hotelGrid.add_legend()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment