Chaitali20-gh/Tripadvisor_visual_NLP.py Secret

## tripadvisor_items.py
import scrapy


class TripadvisorItem(scrapy.Item):
    """Record holding the values scraped from one hotel detail page.

    Every attribute is declared as a ``scrapy.Field``.  The spider's
    ``parse_detail_page`` fills each of them from an XPath query on the
    detail-page response.
    """
    # Text of the page's <h1> heading.
    hotel_name = scrapy.Field()
    # First ``data-pernight`` attribute found on the page.
    start_price = scrapy.Field()
    # Rating text as extracted from the page.
    hotel_rating = scrapy.Field()
    # Review count, converted to an integer by the spider.
    hotel_reviews = scrapy.Field()
    # Amenity labels extracted from the page.
    hotel_amenities = scrapy.Field()
    # Labels of the review-keyword buttons.
    pop_review_words = scrapy.Field()
    # The three fields below are read from sibling <span> elements;
    # presumably the "location" scores shown on the page -- TODO confirm.
    nearby_restaurant = scrapy.Field()
    walkable_rating = scrapy.Field()
    loc_attraction = scrapy.Field()

## tripadvisor_pipelines.py
from scrapy.exporters import CsvItemExporter


class WriteItemPipeline(object):
    """Item pipeline that streams every scraped item into a single CSV file.

    The file is opened when the spider starts and closed when it finishes.
    The handle is released even if the exporter fails while starting or
    finishing, so a failed crawl does not leak an open file.
    """

    def __init__(self):
        # Relative path: the file is created in the current working directory.
        self.filename = 'tripadvisor_miami_hotels.csv'

    def open_spider(self, spider):
        """Open the output file and start the CSV exporter."""
        # Binary mode: the exporter is handed a bytes stream.
        self.csvfile = open(self.filename, 'wb')
        try:
            self.exporter = CsvItemExporter(self.csvfile)
            self.exporter.start_exporting()
        except Exception:
            # Do not leave the handle open if the exporter cannot start.
            self.csvfile.close()
            raise

    def close_spider(self, spider):
        """Finish the export and close the output file."""
        try:
            self.exporter.finish_exporting()
        finally:
            # Always release the file, even when finishing the export fails.
            self.csvfile.close()

    def process_item(self, item, spider):
        """Write ``item`` to the CSV and return it unchanged for later pipelines."""
        self.exporter.export_item(item)
        return item

## tripadvisor_settings.py
# Scrapy settings for tripadvisor project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project/bot.
BOT_NAME = 'tripadvisor'

# Module path of the project's spiders; the same package is used for both
# settings (see the Scrapy settings reference for their exact roles).
SPIDER_MODULES = ['tripadvisor.spiders']
NEWSPIDER_MODULE = 'tripadvisor.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tripadvisor (+http://www.yourdomain.com)'

# robots.txt handling. NOTE: this is set to False, so robots.txt rules are
# NOT obeyed by this crawler.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# A non-zero delay is configured here to throttle requests to the site.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tripadvisor.middlewares.TripadvisorSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tripadvisor.middlewares.TripadvisorDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Only the CSV-writing pipeline is enabled; 300 is its ordering value.
ITEM_PIPELINES = {
    'tripadvisor.pipelines.WriteItemPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

## tripadvisor_spider.py
from scrapy import Spider, Request
from tripadvisor.items import TripadvisorItem


class TripadvisorSpider(Spider):
    """Scrape the Miami Beach hotel listings and each hotel's detail page.

    Crawl flow: ``parse`` requests the paginated listing pages,
    ``parse_result_page`` follows every hotel link on a listing page, and
    ``parse_detail_page`` turns one hotel page into a ``TripadvisorItem``.
    """
    name = 'tripadvisor_spider'
    allowed_domains = ['www.tripadvisor.com']
    start_urls = ['https://www.tripadvisor.com/Hotels-g34439-Miami_Beach_Florida-Hotels.html']

    # Listing pagination: the ``oa<N>`` URL segment advances in steps of
    # RESULTS_PER_PAGE, and a fixed number of listing pages is requested.
    RESULTS_PER_PAGE = 30
    RESULT_PAGE_COUNT = 14

    def parse(self, response):
        """Yield one request per listing page (offsets 0, 30, ..., 390)."""
        for page_index in range(self.RESULT_PAGE_COUNT):
            offset = page_index * self.RESULTS_PER_PAGE
            url = f'https://www.tripadvisor.com/Hotels-g34439-oa{offset}-Miami_Beach_Florida-Hotels.html'
            yield Request(url=url, callback=self.parse_result_page)

    def parse_result_page(self, response):
        """Yield a request for every hotel detail link on a listing page."""
        detail_urls = response.xpath('//div[@class="listing_title"]/a/@href').extract()
        for path in detail_urls:
            # The extracted hrefs are site-relative, so prefix the host.
            url = 'https://www.tripadvisor.com{}'.format(path)
            yield Request(url=url, callback=self.parse_detail_page)

    @staticmethod
    def _parse_review_count(texts):
        """Return the integer in the first review-count text, or None.

        Keeps only the decimal digits, so "1,234 reviews" and the singular
        "1 review" both parse.  Returns None when ``texts`` is empty or the
        first entry has no digits, instead of raising and losing the item.
        """
        if not texts:
            return None
        digits = ''.join(ch for ch in texts[0] if ch.isdecimal())
        return int(digits) if digits else None

    def parse_detail_page(self, response):
        """Build and yield a ``TripadvisorItem`` from a hotel detail page."""
        item = TripadvisorItem()
        item['hotel_name'] = response.xpath('//h1[@class="_1mTlpMC3"]/text()').extract()
        # Only the first per-night price found on the page is kept.
        item['start_price'] = response.xpath('//div[@data-provider!=""]/@data-pernight').extract_first()
        item['hotel_rating'] = response.xpath('//span[@class="_3cjYfwwQ"]/text()').extract()
        review_texts = response.xpath('//span[@class="_33O9dg0j"]/text()').extract()
        item['hotel_reviews'] = self._parse_review_count(review_texts)
        item['hotel_amenities'] = response.xpath('//div[@class="_2rdvbNSg"]/text()').extract()
        item['pop_review_words'] = response.xpath('//button[@class="ui_button secondary small H5_EAgqY"]/text()').extract()
        item['nearby_restaurant'] = response.xpath('//span[@class="oPMurIUj TrfXbt7b"]/text()').extract()
        item['walkable_rating'] = response.xpath('//span[@class="oPMurIUj _1iwDIdby"]/text()').extract()
        item['loc_attraction'] = response.xpath('//span[@class="oPMurIUj _1WE0iyL_"]/text()').extract()

        yield item

## Tripadvisor_visual_NLP.py
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from textblob import TextBlob
from wordcloud import WordCloud

# `%matplotlib inline` is IPython-only syntax and a SyntaxError in a plain
# .py file.  This is the equivalent call; it is skipped outside IPython.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

plt.style.use('ggplot')

hotel_orig_data = pd.read_csv('tripadvisor_miami_hotels.csv')
hotel_orig_data

# Null count per column.
np.sum(hotel_orig_data.isnull())
hotel_data = hotel_orig_data.copy()
# Rows without a price cannot be used in the price analysis below.
hotel_data = hotel_data.dropna(subset=['start_price'])

hotel_data.describe()
hotel_data['pop_review_words'] = hotel_data['pop_review_words'].fillna("Others")
hotel_data['hotel_amenities'] = hotel_data['hotel_amenities'].fillna("Others")

#Hotel Start_Price Plot
plt.hist(hotel_data['start_price'])

# Data Manipulation
# Brand names that mark a hotel as part of a chain.
CHAIN_BRANDS = (
    'Marriott', 'Hilton', 'Best Western', 'Hyatt', 'Hampton Inn',
    'Holiday Inn', 'Westgate', 'Ritz', 'Crowne Plaza', 'Kimpton',
    'Lexington', 'Pestana', 'Four Points',
)

# na=False: a missing hotel name counts as "no match" instead of producing
# a NaN mask, which cannot be used for boolean selection.
name_has_south = hotel_data['hotel_name'].str.contains('South', na=False)
hotel_data['Hotel_loc'] = np.where(name_has_south, 'South Beach', 'Miami Beach')

has_beach = hotel_data['hotel_amenities'].str.contains(',Beach,')
hotel_data['Beach_front'] = np.where(has_beach, "Y", "N")

is_chain = hotel_data['hotel_name'].str.contains('|'.join(CHAIN_BRANDS), na=False)
is_boutique = hotel_data['pop_review_words'].str.contains('boutique')
# "boutique" in the review keywords wins over a chain-brand match; hotels
# matching neither are local hotels.
hotel_data['Hotel_Grp'] = np.select(
    [is_boutique, is_chain],
    ['Boutique Hotel', 'Chain Hotel'],
    default='Local Hotel',
)

#hotel_grp vs Starting_price
plt.figure(figsize=(12,6))
sns.boxplot(x='Hotel_Grp', y='start_price', data=hotel_data)

# Bar chart: median hotel rating per hotel group
plt.figure(figsize=(12,6))
hotel_data.groupby('Hotel_Grp')['hotel_rating'].median().sort_values(ascending=False).plot.bar(color='b')

# Bar chart: median number of reviews per hotel group
plt.figure(figsize=(12,6))
hotel_data.groupby('Hotel_Grp')['hotel_reviews'].median().sort_values(ascending=False).plot.bar(color='b')

# Bar chart: median starting price per hotel location
plt.figure(figsize=(12,6))
hotel_data.groupby('Hotel_loc')['start_price'].median().sort_values(ascending=False).plot.bar(color='b')


def sentiment_func(x):
    """Add TextBlob ``polarity`` and ``subjectivity`` to a row and return it.

    ``x`` is one row with a ``pop_review_words`` text entry; the two scores
    are stored on the row under the keys ``polarity`` and ``subjectivity``.
    """
    sentiment = TextBlob(x['pop_review_words'])
    x['polarity'] = sentiment.polarity
    x['subjectivity'] = sentiment.subjectivity
    return x


def plot_sentiment_sample(hotels, sample_size):
    """Score a random sample of ``hotels`` and scatter reviews vs. polarity.

    At most ``sample_size`` rows are drawn; the sample is capped at the
    group size, because ``DataFrame.sample`` raises when asked for more rows
    than exist.  An empty group is returned untouched and nothing is plotted.
    """
    if hotels.empty:
        return hotels
    scored = hotels.sample(min(sample_size, len(hotels))).apply(sentiment_func, axis=1)
    scored.plot.scatter('hotel_reviews', 'polarity')
    return scored


def show_wordcloud(review_words):
    """Render a word cloud of the joined ``review_words`` texts.

    Returns the ``WordCloud`` object, or None when there is no text to draw.
    """
    if review_words.empty:
        return None
    cloud = WordCloud(background_color="white", max_words=2000, width=800, height=400)
    cloud.generate(' '.join(review_words))
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()
    return cloud


# Only hotels with more than 100 reviews are used for the text analysis.
well_reviewed = hotel_data.hotel_reviews > 100

# NLP for boutique hotels
hotel_boutique = hotel_data.loc[(hotel_data.Hotel_Grp == "Boutique Hotel") & well_reviewed]
sample = plot_sentiment_sample(hotel_boutique, sample_size=20)

# NLP for local hotels
hotel_others = hotel_data.loc[(hotel_data.Hotel_Grp == "Local Hotel") & well_reviewed]
sample = plot_sentiment_sample(hotel_others, sample_size=100)

# NLP for Chain hotels
hotel_chains = hotel_data.loc[(hotel_data.Hotel_Grp == "Chain Hotel") & well_reviewed]
hotel_chains
sample = plot_sentiment_sample(hotel_chains, sample_size=15)

#wordcloud generation for Hotel_Grp: local, chain, then boutique hotels
for group in (hotel_others, hotel_chains, hotel_boutique):
    wc = show_wordcloud(group['pop_review_words'])


PriceGrid = sns.FacetGrid(hotel_data, col='Hotel_Grp', hue="Hotel_Grp", palette="Set1", height=4)
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# switch to histplot/displot once the minimum seaborn version allows it.
PriceGrid.map(sns.distplot, "start_price")
hotelGrid = sns.FacetGrid(hotel_data, row='Hotel_Grp', col='Hotel_loc', hue='Beach_front', palette="Set1", height=5)
hotelGrid.map(sns.regplot,'start_price','hotel_rating')
hotelGrid.add_legend()


from wordcloud import WordCloud
	import scrapy


	class TripadvisorItem(scrapy.Item):
	    """Record holding the values scraped from one hotel detail page.

	    Every attribute is declared as a ``scrapy.Field``.  The field
	    declarations are indented here so they belong to the class body.
	    """
	    hotel_name = scrapy.Field()
	    start_price = scrapy.Field()
	    hotel_rating = scrapy.Field()
	    hotel_reviews = scrapy.Field()
	    hotel_amenities = scrapy.Field()
	    pop_review_words = scrapy.Field()
	    nearby_restaurant = scrapy.Field()
	    walkable_rating = scrapy.Field()
	    loc_attraction = scrapy.Field()
	from scrapy.exporters import CsvItemExporter


	class WriteItemPipeline(object):
	    """Item pipeline that streams every scraped item into a single CSV file.

	    The file is opened when the spider starts and closed when it
	    finishes.  The handle is released even if the exporter fails while
	    starting or finishing.
	    """

	    def __init__(self):
	        # Relative path: the file is created in the current working directory.
	        self.filename = 'tripadvisor_miami_hotels.csv'

	    def open_spider(self, spider):
	        """Open the output file and start the CSV exporter."""
	        # Binary mode: the exporter is handed a bytes stream.
	        self.csvfile = open(self.filename, 'wb')
	        try:
	            self.exporter = CsvItemExporter(self.csvfile)
	            self.exporter.start_exporting()
	        except Exception:
	            # Do not leave the handle open if the exporter cannot start.
	            self.csvfile.close()
	            raise

	    def close_spider(self, spider):
	        """Finish the export and close the output file."""
	        try:
	            self.exporter.finish_exporting()
	        finally:
	            # Always release the file, even when finishing the export fails.
	            self.csvfile.close()

	    def process_item(self, item, spider):
	        """Write ``item`` to the CSV and return it unchanged for later pipelines."""
	        self.exporter.export_item(item)
	        return item
	# Scrapy settings for tripadvisor project
	#
	# For simplicity, this file contains only settings considered important or
	# commonly used. You can find more settings consulting the documentation:
	#
	# https://docs.scrapy.org/en/latest/topics/settings.html
	# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
	# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

	# Name of this Scrapy project/bot.
	BOT_NAME = 'tripadvisor'

	# Module path of the project's spiders; the same package is used for both
	# settings (see the Scrapy settings reference for their exact roles).
	SPIDER_MODULES = ['tripadvisor.spiders']
	NEWSPIDER_MODULE = 'tripadvisor.spiders'


	# Crawl responsibly by identifying yourself (and your website) on the user-agent
	#USER_AGENT = 'tripadvisor (+http://www.yourdomain.com)'

	# robots.txt handling. NOTE: this is set to False, so robots.txt rules are
	# NOT obeyed by this crawler.
	ROBOTSTXT_OBEY = False

	# Configure maximum concurrent requests performed by Scrapy (default: 16)
	#CONCURRENT_REQUESTS = 32

	# Configure a delay for requests for the same website (default: 0)
	# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
	# See also autothrottle settings and docs
	# A non-zero delay is configured here to throttle requests to the site.
	DOWNLOAD_DELAY = 3
	# The download delay setting will honor only one of:
	#CONCURRENT_REQUESTS_PER_DOMAIN = 16
	#CONCURRENT_REQUESTS_PER_IP = 16

	# Disable cookies (enabled by default)
	#COOKIES_ENABLED = False

	# Disable Telnet Console (enabled by default)
	#TELNETCONSOLE_ENABLED = False

	# Override the default request headers:
	#DEFAULT_REQUEST_HEADERS = {
	# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8',
	# 'Accept-Language': 'en',
	#}

	# Enable or disable spider middlewares
	# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
	#SPIDER_MIDDLEWARES = {
	# 'tripadvisor.middlewares.TripadvisorSpiderMiddleware': 543,
	#}

	# Enable or disable downloader middlewares
	# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
	#DOWNLOADER_MIDDLEWARES = {
	# 'tripadvisor.middlewares.TripadvisorDownloaderMiddleware': 543,
	#}

	# Enable or disable extensions
	# See https://docs.scrapy.org/en/latest/topics/extensions.html
	#EXTENSIONS = {
	# 'scrapy.extensions.telnet.TelnetConsole': None,
	#}

	# Configure item pipelines
	# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
	# Only the CSV-writing pipeline is enabled; 300 is its ordering value.
	ITEM_PIPELINES = {
	'tripadvisor.pipelines.WriteItemPipeline': 300,
	}

	# Enable and configure the AutoThrottle extension (disabled by default)
	# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
	#AUTOTHROTTLE_ENABLED = True
	# The initial download delay
	#AUTOTHROTTLE_START_DELAY = 5
	# The maximum download delay to be set in case of high latencies
	#AUTOTHROTTLE_MAX_DELAY = 60
	# The average number of requests Scrapy should be sending in parallel to
	# each remote server
	#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
	# Enable showing throttling stats for every response received:
	#AUTOTHROTTLE_DEBUG = False

	# Enable and configure HTTP caching (disabled by default)
	# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
	#HTTPCACHE_ENABLED = True
	#HTTPCACHE_EXPIRATION_SECS = 0
	#HTTPCACHE_DIR = 'httpcache'
	#HTTPCACHE_IGNORE_HTTP_CODES = []
	#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
	from scrapy import Spider, Request
	from tripadvisor.items import TripadvisorItem


	class TripadvisorSpider(Spider):
	    """Scrape the Miami Beach hotel listings and each hotel's detail page.

	    Crawl flow: ``parse`` requests the paginated listing pages,
	    ``parse_result_page`` follows every hotel link on a listing page, and
	    ``parse_detail_page`` turns one hotel page into a ``TripadvisorItem``.
	    """
	    name = 'tripadvisor_spider'
	    allowed_domains = ['www.tripadvisor.com']
	    start_urls = ['https://www.tripadvisor.com/Hotels-g34439-Miami_Beach_Florida-Hotels.html']

	    # Listing pagination: the ``oa<N>`` URL segment advances in steps of
	    # RESULTS_PER_PAGE, and a fixed number of listing pages is requested.
	    RESULTS_PER_PAGE = 30
	    RESULT_PAGE_COUNT = 14

	    def parse(self, response):
	        """Yield one request per listing page (offsets 0, 30, ..., 390)."""
	        for page_index in range(self.RESULT_PAGE_COUNT):
	            offset = page_index * self.RESULTS_PER_PAGE
	            url = f'https://www.tripadvisor.com/Hotels-g34439-oa{offset}-Miami_Beach_Florida-Hotels.html'
	            yield Request(url=url, callback=self.parse_result_page)

	    def parse_result_page(self, response):
	        """Yield a request for every hotel detail link on a listing page."""
	        detail_urls = response.xpath('//div[@class="listing_title"]/a/@href').extract()
	        for path in detail_urls:
	            # The extracted hrefs are site-relative, so prefix the host.
	            url = 'https://www.tripadvisor.com{}'.format(path)
	            yield Request(url=url, callback=self.parse_detail_page)

	    @staticmethod
	    def _parse_review_count(texts):
	        """Return the integer in the first review-count text, or None.

	        Keeps only the decimal digits, so "1,234 reviews" and the singular
	        "1 review" both parse.  Returns None when ``texts`` is empty or the
	        first entry has no digits, instead of raising and losing the item.
	        """
	        if not texts:
	            return None
	        digits = ''.join(ch for ch in texts[0] if ch.isdecimal())
	        return int(digits) if digits else None

	    def parse_detail_page(self, response):
	        """Build and yield a ``TripadvisorItem`` from a hotel detail page."""
	        item = TripadvisorItem()
	        item['hotel_name'] = response.xpath('//h1[@class="_1mTlpMC3"]/text()').extract()
	        # Only the first per-night price found on the page is kept.
	        item['start_price'] = response.xpath('//div[@data-provider!=""]/@data-pernight').extract_first()
	        item['hotel_rating'] = response.xpath('//span[@class="_3cjYfwwQ"]/text()').extract()
	        review_texts = response.xpath('//span[@class="_33O9dg0j"]/text()').extract()
	        item['hotel_reviews'] = self._parse_review_count(review_texts)
	        item['hotel_amenities'] = response.xpath('//div[@class="_2rdvbNSg"]/text()').extract()
	        item['pop_review_words'] = response.xpath('//button[@class="ui_button secondary small H5_EAgqY"]/text()').extract()
	        item['nearby_restaurant'] = response.xpath('//span[@class="oPMurIUj TrfXbt7b"]/text()').extract()
	        item['walkable_rating'] = response.xpath('//span[@class="oPMurIUj _1iwDIdby"]/text()').extract()
	        item['loc_attraction'] = response.xpath('//span[@class="oPMurIUj _1WE0iyL_"]/text()').extract()

	        yield item
	import numpy as np
	import pandas as pd
	import seaborn as sns
	from matplotlib import pyplot as plt
	from textblob import TextBlob
	from wordcloud import WordCloud

	# `%matplotlib inline` is IPython-only syntax and a SyntaxError in a plain
	# .py file.  This is the equivalent call; it is skipped outside IPython.
	try:
	    get_ipython().run_line_magic('matplotlib', 'inline')
	except NameError:
	    pass

	plt.style.use('ggplot')

	hotel_orig_data = pd.read_csv('tripadvisor_miami_hotels.csv')
	hotel_orig_data

	# Null count per column.
	np.sum(hotel_orig_data.isnull())
	hotel_data = hotel_orig_data.copy()
	# Rows without a price cannot be used in the price analysis below.
	hotel_data = hotel_data.dropna(subset=['start_price'])

	hotel_data.describe()
	hotel_data['pop_review_words'] = hotel_data['pop_review_words'].fillna("Others")
	hotel_data['hotel_amenities'] = hotel_data['hotel_amenities'].fillna("Others")

	#Hotel Start_Price Plot
	plt.hist(hotel_data['start_price'])

	# Data Manipulation
	# Brand names that mark a hotel as part of a chain.
	CHAIN_BRANDS = (
	    'Marriott', 'Hilton', 'Best Western', 'Hyatt', 'Hampton Inn',
	    'Holiday Inn', 'Westgate', 'Ritz', 'Crowne Plaza', 'Kimpton',
	    'Lexington', 'Pestana', 'Four Points',
	)

	# na=False: a missing hotel name counts as "no match" instead of producing
	# a NaN mask, which cannot be used for boolean selection.
	name_has_south = hotel_data['hotel_name'].str.contains('South', na=False)
	hotel_data['Hotel_loc'] = np.where(name_has_south, 'South Beach', 'Miami Beach')

	has_beach = hotel_data['hotel_amenities'].str.contains(',Beach,')
	hotel_data['Beach_front'] = np.where(has_beach, "Y", "N")

	is_chain = hotel_data['hotel_name'].str.contains('|'.join(CHAIN_BRANDS), na=False)
	is_boutique = hotel_data['pop_review_words'].str.contains('boutique')
	# "boutique" in the review keywords wins over a chain-brand match; hotels
	# matching neither are local hotels.
	hotel_data['Hotel_Grp'] = np.select(
	    [is_boutique, is_chain],
	    ['Boutique Hotel', 'Chain Hotel'],
	    default='Local Hotel',
	)

	#hotel_grp vs Starting_price
	plt.figure(figsize=(12,6))
	sns.boxplot(x='Hotel_Grp', y='start_price', data=hotel_data)

	# Bar chart: median hotel rating per hotel group
	plt.figure(figsize=(12,6))
	hotel_data.groupby('Hotel_Grp')['hotel_rating'].median().sort_values(ascending=False).plot.bar(color='b')

	# Bar chart: median number of reviews per hotel group
	plt.figure(figsize=(12,6))
	hotel_data.groupby('Hotel_Grp')['hotel_reviews'].median().sort_values(ascending=False).plot.bar(color='b')

	# Bar chart: median starting price per hotel location
	plt.figure(figsize=(12,6))
	hotel_data.groupby('Hotel_loc')['start_price'].median().sort_values(ascending=False).plot.bar(color='b')


	def sentiment_func(x):
	    """Add TextBlob ``polarity`` and ``subjectivity`` to a row and return it.

	    ``x`` is one row with a ``pop_review_words`` text entry; the two scores
	    are stored on the row under the keys ``polarity`` and ``subjectivity``.
	    """
	    sentiment = TextBlob(x['pop_review_words'])
	    x['polarity'] = sentiment.polarity
	    x['subjectivity'] = sentiment.subjectivity
	    return x


	def plot_sentiment_sample(hotels, sample_size):
	    """Score a random sample of ``hotels`` and scatter reviews vs. polarity.

	    At most ``sample_size`` rows are drawn; the sample is capped at the
	    group size, because ``DataFrame.sample`` raises when asked for more rows
	    than exist.  An empty group is returned untouched and nothing is plotted.
	    """
	    if hotels.empty:
	        return hotels
	    scored = hotels.sample(min(sample_size, len(hotels))).apply(sentiment_func, axis=1)
	    scored.plot.scatter('hotel_reviews', 'polarity')
	    return scored


	def show_wordcloud(review_words):
	    """Render a word cloud of the joined ``review_words`` texts.

	    Returns the ``WordCloud`` object, or None when there is no text to draw.
	    """
	    if review_words.empty:
	        return None
	    cloud = WordCloud(background_color="white", max_words=2000, width=800, height=400)
	    cloud.generate(' '.join(review_words))
	    plt.figure(figsize=(12, 6))
	    plt.imshow(cloud, interpolation='bilinear')
	    plt.axis("off")
	    plt.show()
	    return cloud


	# Only hotels with more than 100 reviews are used for the text analysis.
	well_reviewed = hotel_data.hotel_reviews > 100

	# NLP for boutique hotels
	hotel_boutique = hotel_data.loc[(hotel_data.Hotel_Grp == "Boutique Hotel") & well_reviewed]
	sample = plot_sentiment_sample(hotel_boutique, sample_size=20)

	# NLP for local hotels
	hotel_others = hotel_data.loc[(hotel_data.Hotel_Grp == "Local Hotel") & well_reviewed]
	sample = plot_sentiment_sample(hotel_others, sample_size=100)

	# NLP for Chain hotels
	hotel_chains = hotel_data.loc[(hotel_data.Hotel_Grp == "Chain Hotel") & well_reviewed]
	hotel_chains
	sample = plot_sentiment_sample(hotel_chains, sample_size=15)

	#wordcloud generation for Hotel_Grp: local, chain, then boutique hotels
	for group in (hotel_others, hotel_chains, hotel_boutique):
	    wc = show_wordcloud(group['pop_review_words'])


	PriceGrid = sns.FacetGrid(hotel_data, col='Hotel_Grp', hue="Hotel_Grp", palette="Set1", height=4)
	# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
	# switch to histplot/displot once the minimum seaborn version allows it.
	PriceGrid.map(sns.distplot, "start_price")
	hotelGrid = sns.FacetGrid(hotel_data, row='Hotel_Grp', col='Hotel_loc', hue='Beach_front', palette="Set1", height=5)
	hotelGrid.map(sns.regplot,'start_price','hotel_rating')
	hotelGrid.add_legend()


	from wordcloud import WordCloud