scrapehero-code/tripadvisor.py

## tripadvisor.py
# -*- coding: utf-8 -*-
import scrapy
from csv import DictReader
from os import path
from tripadvisor_restaurants.items import TripadvisorRestaurantsItem
from urllib.parse import urljoin

class TripadvisorRestaurantsSpiderSpider(scrapy.Spider):
    """Scrape restaurant summaries from Tripadvisor listing pages.

    Listing-page URLs are read from ``../resources/urls.csv`` (relative to
    this module), which must contain a ``url`` column. One
    ``TripadvisorRestaurantsItem`` is yielded per restaurant block found on
    each listing page.
    """

    name = 'tripadvisor_restaurants_spider'
    allowed_domains = ['tripadvisor.com']
    base_url = 'http://tripadvisor.com'
    start_urls = ['http://tripadvisor.com/']

    # XPath selecting one node per restaurant on a listing page.
    XPATH_RESULTS = "//div[@id='EATERY_SEARCH_RESULTS']/div[contains(@class,'listing')]"

    # XPaths evaluated relative to a single restaurant node. They are kept
    # here so they are defined once rather than on every loop iteration.
    XPATH_NAME = './/a[@class="property_title"]/text()'
    XPATH_LINK = './/a[@class="property_title"]/@href'
    XPATH_REVIEW_COUNT = './/span[@class="reviewCount"]//text()'
    XPATH_RANK = './/div[@class="popIndexBlock"]//text()'
    XPATH_PRICING = './/span[@class="item price"]//text()'
    XPATH_CATEGORY = './/a[@class="item cuisine"]//text()'
    XPATH_RATING = './/div[contains(@class,"rating")]//span[contains(@class,"rating")]/@alt'

    def start_requests(self):
        """Yield one request per listing-page URL found in ``urls.csv``.

        The file is looked up at ``../resources/urls.csv`` relative to this
        module; each row's ``url`` column is requested and handed to
        :meth:`parse`.
        """
        urls_path = path.join(path.dirname(__file__), "../resources/urls.csv")
        # newline='' lets the csv module handle line endings itself, as the
        # csv documentation recommends for files passed to a reader.
        with open(urls_path, newline='', encoding='utf-8') as urls:
            for row in DictReader(urls):
                yield scrapy.Request(row["url"], callback=self.parse)

    def clean(self, text):
        """Join text fragments and collapse every run of whitespace.

        Args:
            text: A string or a list of strings, e.g. the result of
                ``.extract()``.

        Returns:
            The fragments concatenated, with newlines, tabs and repeated
            spaces reduced to single spaces and the ends trimmed, or
            ``None`` when ``text`` is empty.
        """
        if text:
            return ' '.join(''.join(text).split())
        return None

    def get_rating(self, raw_rating):
        """Return the rating with the "of 5 bubbles" suffix removed.

        Args:
            raw_rating: List of extracted ``alt`` strings, e.g.
                ``["4.5 of 5 bubbles"]``.

        Returns:
            The remaining text (``"4.5"`` for the example above), or
            ``None`` when nothing was extracted.
        """
        if raw_rating:
            # strip() drops the space left behind once the suffix is removed.
            return ''.join(raw_rating).replace("of 5 bubbles", "").strip()
        return None

    def get_category(self, raw_category):
        """Return the extracted categories as one comma-separated string.

        Returns ``None`` when ``raw_category`` is empty.
        """
        if raw_category:
            return ','.join(raw_category)
        return None

    def get_review(self, raw_review_count):
        """Return the review count text without the word "reviews".

        Args:
            raw_review_count: List of extracted text fragments, e.g.
                ``["\\n123 reviews\\n"]``.

        Returns:
            The cleaned text with "reviews" removed and surrounding
            whitespace trimmed (``"123"`` for the example above), or
            ``None`` when there is no text.
        """
        cleaned_review_count = self.clean(raw_review_count)
        if cleaned_review_count:
            # strip() drops the space left behind once "reviews" is removed.
            return cleaned_review_count.replace('reviews', '').strip()
        return None

    def get_absolute_url(self, relative_url):
        """Resolve ``relative_url`` against the spider's ``base_url``.

        Returns ``None`` when ``relative_url`` is empty, so that a listing
        without a link is not reported as pointing at the site root.
        """
        if not relative_url:
            return None
        return urljoin(self.base_url, relative_url)

    def parse(self, response):
        """Parse a listing page and yield one item per restaurant.

        Args:
            response: The response for a listing page.

        Yields:
            ``TripadvisorRestaurantsItem`` instances with the keys ``name``,
            ``url``, ``reviews``, ``rank``, ``price_range``, ``category``,
            ``rating`` and ``listing_page``. Fields whose XPath matched
            nothing are ``None``.
        """
        for restaurant in response.xpath(self.XPATH_RESULTS):
            # Getting data from XPath.
            raw_name = restaurant.xpath(self.XPATH_NAME).extract()
            raw_link = restaurant.xpath(self.XPATH_LINK).extract()
            raw_review_count = restaurant.xpath(self.XPATH_REVIEW_COUNT).extract()
            raw_rank = restaurant.xpath(self.XPATH_RANK).extract()
            raw_pricing = restaurant.xpath(self.XPATH_PRICING).extract()
            raw_category = restaurant.xpath(self.XPATH_CATEGORY).extract()
            raw_rating = restaurant.xpath(self.XPATH_RATING).extract()

            # Cleaning data.
            restaurant_data = {
                'name': self.clean(raw_name),
                'url': self.get_absolute_url(self.clean(raw_link)),
                'reviews': self.get_review(raw_review_count),
                'rank': self.clean(raw_rank),
                'price_range': self.clean(raw_pricing),
                'category': self.get_category(raw_category),
                'rating': self.get_rating(raw_rating),
                'listing_page': response.url,
            }
            yield TripadvisorRestaurantsItem(**restaurant_data)
	# -*- coding: utf-8 -*-
	import scrapy
	from csv import DictReader
	from os import path
	from tripadvisor_restaurants.items import TripadvisorRestaurantsItem
	from urllib.parse import urljoin

	class TripadvisorRestaurantsSpiderSpider(scrapy.Spider):
	    """Scrape restaurant summaries from Tripadvisor listing pages.

	    Listing-page URLs are read from ``../resources/urls.csv`` (relative to
	    this module), which must contain a ``url`` column. One
	    ``TripadvisorRestaurantsItem`` is yielded per restaurant block found on
	    each listing page.
	    """

	    name = 'tripadvisor_restaurants_spider'
	    allowed_domains = ['tripadvisor.com']
	    base_url = 'http://tripadvisor.com'
	    start_urls = ['http://tripadvisor.com/']

	    # XPath selecting one node per restaurant on a listing page.
	    XPATH_RESULTS = "//div[@id='EATERY_SEARCH_RESULTS']/div[contains(@class,'listing')]"

	    # XPaths evaluated relative to a single restaurant node. They are kept
	    # here so they are defined once rather than on every loop iteration.
	    XPATH_NAME = './/a[@class="property_title"]/text()'
	    XPATH_LINK = './/a[@class="property_title"]/@href'
	    XPATH_REVIEW_COUNT = './/span[@class="reviewCount"]//text()'
	    XPATH_RANK = './/div[@class="popIndexBlock"]//text()'
	    XPATH_PRICING = './/span[@class="item price"]//text()'
	    XPATH_CATEGORY = './/a[@class="item cuisine"]//text()'
	    XPATH_RATING = './/div[contains(@class,"rating")]//span[contains(@class,"rating")]/@alt'

	    def start_requests(self):
	        """Yield one request per listing-page URL found in ``urls.csv``.

	        The file is looked up at ``../resources/urls.csv`` relative to this
	        module; each row's ``url`` column is requested and handed to
	        :meth:`parse`.
	        """
	        urls_path = path.join(path.dirname(__file__), "../resources/urls.csv")
	        # newline='' lets the csv module handle line endings itself, as the
	        # csv documentation recommends for files passed to a reader.
	        with open(urls_path, newline='', encoding='utf-8') as urls:
	            for row in DictReader(urls):
	                yield scrapy.Request(row["url"], callback=self.parse)

	    def clean(self, text):
	        """Join text fragments and collapse every run of whitespace.

	        Args:
	            text: A string or a list of strings, e.g. the result of
	                ``.extract()``.

	        Returns:
	            The fragments concatenated, with newlines, tabs and repeated
	            spaces reduced to single spaces and the ends trimmed, or
	            ``None`` when ``text`` is empty.
	        """
	        if text:
	            return ' '.join(''.join(text).split())
	        return None

	    def get_rating(self, raw_rating):
	        """Return the rating with the "of 5 bubbles" suffix removed.

	        Args:
	            raw_rating: List of extracted ``alt`` strings, e.g.
	                ``["4.5 of 5 bubbles"]``.

	        Returns:
	            The remaining text (``"4.5"`` for the example above), or
	            ``None`` when nothing was extracted.
	        """
	        if raw_rating:
	            # strip() drops the space left behind once the suffix is removed.
	            return ''.join(raw_rating).replace("of 5 bubbles", "").strip()
	        return None

	    def get_category(self, raw_category):
	        """Return the extracted categories as one comma-separated string.

	        Returns ``None`` when ``raw_category`` is empty.
	        """
	        if raw_category:
	            return ','.join(raw_category)
	        return None

	    def get_review(self, raw_review_count):
	        """Return the review count text without the word "reviews".

	        Args:
	            raw_review_count: List of extracted text fragments, e.g.
	                ``["\\n123 reviews\\n"]``.

	        Returns:
	            The cleaned text with "reviews" removed and surrounding
	            whitespace trimmed (``"123"`` for the example above), or
	            ``None`` when there is no text.
	        """
	        cleaned_review_count = self.clean(raw_review_count)
	        if cleaned_review_count:
	            # strip() drops the space left behind once "reviews" is removed.
	            return cleaned_review_count.replace('reviews', '').strip()
	        return None

	    def get_absolute_url(self, relative_url):
	        """Resolve ``relative_url`` against the spider's ``base_url``.

	        Returns ``None`` when ``relative_url`` is empty, so that a listing
	        without a link is not reported as pointing at the site root.
	        """
	        if not relative_url:
	            return None
	        return urljoin(self.base_url, relative_url)

	    def parse(self, response):
	        """Parse a listing page and yield one item per restaurant.

	        Args:
	            response: The response for a listing page.

	        Yields:
	            ``TripadvisorRestaurantsItem`` instances with the keys ``name``,
	            ``url``, ``reviews``, ``rank``, ``price_range``, ``category``,
	            ``rating`` and ``listing_page``. Fields whose XPath matched
	            nothing are ``None``.
	        """
	        for restaurant in response.xpath(self.XPATH_RESULTS):
	            # Getting data from XPath.
	            raw_name = restaurant.xpath(self.XPATH_NAME).extract()
	            raw_link = restaurant.xpath(self.XPATH_LINK).extract()
	            raw_review_count = restaurant.xpath(self.XPATH_REVIEW_COUNT).extract()
	            raw_rank = restaurant.xpath(self.XPATH_RANK).extract()
	            raw_pricing = restaurant.xpath(self.XPATH_PRICING).extract()
	            raw_category = restaurant.xpath(self.XPATH_CATEGORY).extract()
	            raw_rating = restaurant.xpath(self.XPATH_RATING).extract()

	            # Cleaning data.
	            restaurant_data = {
	                'name': self.clean(raw_name),
	                'url': self.get_absolute_url(self.clean(raw_link)),
	                'reviews': self.get_review(raw_review_count),
	                'rank': self.clean(raw_rank),
	                'price_range': self.clean(raw_pricing),
	                'category': self.get_category(raw_category),
	                'rating': self.get_rating(raw_rating),
	                'listing_page': response.url,
	            }
	            yield TripadvisorRestaurantsItem(**restaurant_data)