# mrt-kousha/hotels.py

## hotels.py
##
import scrapy

class HotelExtractor(scrapy.Spider):
    """Spider that scrapes hotel names and locations from hotelyar.com.

    The crawl fetches one start page, follows every link on it that looks
    like a hotel page, and yields one item per page followed.
    """

    name = 'hotel_info'

    # Settings applied to this spider only.
    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 1,
        'ROBOTSTXT_OBEY': False,
    }

    def start_requests(self):
        """Yield the single request for the start page."""
        url = "https://hotelyar.com/%D9%84%DB%8C%D8%B3%D8%AA-%D9%87%D8%AA%D9%84-%D9%87%D8%A7%DB%8C-%D8%A7%DB%8C%D8%B1%D8%A7%D9%86"

        yield scrapy.Request(url=url, callback=self.extract_hotel_urls, dont_filter=True)

    def extract_hotel_urls(self, response):
        """Yield one request per distinct hotel link found in ``response``.

        A link is followed when its raw ``href`` contains the substring
        ``'hotel'`` exactly twice.

        Args:
            response: The downloaded start page.

        Yields:
            scrapy.Request: A request handled by ``extract_hotel_info``.
        """
        hrefs = response.xpath('//a/@href').extract()

        # dict.fromkeys drops duplicate hrefs while keeping document order,
        # so requests are scheduled in a reproducible order.
        for href in dict.fromkeys(hrefs):
            if href.count('hotel') == 2:
                # Resolve against the page URL: scrapy.Request rejects URLs
                # without a scheme, so a relative href would otherwise raise
                # ValueError and abort the rest of this callback.
                yield scrapy.Request(
                    url=response.urljoin(href),
                    callback=self.extract_hotel_info,
                    dont_filter=True,
                )

    def extract_hotel_info(self, response):
        """Yield one item describing the hotel page in ``response``.

        Args:
            response: A downloaded hotel page.

        Yields:
            dict: ``hotel_name`` and ``hotel_location`` (each ``None`` when
            its XPath matches nothing) and ``base_url``, the URL of the
            response.
        """
        yield {
            'hotel_name': response.xpath('//div[@class="hotel-ltl-title"]/h1/text()').extract_first(),
            'hotel_location': response.xpath('//span[@class="hotel-lt-details"]/span/text()').extract_first(),
            'base_url': response.url,
        }
	##
	import scrapy

	class HotelExtractor(scrapy.Spider):
		"""Spider that scrapes hotel names and locations from hotelyar.com.

		The crawl fetches one start page, follows every link on it that looks
		like a hotel page, and yields one item per page followed.
		"""

		# NOTE(review): this file defines HotelExtractor twice; this is the
		# second definition, whose nesting had been flattened to one level.

		name = 'hotel_info'

		# Settings applied to this spider only.
		custom_settings = {
			'CONCURRENT_REQUESTS': 16,
			'DOWNLOAD_DELAY': 1,
			'ROBOTSTXT_OBEY': False,
		}

		def start_requests(self):
			"""Yield the single request for the start page."""
			url = "https://hotelyar.com/%D9%84%DB%8C%D8%B3%D8%AA-%D9%87%D8%AA%D9%84-%D9%87%D8%A7%DB%8C-%D8%A7%DB%8C%D8%B1%D8%A7%D9%86"

			yield scrapy.Request(url=url, callback=self.extract_hotel_urls, dont_filter=True)

		def extract_hotel_urls(self, response):
			"""Yield one request per distinct hotel link found in ``response``.

			A link is followed when its raw ``href`` contains the substring
			``'hotel'`` exactly twice.

			Args:
				response: The downloaded start page.

			Yields:
				scrapy.Request: A request handled by ``extract_hotel_info``.
			"""
			hrefs = response.xpath('//a/@href').extract()

			# dict.fromkeys drops duplicate hrefs while keeping document order,
			# so requests are scheduled in a reproducible order.
			for href in dict.fromkeys(hrefs):
				if href.count('hotel') == 2:
					# Resolve against the page URL: scrapy.Request rejects URLs
					# without a scheme, so a relative href would otherwise raise
					# ValueError and abort the rest of this callback.
					yield scrapy.Request(
						url=response.urljoin(href),
						callback=self.extract_hotel_info,
						dont_filter=True,
					)

		def extract_hotel_info(self, response):
			"""Yield one item describing the hotel page in ``response``.

			Args:
				response: A downloaded hotel page.

			Yields:
				dict: ``hotel_name`` and ``hotel_location`` (each ``None`` when
				its XPath matches nothing) and ``base_url``, the URL of the
				response.
			"""
			yield {
				'hotel_name': response.xpath('//div[@class="hotel-ltl-title"]/h1/text()').extract_first(),
				'hotel_location': response.xpath('//span[@class="hotel-lt-details"]/span/text()').extract_first(),
				'base_url': response.url,
			}