Skip to content

Instantly share code, notes, and snippets.

@mrt-kousha
Created January 3, 2019 13:10
Show Gist options
  • Save mrt-kousha/649610de70512d448cafea759ac0ee5e to your computer and use it in GitHub Desktop.
Save mrt-kousha/649610de70512d448cafea759ac0ee5e to your computer and use it in GitHub Desktop.
virgool-tutorial
##
import scrapy
class HotelExtractor(scrapy.Spider):
    """Spider that scrapes hotel names and locations from hotelyar.com.

    The crawl has two stages: the listing page is fetched and scanned for
    hotel detail links, then every detail page is parsed into one item with
    the keys ``hotel_name``, ``hotel_location`` and ``base_url``.
    """

    name = 'hotel_info'
    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 1,
        # The spider deliberately does not consult robots.txt.
        'ROBOTSTXT_OBEY': False,
    }

    def start_requests(self):
        """Yield the single seed request for the hotel listing page."""
        # Percent-encoded Persian slug of the listing page
        # (presumably "list of hotels of Iran" -- verify against the site).
        url = "https://hotelyar.com/%D9%84%DB%8C%D8%B3%D8%AA-%D9%87%D8%AA%D9%84-%D9%87%D8%A7%DB%8C-%D8%A7%DB%8C%D8%B1%D8%A7%D9%86"
        yield scrapy.Request(url=url, callback=self.extract_hotel_urls, dont_filter=True)

    def extract_hotel_urls(self, response):
        """Yield one request per distinct hotel detail link found on the page.

        Args:
            response: The downloaded listing page.

        Yields:
            ``scrapy.Request`` objects whose callback is
            ``extract_hotel_info``.
        """
        hrefs = response.xpath('//a/@href').extract()
        # dict.fromkeys de-duplicates; on Python 3.7+ it also keeps document
        # order, so the crawl order is reproducible (a plain set is not).
        for href in dict.fromkeys(hrefs):
            # Heuristic: a detail link contains 'hotel' exactly twice
            # (presumably once in the domain and once in the path -- verify
            # against the site's URL scheme).
            if href.count('hotel') != 2:
                continue
            # Resolve relative hrefs against the page URL; scrapy.Request
            # raises ValueError("Missing scheme") on a bare relative path,
            # which would abort this generator and drop the remaining links.
            target = response.urljoin(href)
            if not target.startswith(('http://', 'https://')):
                # e.g. mailto: or javascript: links cannot be downloaded.
                continue
            yield scrapy.Request(url=target, callback=self.extract_hotel_info, dont_filter=True)

    def extract_hotel_info(self, response):
        """Yield a single item describing the hotel on a detail page.

        Args:
            response: The downloaded hotel detail page.

        Yields:
            A dict with ``hotel_name`` and ``hotel_location`` (each ``None``
            when the corresponding XPath matches nothing) and ``base_url``
            (the URL of the response).
        """
        yield {
            'hotel_name': response.xpath('//div[@class="hotel-ltl-title"]/h1/text()').extract_first(),
            'hotel_location': response.xpath('//span[@class="hotel-lt-details"]/span/text()').extract_first(),
            'base_url': response.url,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment