Created
January 3, 2019 13:10
-
-
Save mrt-kousha/649610de70512d448cafea759ac0ee5e to your computer and use it in GitHub Desktop.
virgool-tutorial
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## | |
import scrapy | |
class HotelExtractor(scrapy.Spider):
    """Spider that scrapes hotel names and locations from hotelyar.com.

    The crawl has two hops: the seed listing page is scanned for hotel
    links, and each hotel page is then reduced to a single item with the
    keys ``hotel_name``, ``hotel_location`` and ``base_url``.
    """

    name = 'hotel_info'
    custom_settings = {
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 1,
        'ROBOTSTXT_OBEY': False,
    }

    def start_requests(self):
        """Yield the single seed request for the hotel listing page.

        The path is percent-encoded Persian text (roughly "list of hotels
        of Iran"); it is kept encoded so the request URL is plain ASCII.
        """
        url = "https://hotelyar.com/%D9%84%DB%8C%D8%B3%D8%AA-%D9%87%D8%AA%D9%84-%D9%87%D8%A7%DB%8C-%D8%A7%DB%8C%D8%B1%D8%A7%D9%86"
        yield scrapy.Request(url=url, callback=self.extract_hotel_urls, dont_filter=True)

    def extract_hotel_urls(self, response):
        """Yield one request per distinct hotel link found on the page.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: A request for each selected link, handled by
            ``extract_hotel_info``.
        """
        # A set drops repeated hrefs; the requests use dont_filter=True, so
        # Scrapy's own duplicate filter would not remove them.
        hrefs = set(response.xpath('//a/@href').extract())
        for href in hrefs:
            # Keep only hrefs containing 'hotel' exactly twice (same test as
            # len(href.split('hotel')) == 3). NOTE(review): presumably one
            # occurrence comes from the "hotelyar.com" domain and the other
            # from the hotel page path -- confirm against the live markup.
            if href.count('hotel') != 2:
                continue
            # Resolve against the page URL: scrapy.Request rejects URLs
            # without a scheme, so a relative href would otherwise raise
            # ValueError. Absolute URLs pass through urljoin unchanged.
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.extract_hotel_info,
                dont_filter=True,
            )

    def extract_hotel_info(self, response):
        """Yield the scraped item for a single hotel page.

        Args:
            response: The downloaded hotel page.

        Yields:
            dict: ``hotel_name`` and ``hotel_location`` (each ``None`` when
            the XPath matches nothing) plus ``base_url``, the page URL.
        """
        yield {
            'hotel_name': response.xpath('//div[@class="hotel-ltl-title"]/h1/text()').extract_first(),
            'hotel_location': response.xpath('//span[@class="hotel-lt-details"]/span/text()').extract_first(),
            'base_url': response.url,
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment