# khunreus/airbnb_product_catalog_scraper.py

## airbnb_product_catalog_scraper.py
"""
Scrapy spider, written for Python 3.6.

Goes through the "Homes" page of airbnb.ae and collects, for each property,
its URL, name, price per night and rating.
"""

from time import sleep
import scrapy, selenium
from selenium import webdriver
from scrapy import Spider
from scrapy.selector import Selector
#from scrapy.http import Request
#from selenium.webdriver import ActionChains
#from selenium.webdriver.support.wait import WebDriverWait
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.common.exceptions import TimeoutException
#from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.remote.webdriver import WebDriver

class HomesSpider(Spider):
	"""Spider that renders Airbnb's Dubai "Homes" results in Chrome and scrapes them.

	The Scrapy request to ``start_urls`` is only a trigger: its response is
	ignored and the real page is loaded through Selenium, because the listing
	is an infinite scroll that has to be scrolled to the end first.
	"""
	name = 'homes'
	start_urls = ('http://books.toscrape.com/',) # a dummy website

	# Location of the chromedriver binary. Kept as a class attribute so it can be
	# overridden without editing parse(), e.g. `scrapy crawl homes -a chromedriver_path=...`.
	chromedriver_path = '/path/to_your/chromedriver'

	def parse(self, response):
		"""Yield one dict per listing with 'url', 'name', 'price_local' and 'rating'.

		`response` (the dummy start page) is not used; the HTML comes from the
		Selenium-driven browser.
		"""
		page = Selector(text=self._render_homes_page())
		homes = page.xpath('//*[@itemtype="http://schema.org/ListItem"]') #name of an item can be changed by Airbnb

		# These XPaths start with "//", so they match against the whole document
		# even when evaluated from a single listing node. Evaluate them once and
		# pair the results by position instead of re-running them per listing.
		# airbnb is quite tricky in this sense
		# if getting all the attributes will not work, can follow the links and get descriptions from there
		# property type is an issue!
		urls = page.xpath('//*[@itemprop = "url"]/@content').extract()
		names = page.xpath('//*[@itemprop="name"]/@content').extract()
		# The price of the n-th listing is read from text node 2n+1 (indices 1, 3, 5, ...).
		prices = page.xpath('//*[@class="_12kw8n71"]/span/text()').extract()[1::2]
		ratings = page.xpath('//*[@class="_q27mtmr"]//@aria-label').extract()

		# NOTE(review): positional pairing assumes every listing contributes exactly
		# one url, name and rating and two price text nodes; a listing lacking one of
		# them would shift the following rows. TODO confirm against the live markup.
		rows = list(zip(urls, names, prices, ratings))[:len(homes)]
		for url, name, price_local, rating in rows:
			yield {
				'url': url,
				'name': name,
				'price_local': price_local,
				'rating': rating,
			}
		if len(rows) < len(homes):
			# One of the attribute lists ran out before the listings did.
			self.logger.info('Reached the last iteration')

	def _render_homes_page(self):
		"""Open Airbnb in Chrome, search for Dubai, open "Homes" and return the scrolled page's HTML."""
		# NOTE(review): the positional driver path and find_element_by_xpath are
		# Selenium 3 style calls; confirm the pinned Selenium version.
		self.driver = webdriver.Chrome(self.chromedriver_path)
		# do Options(--no-sandbox)
		try:
			self.driver.get('https://www.airbnb.ae/')
			search_box = self.driver.find_element_by_xpath('//*[@type="text"]')
			search_box.send_keys('Dubai h\ue007') #one of the ways to hit "ENTER"
			sleep(0.8)
			self.driver.find_element_by_xpath('//*[@type="submit"]').click()
			sleep(8.7)
			self.driver.find_element_by_xpath('//*[@data-veloute="explore-nav-card:/homes"]').click()
			sleep(4.2)
			self._scroll_to_bottom()
			return self.driver.page_source
		finally:
			# Always close the browser, even when a lookup above fails.
			self.driver.quit()

	def _scroll_to_bottom(self):
		"""Scroll the infinite-scroll page until its height stops growing."""
		SCROLL_PAUSE_TIME = 7
		last_height = self.driver.execute_script("return document.body.scrollHeight")
		while True:
			self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
			sleep(SCROLL_PAUSE_TIME)
			new_height = self.driver.execute_script("return document.body.scrollHeight")
			if new_height == last_height:
				break
			last_height = new_height
			sleep(1.2)
	"""
	python3.6
	part of the Scrapy architecture
	goes through the "Homes" page of airbnb.ae and gets property name, price per night, rating per property
	"""

	from time import sleep
	import scrapy, selenium
	from selenium import webdriver
	from scrapy import Spider
	from scrapy.selector import Selector
	#from scrapy.http import Request
	#from selenium.webdriver import ActionChains
	#from selenium.webdriver.support.wait import WebDriverWait
	#from selenium.webdriver.common.by import By
	#from selenium.webdriver.support import expected_conditions as EC
	#from selenium.common.exceptions import TimeoutException
	#from selenium.webdriver.common.keys import Keys
	#from selenium.webdriver.remote.webdriver import WebDriver

	class HomesSpider(Spider):
		"""Spider that renders Airbnb's Dubai "Homes" results in Chrome and scrapes them.

		The Scrapy request to ``start_urls`` is only a trigger: its response is
		ignored and the real page is loaded through Selenium, because the listing
		is an infinite scroll that has to be scrolled to the end first.

		NOTE(review): this definition repeats the HomesSpider declared earlier in
		the file and looks like a paste artefact -- confirm whether it can be removed.
		"""
		name = 'homes'
		start_urls = ('http://books.toscrape.com/',) # a dummy website

		# Location of the chromedriver binary. Kept as a class attribute so it can be
		# overridden without editing parse(), e.g. `scrapy crawl homes -a chromedriver_path=...`.
		chromedriver_path = '/path/to_your/chromedriver'

		def parse(self, response):
			"""Yield one dict per listing with 'url', 'name', 'price_local' and 'rating'.

			`response` (the dummy start page) is not used; the HTML comes from the
			Selenium-driven browser.
			"""
			page = Selector(text=self._render_homes_page())
			homes = page.xpath('//*[@itemtype="http://schema.org/ListItem"]') #name of an item can be changed by Airbnb

			# These XPaths start with "//", so they match against the whole document
			# even when evaluated from a single listing node. Evaluate them once and
			# pair the results by position instead of re-running them per listing.
			# airbnb is quite tricky in this sense
			# if getting all the attributes will not work, can follow the links and get descriptions from there
			# property type is an issue!
			urls = page.xpath('//*[@itemprop = "url"]/@content').extract()
			names = page.xpath('//*[@itemprop="name"]/@content').extract()
			# The price of the n-th listing is read from text node 2n+1 (indices 1, 3, 5, ...).
			prices = page.xpath('//*[@class="_12kw8n71"]/span/text()').extract()[1::2]
			ratings = page.xpath('//*[@class="_q27mtmr"]//@aria-label').extract()

			# NOTE(review): positional pairing assumes every listing contributes exactly
			# one url, name and rating and two price text nodes; a listing lacking one of
			# them would shift the following rows. TODO confirm against the live markup.
			rows = list(zip(urls, names, prices, ratings))[:len(homes)]
			for url, name, price_local, rating in rows:
				yield {
					'url': url,
					'name': name,
					'price_local': price_local,
					'rating': rating,
				}
			if len(rows) < len(homes):
				# One of the attribute lists ran out before the listings did.
				self.logger.info('Reached the last iteration')

		def _render_homes_page(self):
			"""Open Airbnb in Chrome, search for Dubai, open "Homes" and return the scrolled page's HTML."""
			# NOTE(review): the positional driver path and find_element_by_xpath are
			# Selenium 3 style calls; confirm the pinned Selenium version.
			self.driver = webdriver.Chrome(self.chromedriver_path)
			# do Options(--no-sandbox)
			try:
				self.driver.get('https://www.airbnb.ae/')
				search_box = self.driver.find_element_by_xpath('//*[@type="text"]')
				search_box.send_keys('Dubai h\ue007') #one of the ways to hit "ENTER"
				sleep(0.8)
				self.driver.find_element_by_xpath('//*[@type="submit"]').click()
				sleep(8.7)
				self.driver.find_element_by_xpath('//*[@data-veloute="explore-nav-card:/homes"]').click()
				sleep(4.2)
				self._scroll_to_bottom()
				return self.driver.page_source
			finally:
				# Always close the browser, even when a lookup above fails.
				self.driver.quit()

		def _scroll_to_bottom(self):
			"""Scroll the infinite-scroll page until its height stops growing."""
			SCROLL_PAUSE_TIME = 7
			last_height = self.driver.execute_script("return document.body.scrollHeight")
			while True:
				self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
				sleep(SCROLL_PAUSE_TIME)
				new_height = self.driver.execute_script("return document.body.scrollHeight")
				if new_height == last_height:
					break
				last_height = new_height
				sleep(1.2)