Example of using selenium webdriver + headless chrome to scrape links from dynamic-content pages
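The full script follows; as a quick orientation, here is a minimal sketch of the pattern it relies on (the same Selenium 3-era API, with headless Chrome driven through ChromeOptions). The URL and CSS selector in the sketch are placeholders, not the real Airbnb selectors used in the script, and chromedriver is assumed to be on the PATH.

# minimal sketch: headless Chrome renders a JS-heavy page, then links are scraped from the DOM
# (placeholder URL/selector; assumes a chromedriver binary is on the PATH)
from selenium import webdriver

opts = webdriver.ChromeOptions()
opts.add_argument('--headless')
opts.add_argument('--disable-gpu')
opts.add_argument('--no-sandbox')

driver = webdriver.Chrome(chrome_options=opts)
driver.get('https://example.com/some-dynamic-page')  # JS runs inside the headless browser

# once the page has rendered, pull hrefs out of the DOM like any static page
for el in driver.find_elements_by_css_selector('div.listing a'):
    print(el.get_attribute('href'))

driver.quit()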
#!/usr/bin/python

from os import getpid
from re import search
from time import sleep
from random import randint

from selenium import webdriver
from pyvirtualdisplay import Display
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
class LinkSpider():

    def __init__(self):  # can loop over by specifying ?adults= 1..16
        self.url_to_crawl = "https://www.airbnb.com/s/New-South-Wales--Australia/homes"
        self.all_items = []

    # Open headless chromedriver
    def start_driver(self):
        print('starting driver...')
        self.display = Display(visible=0, size=(800, 600))
        self.display.start()
        options = webdriver.ChromeOptions()
        options.add_argument('--incognito')
        options.add_argument('--disable-extensions')
        options.add_argument('--no-sandbox')
        options.add_argument('--headless')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-setuid-sandbox')
        options.add_argument('--allow-running-insecure-content')
        options.add_argument('--ignore-certificate-errors')

        # don't load images // use cache
        prefs = {"profile.managed_default_content_settings.images": 2, 'disk-cache-size': 4096}
        options.add_experimental_option("prefs", prefs)
        # make sure you have a chromedriver executable installed, either on your PATH or at the path given below
        # exit status 127 probably means some shared libraries are missing; run the binary by hand to see which
        # sudo apt install libgconf2-4 might help
        try:
            print('attempting to start chrome...')
            self.driver = webdriver.Chrome(executable_path="/var/chromedriver/chromedriver", chrome_options=options)
            # self.driver.implicitly_wait(20)
            self.driver.set_page_load_timeout(10)
        except Exception as e:
            print("couldn't launch chrome")
            print(str(e))
            self.display.stop()
            if hasattr(self, 'driver'):
                self.driver.quit()
    # Close chromedriver
    def close_driver(self):
        print('closing driver...')
        self.display.stop()
        self.driver.quit()
        print('closed!')

    # Tell the browser to get a page
    def get_page(self, url):
        print('getting page: {}'.format(url))
        try:
            self.driver.get(url)
        except TimeoutException:
            print("Timed out on loading the page")
            self.close_driver()
            exit(1)
    def grab_list_items(self):
        print('grabbing list of items...')

        ###################
        extract_amount = 300
        ###################
        count = 0
        exception_count = 0
        pause_time = 1

        while count < extract_amount:
            try:
                print("\ntry loading page first ...")
                sleep(pause_time)

                # load the div elements and scrape the links
                for c, div in enumerate(self.driver.find_elements_by_class_name('_1szwzht')):
                    print(div.text)
                    data = None  # so a failed lookup below can't leave `data` undefined
                    try:
                        data = div.find_element_by_css_selector('._1szwzht a').get_attribute('href')
                    except Exception as e:
                        print(str(e))

                    # skip "Airbnb Plus" listings, keep everything else
                    if data and 'plus' not in data:
                        self.all_items.append(data)
                        count += 1

                # click to go to the next page
                # there are four elements with class "_1ip5u88"; we want the last one, the ">" button
                buttons = self.driver.find_elements_by_class_name('_1ip5u88')
                buttons[3].click()
            except Exception as e:
                exception_count += 1
                print(e)
                print(self.driver.page_source)
                if exception_count < 100:
                    continue
                print("killing because too many exceptions")
                self.close_driver()
                exit(1)
    def parse(self):
        self.start_driver()

        # loop through number of guests
        for guests in range(17):
            print("--- Crawling through ?guests={} ---".format(guests))
            sleep(2)
            self.get_page(self.url_to_crawl + "?adults=" + str(guests))
            self.grab_list_items()

            # dedupe, then keep only the numeric listing id from each link
            items_list = list(set(self.all_items))
            items_list = [search(r"/(\d+)\?", item).group(1) for item in items_list]

            # offload the items to file
            with open('/tmp/links_' + str(getpid()), 'a') as f:
                for i in items_list:
                    f.write(str(i) + '\n')

            # empty it back again
            self.all_items = []

        self.close_driver()

        if self.all_items:
            return self.all_items
        else:
            return False, False
# Create the empty file to store in
open('/tmp/links_' + str(getpid()), 'w').close()

# Run spider
Link = LinkSpider()
items_list = Link.parse()

#items_list = list(set(items_list))

# extract links
#items_list = [ search(r"/(\d+)\?", item).group(1) for item in items_list]
#print(len(items_list))

# Do something with the data touched
#with open("/tmp/links", 'w+') as f:
#    for item in items_list:
#        #print(item)
#        f.write(item + '\n')

print("================ DONE ===============")
Each set scrapes around 300 links, which works out to roughly 20 page loads per set. Airbnb starts blocking the scraper after the 8th set, so that's about 160+ page loads, which is pretty decent for a small amount of scraping :)
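Not part of the gist, but since the blocking kicks in around the 8th set, one thing worth trying is randomizing the pause between page loads; the script already imports randint without using it. A minimal sketch, with made-up pause bounds:

from time import sleep
from random import randint

def polite_pause(low=3, high=9):
    # sleep a random number of whole seconds between page loads;
    # the 3-9 second bounds are guesses, not values Airbnb is known to tolerate
    sleep(randint(low, high))

# e.g. inside grab_list_items(), the fixed `sleep(pause_time)` could become `polite_pause()`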