francoisstamant/scraping_hostels_2

## scraping_hostels_2
##############################
# First loop: getting the URLs
##############################
# Walks the Hostelworld search-result pages for Barcelona, gathers every
# anchor href found on them, and reduces that to a de-duplicated list of
# absolute hostel-page URLs in `final_list`.

# Search-results URL (query: Barcelona, 2020-07-03 to 2020-07-05, 1 guest).
# The result-page number is appended to the end.
search_url = "https://www.hostelworld.com/s?q=Barcelona,%20Catalonia,%20Spain&country=Spain&city=Barcelona&type=city&id=83&from=2020-07-03&to=2020-07-05&guests=1&page="

pages = np.arange(1, 3, 1)  # result pages 1 and 2 (stop value is exclusive)
url_collected = []

# A single browser session is reused for every page; the original started a
# new Chrome instance per page and never closed any of them.
driver = webdriver.Chrome()

for page_number in pages:
    # `page` keeps holding the full URL, as before, but no longer overwrites
    # the loop variable.
    page = search_url + str(page_number)
    driver.get(page)
    # Random 5-15 s pause before reading the page source (presumably to let
    # the page finish rendering and to pace requests).
    sleep(randint(5, 15))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    urls = [item.get("href") for item in soup.find_all("a")]
    # Fix: accumulate this page's links. Previously `urls` was discarded and
    # `url_collected` stayed empty, so `final_list` was always [].
    url_collected.extend(urls)

# NOTE(review): `driver` is intentionally left open because code further down
# the file may reuse it -- call driver.quit() once scraping is finished.

# Remove duplicates (keeping first-seen order) and None/empty hrefs.
urls_final = list(dict.fromkeys(url_collected))
urls_final = list(filter(None, urls_final))

# Keep only links starting with '/pwa/' and drop those ending with
# 'display=reviews'.
url_final = [
    x for x in urls_final
    if x.startswith('/pwa/') and not x.endswith('display=reviews')
]

# The hrefs are site-relative; prefix the host to get absolute URLs.
string = 'https://www.hostelworld.com'
final_list = [string + s for s in url_final]
	##############################
	# First loop: getting the URLs
	##############################
	# Walks the Hostelworld search-result pages for Barcelona, gathers every
	# anchor href found on them, and reduces that to a de-duplicated list of
	# absolute hostel-page URLs in `final_list`.

	# Search-results URL (query: Barcelona, 2020-07-03 to 2020-07-05, 1 guest).
	# The result-page number is appended to the end.
	search_url = "https://www.hostelworld.com/s?q=Barcelona,%20Catalonia,%20Spain&country=Spain&city=Barcelona&type=city&id=83&from=2020-07-03&to=2020-07-05&guests=1&page="

	pages = np.arange(1, 3, 1)  # result pages 1 and 2 (stop value is exclusive)
	url_collected = []

	# A single browser session is reused for every page; the original started
	# a new Chrome instance per page and never closed any of them.
	driver = webdriver.Chrome()

	for page_number in pages:
		# The loop body was not indented under the `for` in the original,
		# which made this copy unparseable; it is restored here.
		# `page` keeps holding the full URL without overwriting the loop variable.
		page = search_url + str(page_number)
		driver.get(page)
		# Random 5-15 s pause before reading the page source (presumably to
		# let the page finish rendering and to pace requests).
		sleep(randint(5, 15))
		soup = BeautifulSoup(driver.page_source, 'html.parser')
		urls = [item.get("href") for item in soup.find_all("a")]
		# Fix: accumulate this page's links. Previously `urls` was discarded
		# and `url_collected` stayed empty, so `final_list` was always [].
		url_collected.extend(urls)

	# NOTE(review): `driver` is intentionally left open because code further
	# down the file may reuse it -- call driver.quit() once scraping is finished.

	# Remove duplicates (keeping first-seen order) and None/empty hrefs.
	urls_final = list(dict.fromkeys(url_collected))
	urls_final = list(filter(None, urls_final))

	# Keep only links starting with '/pwa/' and drop those ending with
	# 'display=reviews'.
	url_final = [
		x for x in urls_final
		if x.startswith('/pwa/') and not x.endswith('display=reviews')
	]

	# The hrefs are site-relative; prefix the host to get absolute URLs.
	string = 'https://www.hostelworld.com'
	final_list = [string + s for s in url_final]