indeed_scraping_202002
from time import sleep
import math
import random

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException
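# Note: the find_element_by_* / find_elements_by_* calls below are the Selenium 3
# API. Selenium 4 removed them; the equivalent there is find_element(By.ID, ...)
# after `from selenium.webdriver.common.by import By`.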
# define scraping function
def scrape_indeed(search, loc, limit=50, canada=False):
    # search_term is the keyword/designation to be searched
    search_term = search.replace(' ', '+')
    if canada:
        url = 'https://www.indeed.ca/jobs?q={}&l={}&limit={}&radius=25&start=0'.format(search_term, loc, limit)
    else:
        url = 'https://www.indeed.com/jobs?q={}&l={}&limit={}&radius=25&start=0'.format(search_term, loc, limit)
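    # With the example arguments used at the bottom of this script, the first
    # URL comes out as:
    # https://www.indeed.ca/jobs?q=title%3A%28machine+learning%29&l=Toronto%2C+ON&limit=50&radius=25&start=0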
    # Start the browser and load the above URL
    browser = webdriver.Chrome('/Users/justin/Downloads/chromedriver')
    browser.get(url)
    # Empty dataframe in which we will store the data scraped from job posts
    data = pd.DataFrame(columns=['job_title', 'company', 'location', 'job_description'])
    x = 0
    # Get the number of results; this determines how many pages we need to visit.
    num_results = browser.find_element_by_id('searchCountPages').text
    ind0 = num_results.find('of ') + 3
    ind1 = num_results.find(' ', ind0)
    num_results = int(num_results[ind0:ind1])
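    # The parsing above assumes the counter text looks like 'Page 1 of 243 jobs',
    # in which case num_results ends up as 243.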
    pages = math.ceil(num_results / limit)  # the number of pages to visit
    # Loop through the pages
    for j in range(pages):
        # All the job posts share the class 'jobsearch-SerpJobCard unifiedRow row result clickcard'.
        job_elements = browser.find_elements_by_xpath("//div[@class='jobsearch-SerpJobCard unifiedRow row result clickcard']")
        # Loop through the individual job posts
        for i in range(len(job_elements)):
            # Click on the job post
            job_elements[i].click()
            # Sleep for a minimum of 3 seconds because we don't want to create unnecessary load on Indeed's servers
            sleep(3 + random.randint(0, 3))
            # Sometimes Selenium might start scraping before the page finishes loading, or
            # we might encounter a '404: Job not found' error.
            # Although these occurrences are rare, we don't want the scraper to crash,
            # so we retry before moving on.
            # If the data is scraped successfully, we break out of the retry loop;
            # on an error we try again, up to 5 attempts.
            done = False
            for k in range(5):
                try:
                    title = browser.find_element_by_id('vjs-jobtitle').text
                    company = browser.find_element_by_id('vjs-cn').text
                    company = company.replace('- ', '')
                    location = browser.find_element_by_id('vjs-loc').text
                    description = browser.find_element_by_id('vjs-desc').text
                    done = True
                    break
                except NoSuchElementException:
                    print('Unable to fetch data. Retrying.....')
                    sleep(2)  # give the job pane a moment to finish loading before retrying
            if not done:
                continue
            # For debugging purposes, log the job post scraped
            print('Completed Post {} of Page {} - {}'.format(i + 1, j + 1, title))
            # Insert the data into our dataframe
            data = data.append({'job_title': title,
                                'company': company,
                                'location': location,
                                'job_description': description}, ignore_index=True)
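            # DataFrame.append was removed in pandas 2.0. On a newer pandas, the
            # equivalent, with `row` being the dict passed to append above, would be:
            #   data = pd.concat([data, pd.DataFrame([row])], ignore_index=True)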
        # Change the URL so as to move on to the next page
        url = url.replace('start=' + str(x), 'start=' + str(x + limit))
        x += limit
        if len(job_elements) < limit:
            break
        browser.get(url)
        print('Moving on to page ' + str(j + 2))
        sleep(2)
        # A popover appears when we go to the next page; tell the browser to click its close button.
        # So far it has only appeared on the second page, but the check runs on every page to be safe.
        try:
            browser.find_element_by_id('popover-x').click()
        except (NoSuchElementException, ElementNotVisibleException):
            print('No Newsletter Popup Found')
    browser.close()
    return data
# download data, using Toronto as an example
loc = 'Toronto%2C+ON'  # URL-encoded 'Toronto, ON'
q = 'title%3A%28machine+learning%29'  # URL-encoded 'title:(machine learning)'
df0 = scrape_indeed(q, loc, 50, True)  # Jan 25
df0.to_pickle('data_scientist_toronto.pkl')
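# A minimal sanity-check sketch, assuming the run above completed and wrote the
# pickle; the column names match the dataframe built in scrape_indeed.
df = pd.read_pickle('data_scientist_toronto.pkl')
print(df.shape)
# Indeed often lists the same posting more than once, so deduplicating is a
# reasonable first cleaning step.
df = df.drop_duplicates(subset=['job_title', 'company', 'location'])
print(df.shape)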