indeed_scraping_202002
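A Selenium script that scrapes Indeed job postings (title, company, location, full description) into a pandas DataFrame and pickles the result.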
import math
import random
from time import sleep

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException
from selenium.common.exceptions import NoSuchElementException

# Note: this script targets Selenium 3; Selenium 4 replaced the
# find_element_by_* methods with find_element(By.ID, ...).

# define scraping function
def scrape_indeed(search, loc, limit=50, canada=False):
    # search is the keyword/job title to be searched
    search_term = search.replace(' ', '+')
    # e.g. https://www.indeed.ca/jobs?q=title%3A%28machine+learning%29&l=Toronto%2C+ON&limit=50&radius=25&start=0
    if canada:
        url = 'https://www.indeed.ca/jobs?q={}&l={}&limit={}&radius=25&start=0'.format(search_term, loc, limit)
    else:
        url = 'https://www.indeed.com/jobs?q={}&l={}&limit={}&radius=25&start=0'.format(search_term, loc, limit)

    # Start the browser and load the URL above (point this path at your own chromedriver)
    browser = webdriver.Chrome('/Users/justin/Downloads/chromedriver')
    browser.get(url)

    # Empty DataFrame in which we will store the data scraped from the job posts
    data = pd.DataFrame(columns=['job_title', 'company', 'location', 'job_description'])
    # x tracks the 'start=' offset in the URL as we page through the results
    x = 0

    # Get the number of results; this determines how many pages we need to visit.
    # The counter text reads like "Page 1 of 995 jobs", so parse out the number after 'of '.
    num_results = browser.find_element_by_id('searchCountPages').text
    ind0 = num_results.find('of ') + 3
    ind1 = num_results.find(' ', ind0)
    num_results = int(num_results[ind0:ind1])
    pages = math.ceil(num_results / limit)  # the number of pages to visit

    # Loop through the pages
    for j in range(pages):
        # All the job posts have the class 'jobsearch-SerpJobCard unifiedRow row result clickcard'
        job_elements = browser.find_elements_by_xpath("//div[@class='jobsearch-SerpJobCard unifiedRow row result clickcard']")
        # Loop through the individual job posts
        for i in range(len(job_elements)):
            # Click on the job post
            job_elements[i].click()
            # Sleep for at least 3 seconds because we don't want to put unnecessary load on Indeed's servers
            sleep(3 + random.randint(0, 3))
            # Sometimes Selenium starts scraping before the page finishes loading, or we
            # hit a '404: Job not found' error. These occurrences are rare, but we don't
            # want the scraper to crash, so we retry up to 5 times before moving on.
            # If the data is scraped successfully, we break out of the retry loop;
            # otherwise we skip this post and continue with the next one.
            done = False
            for k in range(5):
                try:
                    title = browser.find_element_by_id('vjs-jobtitle').text
                    company = browser.find_element_by_id('vjs-cn').text
                    company = company.replace('- ', '')
                    location = browser.find_element_by_id('vjs-loc').text
                    description = browser.find_element_by_id('vjs-desc').text
                    done = True
                    break
                except NoSuchElementException:
                    print('Unable to fetch data. Retrying.....')
            if not done:
                continue
            # For debugging purposes, log the job post just scraped
            print('Completed Post {} of Page {} - {}'.format(i + 1, j + 1, title))
            # Insert the data into our DataFrame
            # (note: DataFrame.append was removed in pandas 2.0; use pd.concat on newer versions)
            data = data.append({'job_title': title,
                                'company': company,
                                'location': location,
                                'job_description': description}, ignore_index=True)

        # Change the URL so as to move on to the next page
        url = url.replace('start=' + str(x), 'start=' + str(x + limit))
        x += limit
        # If this page had fewer posts than the page limit, it was the last page
        if len(job_elements) < limit:
            break
        browser.get(url)
        print('Moving on to page ' + str(j + 2))
        sleep(2)
        # A newsletter popover can appear when we move to the next page; close it if it shows up.
        # So far it has only appeared on the 2nd page, but we check on every page to be on the safe side.
        try:
            browser.find_element_by_id('popover-x').click()
        except (NoSuchElementException, ElementNotVisibleException):
            print('No Newsletter Popup Found')

    browser.close()
    return data

# Download the data, using Toronto as an example (query and location are URL-encoded)
loc = 'Toronto%2C+ON'
q = 'title%3A%28machine+learning%29'
df0 = scrape_indeed(q, loc, 50, True)  # Jan 25
df0.to_pickle('data_scientist_toronto.pkl')
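
# Quick sanity check (a minimal sketch, assuming the run above completed and
# wrote the pickle file): load the saved DataFrame back and inspect a few rows.
df_check = pd.read_pickle('data_scientist_toronto.pkl')
print(df_check.shape)
print(df_check[['job_title', 'company', 'location']].head())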