This script collects job listings from four major job search websites (Monster, Indeed, CareerBuilder, and LinkedIn) via web scraping and APIs, and aggregates the search results into a single output. Python modules used include BeautifulSoup, urllib2, and xmltodict.
# -*- coding: utf-8 -*-
# Contributors: Lucas Laviolet, Nisha Iyer, Mikhail Flom, and Bohan Zhang
# Part 0 Preparation
#-------------------------------------------------------------------------------------------------
import urllib2
from bs4 import BeautifulSoup
import pandas as pd
# Set up server on user's computer for OAuth 2.0 based authentication and authorization
# python -m SimpleHTTPServer
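# (SimpleHTTPServer listens on port 8000 by default, which matches RETURN_URL below)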
from linkedin import linkedin # sudo pip install python-linkedin
import xmltodict # sudo pip install xmltodict
# Get search terms from user input
key_words = raw_input("Key Words ---> ")
search_url_monster = '-'.join(key_words.split()) # Set search terms for Monster
search_url = '+'.join(key_words.split()) # Set search terms for Indeed and CareerBuilder
search_term = key_words # Set search terms for LinkedIn APIs
# create a dataframe of job listings from lists of the titles, companies, locations, and links
def createJobListingsDF(titles, companies, cities, states, links):
    jobDict = {
        'Title': titles,
        'Company': companies,
        'City': cities,
        'State': states,
        'Link': links
    }
    jobListings = pd.DataFrame(jobDict)
    return jobListings
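# Example with hypothetical values:
#   createJobListingsDF(['Data Scientist'], ['Acme Corp'], ['Boston'], ['MA'],
#                       ['http://example.com/job/1'])
# returns a one-row DataFrame with Title, Company, City, State, and Link columns.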
# Part 1 Webscraping Monster.com
#-------------------------------------------------------------------------------------------------
#recursively scrape monster.com for jobs given a URL and starting page number
#(default is 1) and output a dataframe of job listings
def getMonsterJobs(monsterURL, page=1, titles=[], companies=[], cities=[], states=[], links=[]):
    # record how many listings we have so far; used later to test whether this
    # page contributed any new job listings
    startLength = len(titles)
    # scrape the website for job listings
    monsterPage = urllib2.urlopen(monsterURL + str(page))
    soup = BeautifulSoup(monsterPage)
    jobs = soup.findAll('table', {'class': 'listingsTable'})
    # iterate through each job listing
    for job in jobs:
        titleBlocks = job.findAll('div', {'class': 'jobTitleContainer'})
        for titleBlock in titleBlocks:
            # find all titles
            title = str(titleBlock.text.strip())
            titles.append(title)
            # find all links
            link = titleBlock.find('a')['href']
            links.append(link)
        # find all companies
        companyBlocks = job.findAll('div', {'class': 'companyContainer'})
        for companyBlock in companyBlocks:
            company = companyBlock.find('a')['title']
            companies.append(company)
        # find all locations
        locationBlocks = job.findAll('div', {'class': 'jobLocationSingleLine'})
        for locationBlock in locationBlocks:
            try:
                location = locationBlock.find('a')['title']
            except TypeError:
                location = 'No location listed'
            city, space, state = location.partition(', ')
            cities.append(city)
            states.append(state[:2])
    # if this page added new listings, scrape the next page; otherwise stop
    endLength = len(titles)
    if endLength > startLength:
        page = page + 1
        # pass the accumulated lists explicitly so the recursion does not rely
        # on the shared mutable default arguments
        getMonsterJobs(monsterURL, page, titles, companies, cities, states, links)
    # create dataframe of monster.com jobs from lists of titles, companies, locations, and links
    monsterJobs = createJobListingsDF(titles, companies, cities, states, links)
    return monsterJobs
# Part 2 Webscraping Indeed.com
#-------------------------------------------------------------------------------------------------
# generate a list with all page urls
def countIndeedJobs():
    # combine base url with user-defined search terms (first results page)
    baseUrl = 'http://www.indeed.com/jobs?q=' + search_url + '&filter=0&start='
    pagesUrl = urllib2.urlopen(baseUrl)
    soup = BeautifulSoup(pagesUrl)
    # get the total number of listings across all pages
    totalListings = int(soup.find('div', {'id': 'searchCount'}).text[16:])
    pages = range(0, totalListings, 10)
    myUrls = []
    # generate a url for every results page (10 listings per page)
    for apage in pages:
        myUrls.append(baseUrl + str(apage))
    return myUrls
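# For example, if Indeed reports 95 total listings, countIndeedJobs builds URLs
# ending in &start=0 through &start=90, i.e. one URL per page of 10 listings.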
# Parse webpage and return lists containing titles, companies, cities, states, and links
def getIndeedPage(aUrl):
    jobsPage = urllib2.urlopen(aUrl)
    soup = BeautifulSoup(jobsPage)
    jobs = soup.findAll('td', {'id': 'resultsCol'})
    titles = []
    companies = []
    cities = []
    states = []
    links = []
    for job in jobs:
        titleBlocks = job.findAll('div', {'itemtype': 'http://schema.org/JobPosting'})
        for titleBlock in titleBlocks:
            title = titleBlock.find('a')['title']
            titles.append(title)
            link = 'www.indeed.com' + titleBlock.find('a')['href']
            links.append(link)
        companyBlocks = job.findAll('span', {'itemtype': 'http://schema.org/Organization'})
        for companyBlock in companyBlocks:
            # the company name is the text of the Organization span
            company = companyBlock.get_text().strip()
            companies.append(company)
        locationBlocks = job.findAll('span', {'itemtype': 'http://schema.org/Postaladdress'})
        for locationBlock in locationBlocks:
            location = locationBlock.get_text().strip()
            city, space, state = location.partition(', ')
            cities.append(city)
            states.append(state[:2])
    return [titles, companies, cities, states, links]
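# The five lists are returned in the order titles, companies, cities, states,
# links, matching how getIndeedJobs below indexes data[0] through data[4].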
# Iterate through all webpages and convert lists to dataframe
def getIndeedJobs(titles=[], companies=[], cities=[], states=[], links=[]):
    myUrls = countIndeedJobs()
    for aUrl in myUrls:
        data = getIndeedPage(aUrl)
        titles = titles + data[0]
        companies = companies + data[1]
        cities = cities + data[2]
        states = states + data[3]
        links = links + data[4]
    allJobs = createJobListingsDF(titles, companies, cities, states, links)
    return allJobs
# Part 3 Webscraping CareerBuilder.com
#-------------------------------------------------------------------------------------------------
CB_URL = 'http://www.careerbuilder.com/jobseeker/jobs/jobresults.aspx?IPath=QH&qb=1&s_rawwords=' + search_url +'&s_freeloc=&s_jobtypes=ALL&sc_cmp2=js_findjob_home&FindJobHomeButton=hptest_ignore2'
CB_Page = urllib2.urlopen(CB_URL)
soup = BeautifulSoup(CB_Page)
#find list of URLs for all pages after 1st page of search
nextpage = soup.findAll('td',{'class':'nav_btm_cell'})
other = soup.findAll('a',{'class':'JL_MXDLPagination2_next'})
for page in nextpage:
    link = page.find('a')['href']
    number = page.text.strip()
    a = number[number.find('of ') + 3:number.find(' |') - 1]
    totalPages = int(a)
baseURL = "http://www.careerbuilder.com/jobseeker/jobs/jobresults.aspx?excrit=st%3da%3buse%3dALL%3brawWords%3d"+ search_url +"%3bCID%3dUS%3bSID%3d%3f%3bTID%3d0%3bLOCCID%3dUS%3bENR%3dNO%3bDTP%3dDRNS%3bYDI%3dYES%3bIND%3dALL%3bPDQ%3dAll%3bPDQ%3dAll%3bPAYL%3d0%3bPAYH%3dgt120%3bPOY%3dNO%3bETD%3dALL%3bRE%3dALL%3bMGT%3dDC%3bSUP%3dDC%3bFRE%3d30%3bCHL%3dAL%3bQS%3dsid_unknown%3bSS%3dNO%3bTITL%3d0%3bOB%3d-relv%3bJQT%3dRAD%3bJDV%3dFalse%3bSITEENT%3dUSJob%3bMaxLowExp%3d-1%3bRecsPerPage%3d25&amp&pg="
#pagelinks is a list of all the URLs we need to scrape after the first page
pageLinks =[]
for iterator in range(2, totalPages + 1):  # pages 2 through totalPages inclusive
    modifiedURL = baseURL + str(iterator) + '&IPath=QHKV'
    pageLinks.append(modifiedURL)
#function to scrape each page
def getCBJobs(CB_URL):
    CB_Page = urllib2.urlopen(CB_URL)
    soup = BeautifulSoup(CB_Page)
    jobs = soup.findAll('tr')
    titles = []
    links = []
    locations = []
    companies = []
    for job in jobs:
        titleBlocks = job.findAll('td', {'class': 'jl_col2'})
        titleHead = job.findAll('a', {'class': 'jt prefTitle'})
        #find all titles and remove missing values
        for title in titleHead:
            titles.append(title.text)
        titles = [x for x in titles if x != None]
        #find all links
        for title in titleBlocks:
            link = title.find('a')['href']
            links.append(link)
        #find all companies
        companyBlocks = job.findAll('td', {'class': 'jl_col3'})
        for companyBlock in companyBlocks:
            company = str(companyBlock.text.strip())
            companies.append(company)
        #find all locations
        locationBlocks = job.findAll('td', {'class': 'jl_col4'})
        for locationBlock in locationBlocks:
            location = str(locationBlock.text.strip())
            locations.append(location)
    return titles, links, companies, locations
# Create list containing required information
Titles = []
Links = []
Companies = []
Locations = []
for aPage in pageLinks:
    CB_URL = aPage
    All = getCBJobs(CB_URL)
    Titles = Titles + (All[0])
    Links = Links + (All[1])
    Companies = Companies + (All[2])
    Locations = Locations + (All[3])
States = []
Cities = []
for location in Locations:
    state = location.split('-')[0].strip()
    States.append(state)
    city = location.split('-')[1].strip()
    Cities.append(city)
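# The split above assumes CareerBuilder formats locations as "STATE - City"
# (e.g. "NY - New York"), so the state precedes the first '-' and the city follows it.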
CBdict = {
    'Company': Companies,
    'Title': Titles,
    'City': Cities,
    'State': States,
    'Link': Links
}
# Output as dataframe
CB_FinalDF = pd.DataFrame(CBdict)
# Part 4 Get job lists from LinkedIn API
#-------------------------------------------------------------------------------------------------
# Set token and secret for LinkedIn API - OAuth 1.0
CONSUMER_KEY = '758bcqo3nipdwk'
CONSUMER_SECRET = 'mUNd9c51xi5jDtlg'
USER_TOKEN = 'b86af9a8-1757-42de-a8cc-60acb6f61eb9'
USER_SECRET = 'af85d9ce-d082-4411-ad3b-1763e07a5ab2'
RETURN_URL = 'http://localhost:8000'
# Setup connection with LinkedIn
authentication = linkedin.LinkedInDeveloperAuthentication(CONSUMER_KEY, CONSUMER_SECRET,
                                                          USER_TOKEN, USER_SECRET,
                                                          RETURN_URL, linkedin.PERMISSIONS.enums.values())
application = linkedin.LinkedInApplication(authentication)
# get total number of available job titles
total = application.search_job(params={'keywords': search_term,
                                       'start': 0, 'count': 20, 'country-code': 'us'})['numResults']
# Comment above line and uncomment following line if searching for specific job titles
#total = application.search_job(params={'job-title': 'Data Scientist', 'start':0, 'count': 20, 'country-code':'us'})['numResults']
job_list = []
raw_job_list = []
start = 0
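# The Job Search API returns at most 20 results per call here (count=20), so the
# loop below pages through offsets in steps of 20; e.g. with numResults = 95 it
# requests start offsets 0, 20, 40, 60, and 80.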
# Retrieve job information using LinkedIn Job Search API
for i in range(0, total + 1, 20):
    # Retrieve 20 jobs on every call to LinkedIn Job Search API and store in a list
    raw_job_list = application.search_job(params={'keywords': search_term,
                                                  'start': i, 'count': 20, 'country-code': 'us'})
    # Comment above line and uncomment following line if searching for specific job titles
    #raw_job_list = application.search_job(params={'job-title': 'Data Scientist', 'start': i, 'count': 20, 'country-code': 'us'})
    # Parse the list containing job information
    for job in raw_job_list['jobs']['values']:
        term = []
        # LinkedIn Job Search API returns job IDs instead of job titles
        term.append(job['id'])
        term.append(job['company']['name'])
        # Split location information into City and State
        # (default to empty strings in case no location is listed)
        city = ''
        state = ''
        if 'locationDescription' in job:
            location = job['locationDescription']
            flag = location.find(',')
            if flag > 0:
                city = location[0:flag]
                state = location[flag + 2:len(location)]
            if flag < 0:
                city = location
                state = ''
        term.append(city)
        term.append(state)
        job_list.append(term)
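# Each entry of job_list is [job ID, company, city, state]; the job title is
# appended later (index 4) once it has been looked up via the Get Job API.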
# Retrieve job titles with LinkedIn Get Job API and job IDs
def get_job_title(job_id):
    # Send request to LinkedIn server using OAuth 2.0 based authentication
    url = 'https://api.linkedin.com/v1/jobs/' + str(job_id) + '?oauth2_access_token=AQVIHZX39PPbvEC9mPzDPTVze3zuZvDp4BFGn9tGfnvb3GKXmgS_AKCRNT_y85nyb8f6HAWLIHIruJM5XVKGo5dAy7cbn5rEq0Zwt63D2D1BnpX-otZVvHvmxL8uJnfQDDeuZuL6sgVF8avXK88PAPJsY7i-qtqqSi35oBNSqWR_sy4oRwc'
    # read url
    file = urllib2.urlopen(url)
    data = file.read()
    file.close()
    # Parse returned xml file
    data = xmltodict.parse(data)
    # Get job title corresponding to each job ID
    Job_title = data['job']['position']['title']
    return Job_title
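# The Get Job API response is assumed to look roughly like
#   <job><position><title>Data Scientist</title></position>...</job>
# which xmltodict turns into nested dicts, hence data['job']['position']['title'].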
for job in job_list:
    job.append(get_job_title(job[0]))
# Create list containing required information
Job_Title = [x[4] for x in job_list]
Company = [x[1] for x in job_list]
City = [x[2] for x in job_list]
State = [x[3] for x in job_list]
Link = ['http://www.linkedin.com/jobs?viewJob=&jobId='+str(x[0]) for x in job_list]
Job_Dict = {
    'Title': Job_Title,
    'Company': Company,
    'City': City,
    'State': State,
    'Link': Link
}
# Output as dataframe
Job_DF = pd.DataFrame(Job_Dict)
Job_DF_DD = Job_DF.drop_duplicates()
# Part 5 Combine all parts together and output as csv
#-------------------------------------------------------------------------------------------------
#define the Monster URL of interest and actually run Monster code
monsterURL = 'http://jobsearch.monster.com/search/?q='+search_url_monster+'&pg='
monsterJobs = getMonsterJobs(monsterURL)
print "Monster jobs: " + str(len(monsterJobs))
monsterJobs.to_csv('monsterJobs1.csv')
#actually run Indeed code
indeedJobs = getIndeedJobs()
print "Indeed jobs: " + str(len(indeedJobs))
indeedJobs.to_csv('indeedJobs1.csv')
print "CareerBuilder jobs: " + str(len(CB_FinalDF))
print "LinkedIn jobs: " + str(len(Job_DF_DD))
allJobs = pd.concat([monsterJobs,indeedJobs,CB_FinalDF,Job_DF_DD], keys=['monster.com','indeed.com','careerbuilder.com','linkedin.com'])
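# Passing keys= to pd.concat builds a hierarchical index labelled by source site,
# so e.g. allJobs.loc['indeed.com'] would select just the Indeed listings.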
print "Monster, Indeed, CareerBuilder, and LinkedIn jobs: " + str(len(allJobs))
allJobsRed = allJobs.drop_duplicates(subset=['Title','Company','City','State'])
print "Monster, Indeed, CareerBuilder, and LinkedIn jobs (no duplicates): " + str(len(allJobsRed))
print "Duplicates removed: " + str(len(allJobs)-len(allJobsRed))
allJobsRed.to_csv('allJobs.csv')