This project collects job listings from four major job-search websites (Monster, Indeed, CareerBuilder, and LinkedIn) via web scraping and APIs, then aggregates the search results into a single output. Python modules used include BeautifulSoup, urllib2, xmltodict, pandas, and python-linkedin.
# -*- coding: utf-8 -*-
# Contributors: Lucas Laviolet, Nisha Iyer, Mikhail Flom, and Bohan Zhang

# Part 0 Preparation
#-------------------------------------------------------------------------------------------------
import urllib2
from bs4 import BeautifulSoup
import pandas as pd
# Set up a local server on the user's computer for OAuth 2.0 based authentication and authorization:
# python -m SimpleHTTPServer
from linkedin import linkedin  # sudo pip install python-linkedin
import xmltodict  # sudo pip install xmltodict
# Get search terms from user input
key_words = raw_input("Key Words ---> ")
search_url_monster = '-'.join(key_words.split())  # Set search terms for Monster
search_url = '+'.join(key_words.split())          # Set search terms for Indeed and CareerBuilder
search_term = key_words                           # Set search terms for the LinkedIn APIs
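# For example, the input "data scientist" produces:
#   search_url_monster = 'data-scientist'    (Monster joins terms with hyphens)
#   search_url         = 'data+scientist'    (Indeed and CareerBuilder join terms with pluses)
#   search_term        = 'data scientist'    (LinkedIn takes the raw phrase)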
# create a dataframe of job listings from lists of the titles, companies, locations, and links
def createJobListingsDF(titles, companies, cities, states, links):
    jobDict = {
        'Title': titles,
        'Company': companies,
        'City': cities,
        'State': states,
        'Link': links
    }
    jobListings = pd.DataFrame(jobDict)
    return jobListings
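# Illustrative call on made-up data (hypothetical values, for clarity only):
#   createJobListingsDF(['Analyst'], ['Acme Corp'], ['Boston'], ['MA'],
#                       ['http://example.com/job/1'])
# returns a one-row DataFrame holding Title, Company, City, State, and Link columns.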
# Part 1 Webscraping Monster.com
#-------------------------------------------------------------------------------------------------
# recursively scrape monster.com for jobs given a URL and starting page number
# (default is 1) and output a dataframe of job listings
def getMonsterJobs(monsterURL, page=1, titles=None, companies=None, cities=None, states=None, links=None):
    # initialize the accumulator lists on the first call; using None instead of
    # mutable default arguments keeps results from leaking between separate runs
    if titles is None:
        titles, companies, cities, states, links = [], [], [], [], []
    # record the current length so we can later test whether this page
    # contributed any new job listings
    startLength = len(titles)
    # scrape the website for job listings
    monsterPage = urllib2.urlopen(monsterURL + str(page))
    soup = BeautifulSoup(monsterPage)
    jobs = soup.findAll('table', {'class': 'listingsTable'})
    # iterate through each job listing
    for job in jobs:
        titleBlocks = job.findAll('div', {'class': 'jobTitleContainer'})
        for titleBlock in titleBlocks:
            # find all titles
            title = titleBlock.text.strip()
            titles.append(title)
            # find all links
            link = titleBlock.find('a')['href']
            links.append(link)
        # find all companies
        companyBlocks = job.findAll('div', {'class': 'companyContainer'})
        for companyBlock in companyBlocks:
            company = companyBlock.find('a')['title']
            companies.append(company)
        # find all locations
        locationBlocks = job.findAll('div', {'class': 'jobLocationSingleLine'})
        for locationBlock in locationBlocks:
            try:
                location = locationBlock.find('a')['title']
            except TypeError:
                location = 'No location listed'
            city, space, state = location.partition(', ')
            cities.append(city)
            states.append(state[:2])
    # test whether this page added any new job listings
    # if no entries were added, end the web scraping; otherwise, scrape the next page
    endLength = len(titles)
    if endLength > startLength:
        # pass the accumulator lists along explicitly so the recursion
        # extends the same result set
        return getMonsterJobs(monsterURL, page + 1, titles, companies, cities, states, links)
    # create dataframe of monster.com jobs from lists of titles, companies, locations, and links
    monsterJobs = createJobListingsDF(titles, companies, cities, states, links)
    return monsterJobs
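# Illustrative page URL (assuming the search phrase "data scientist"):
#   monsterURL + str(2) -> 'http://jobsearch.monster.com/search/?q=data-scientist&pg=2'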
# Part 2 Webscraping Indeed.com
#-------------------------------------------------------------------------------------------------
# generate a list with all page urls
def countIndeedJobs():
    # combine the base url with the user-defined search terms and fetch the first page
    baseUrl = 'http://www.indeed.com/jobs?q=' + search_url + '&filter=0&start='
    pagesUrl = urllib2.urlopen(baseUrl)
    soup = BeautifulSoup(pagesUrl)
    # get the total number of listings; the counter reads e.g. "Jobs 1 to 10 of 530",
    # so take the final token and drop any thousands separators
    countText = soup.find('div', {'id': 'searchCount'}).text
    totalListings = int(countText.split()[-1].replace(',', ''))
    # Indeed paginates in steps of 10, so these are the start offsets for every page
    pages = range(0, totalListings, 10)
    myUrls = []
    # generate one url per results page
    for apage in pages:
        myUrls.append(baseUrl + str(apage))
    return myUrls
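# For instance, 47 total listings give start offsets 0, 10, 20, 30, 40, i.e. five
# page URLs ending in "&start=0" through "&start=40".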
# Parse a webpage and return lists containing titles, companies, cities, states, and links
def getIndeedPage(aUrl):
    jobsPage = urllib2.urlopen(aUrl)
    soup = BeautifulSoup(jobsPage)
    jobs = soup.findAll('td', {'id': 'resultsCol'})
    titles = []
    companies = []
    cities = []
    states = []
    links = []
    for job in jobs:
        titleBlocks = job.findAll('div', {'itemtype': 'http://schema.org/JobPosting'})
        for titleBlock in titleBlocks:
            title = titleBlock.find('a')['title']
            titles.append(title)
            # hrefs are relative, so prepend the site root
            link = 'http://www.indeed.com' + titleBlock.find('a')['href']
            links.append(link)
        companyBlocks = job.findAll('span', {'itemtype': 'http://schema.org/Organization'})
        for companyBlock in companyBlocks:
            company = companyBlock.get_text().strip()
            companies.append(company)
        locationBlocks = job.findAll('span', {'itemtype': 'http://schema.org/PostalAddress'})
        for locationBlock in locationBlocks:
            location = locationBlock.get_text().strip()
            city, space, state = location.partition(', ')
            cities.append(city)
            states.append(state[:2])
    return [titles, companies, cities, states, links]
# Iterate through all webpages and convert the lists to a dataframe
def getIndeedJobs():
    titles, companies, cities, states, links = [], [], [], [], []
    myUrls = countIndeedJobs()
    for aUrl in myUrls:
        data = getIndeedPage(aUrl)
        titles = titles + data[0]
        companies = companies + data[1]
        cities = cities + data[2]
        states = states + data[3]
        links = links + data[4]
    allJobs = createJobListingsDF(titles, companies, cities, states, links)
    return allJobs
# Part 3 Webscraping CareerBuilder.com
#-------------------------------------------------------------------------------------------------
CB_URL = 'http://www.careerbuilder.com/jobseeker/jobs/jobresults.aspx?IPath=QH&qb=1&s_rawwords=' + search_url + '&s_freeloc=&s_jobtypes=ALL&sc_cmp2=js_findjob_home&FindJobHomeButton=hptest_ignore2'
CB_Page = urllib2.urlopen(CB_URL)
soup = BeautifulSoup(CB_Page)
# read the total page count from the pagination bar on the first page of results
nextpage = soup.findAll('td', {'class': 'nav_btm_cell'})
for page in nextpage:
    # the cell text reads like "Page 1 of 42 | ..."; slice out the page total
    number = page.text.strip()
    a = number[number.find('of ') + 3:number.find(' |') - 1]
    totalPages = int(a)
baseURL = "http://www.careerbuilder.com/jobseeker/jobs/jobresults.aspx?excrit=st%3da%3buse%3dALL%3brawWords%3d" + search_url + "%3bCID%3dUS%3bSID%3d%3f%3bTID%3d0%3bLOCCID%3dUS%3bENR%3dNO%3bDTP%3dDRNS%3bYDI%3dYES%3bIND%3dALL%3bPDQ%3dAll%3bPDQ%3dAll%3bPAYL%3d0%3bPAYH%3dgt120%3bPOY%3dNO%3bETD%3dALL%3bRE%3dALL%3bMGT%3dDC%3bSUP%3dDC%3bFRE%3d30%3bCHL%3dAL%3bQS%3dsid_unknown%3bSS%3dNO%3bTITL%3d0%3bOB%3d-relv%3bJQT%3dRAD%3bJDV%3dFalse%3bSITEENT%3dUSJob%3bMaxLowExp%3d-1%3bRecsPerPage%3d25&&pg="
# pageLinks is a list of all the URLs we need to scrape after the first page
pageLinks = []
for iterator in range(2, totalPages + 1):
    modifiedURL = baseURL + str(iterator) + '&IPath=QHKV'
    pageLinks.append(modifiedURL)
# function to scrape each page
def getCBJobs(CB_URL):
    CB_Page = urllib2.urlopen(CB_URL)
    soup = BeautifulSoup(CB_Page)
    jobs = soup.findAll('tr')
    titles = []
    links = []
    locations = []
    companies = []
    for job in jobs:
        titleBlocks = job.findAll('td', {'class': 'jl_col2'})
        titleHead = job.findAll('a', {'class': 'jt prefTitle'})
        # find all titles, skipping missing values
        for title in titleHead:
            if title.text:
                titles.append(title.text)
        # find all links
        for title in titleBlocks:
            link = title.find('a')['href']
            links.append(link)
        # find all companies
        companyBlocks = job.findAll('td', {'class': 'jl_col3'})
        for companyBlock in companyBlocks:
            company = companyBlock.text.strip()
            companies.append(company)
        # find all locations
        locationBlocks = job.findAll('td', {'class': 'jl_col4'})
        for locationBlock in locationBlocks:
            location = locationBlock.text.strip()
            locations.append(location)
    return titles, links, companies, locations
# Create lists containing the required information
Titles = []
Links = []
Companies = []
Locations = []
for aPage in pageLinks:
    All = getCBJobs(aPage)
    Titles = Titles + All[0]
    Links = Links + All[1]
    Companies = Companies + All[2]
    Locations = Locations + All[3]
# CareerBuilder lists locations as "ST - City"; split into state and city,
# guarding against entries that have no dash
States = []
Cities = []
for location in Locations:
    state, dash, city = location.partition('-')
    States.append(state.strip())
    Cities.append(city.strip())
CBdict = {
    'Company': Companies,
    'Title': Titles,
    'City': Cities,
    'State': States,
    'Link': Links
}
# Output as dataframe
CB_FinalDF = pd.DataFrame(CBdict)
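# e.g. the location string 'NY - New York' splits into state 'NY' and city 'New York';
# an entry with no dash keeps the whole string as the state and leaves the city empty.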
# Part 4 Get job lists from LinkedIn API
#-------------------------------------------------------------------------------------------------
# Set token and secret for LinkedIn API - OAuth 1.0 (substitute your own application's credentials)
CONSUMER_KEY = '758bcqo3nipdwk'
CONSUMER_SECRET = 'mUNd9c51xi5jDtlg'
USER_TOKEN = 'b86af9a8-1757-42de-a8cc-60acb6f61eb9'
USER_SECRET = 'af85d9ce-d082-4411-ad3b-1763e07a5ab2'
RETURN_URL = 'http://localhost:8000'
# Set up the connection with LinkedIn
authentication = linkedin.LinkedInDeveloperAuthentication(CONSUMER_KEY, CONSUMER_SECRET,
                                                          USER_TOKEN, USER_SECRET,
                                                          RETURN_URL, linkedin.PERMISSIONS.enums.values())
application = linkedin.LinkedInApplication(authentication)
# get the total number of available job listings for the search
total = application.search_job(params={'keywords': search_term,
                                       'start': 0, 'count': 20, 'country-code': 'us'})['numResults']
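# numResults is the total number of matches; with e.g. total = 95, the paging loop
# below issues five calls with start offsets 0, 20, 40, 60, and 80.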
# To search for a specific job title instead, comment the call above and use:
#total = application.search_job(params={'job-title': 'Data Scientist', 'start': 0, 'count': 20, 'country-code': 'us'})['numResults']
job_list = []
# Retrieve job information using the LinkedIn Job Search API
for i in range(0, total + 1, 20):
    # Retrieve 20 jobs per call, paging through the results with the start offset
    raw_job_list = application.search_job(params={'keywords': search_term,
                                                  'start': i, 'count': 20, 'country-code': 'us'})
    # To search for a specific job title instead, comment the call above and use:
    #raw_job_list = application.search_job(params={'job-title': 'Data Scientist', 'start': i, 'count': 20, 'country-code': 'us'})
    # Parse the list containing job information
    for job in raw_job_list['jobs']['values']:
        term = []
        # The Job Search API returns a job ID rather than a job title
        term.append(job['id'])
        term.append(job['company']['name'])
        # Split location information such as "Boston, MA" into City and State;
        # some listings carry no location at all, which leaves both fields empty
        city, state = '', ''
        if 'locationDescription' in job:
            city, sep, state = job['locationDescription'].partition(', ')
        term.append(city)
        term.append(state)
        job_list.append(term)
# Retrieve job titles with the LinkedIn Get Job API and the job IDs collected above
def get_job_title(job_id):
    # Send request to the LinkedIn server using OAuth 2.0 based authentication
    # (substitute your own oauth2_access_token)
    url = 'https://api.linkedin.com/v1/jobs/' + str(job_id) + '?oauth2_access_token=AQVIHZX39PPbvEC9mPzDPTVze3zuZvDp4BFGn9tGfnvb3GKXmgS_AKCRNT_y85nyb8f6HAWLIHIruJM5XVKGo5dAy7cbn5rEq0Zwt63D2D1BnpX-otZVvHvmxL8uJnfQDDeuZuL6sgVF8avXK88PAPJsY7i-qtqqSi35oBNSqWR_sy4oRwc'
    # read the response
    response = urllib2.urlopen(url)
    data = response.read()
    response.close()
    # Parse the returned xml document
    data = xmltodict.parse(data)
    # Get the job title corresponding to the job ID
    job_title = data['job']['position']['title']
    return job_title
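# The Get Job API responds with XML roughly of this shape (abridged, for illustration):
#   <job><id>123456</id><position><title>Data Scientist</title></position></job>
# xmltodict converts it into nested dicts, hence data['job']['position']['title'].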
# Look up each job's title from its ID
for job in job_list:
    job.append(get_job_title(job[0]))
# Create lists containing the required information
Job_Title = [x[4] for x in job_list]
Company = [x[1] for x in job_list]
City = [x[2] for x in job_list]
State = [x[3] for x in job_list]
Link = ['http://www.linkedin.com/jobs?viewJob=&jobId=' + str(x[0]) for x in job_list]
Job_Dict = {
    'Title': Job_Title,
    'Company': Company,
    'City': City,
    'State': State,
    'Link': Link
}
# Output as dataframe, dropping duplicate listings
Job_DF = pd.DataFrame(Job_Dict)
Job_DF_DD = Job_DF.drop_duplicates()
# Part 5 Combine all parts together and output as csv
#-------------------------------------------------------------------------------------------------
# define the Monster URL of interest and run the Monster scraper
monsterURL = 'http://jobsearch.monster.com/search/?q=' + search_url_monster + '&pg='
monsterJobs = getMonsterJobs(monsterURL)
print "Monster jobs: " + str(len(monsterJobs))
monsterJobs.to_csv('monsterJobs1.csv')
# run the Indeed scraper
indeedJobs = getIndeedJobs()
print "Indeed jobs: " + str(len(indeedJobs))
indeedJobs.to_csv('indeedJobs1.csv')
print "CareerBuilder jobs: " + str(len(CB_FinalDF))
print "LinkedIn jobs: " + str(len(Job_DF_DD))
allJobs = pd.concat([monsterJobs, indeedJobs, CB_FinalDF, Job_DF_DD],
                    keys=['monster.com', 'indeed.com', 'careerbuilder.com', 'linkedin.com'])
print "Monster, Indeed, CareerBuilder, and LinkedIn jobs: " + str(len(allJobs))
# a listing counts as a duplicate when its title, company, city, and state all match
allJobsRed = allJobs.drop_duplicates(subset=['Title', 'Company', 'City', 'State'])
print "Monster, Indeed, CareerBuilder, and LinkedIn jobs (no duplicates): " + str(len(allJobsRed))
print "Duplicates removed: " + str(len(allJobs) - len(allJobsRed))
allJobsRed.to_csv('allJobs.csv')