Skip to content

Instantly share code, notes, and snippets.

@scrapehero
Last active December 20, 2017 10:48
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save scrapehero/bac796ab084814b986b1e549b2fa8600 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
from requests import get
from re import sub
from json import loads,dump
from re import findall,sub
from unicodecsv import QUOTE_ALL,DictWriter
# Module-level accumulator: getjobs() appends one dict per scraped job
# posting, and the __main__ section writes the collected dicts to CSV.
totalJobs = []
def getPage(url, max_retries=5, retry_delay=1.0, timeout=30):
    """
    Fetch `url` and return its body with HTML comment markers removed.

    The request is repeated while the server answers with a status other
    than 200, up to `max_retries` extra attempts with `retry_delay` seconds
    between them, instead of looping forever on a permanent error.

    Parameters:
        url: address to request.
        max_retries: number of additional attempts after the first one.
        retry_delay: pause in seconds before each retry.
        timeout: per-request timeout in seconds handed to requests.

    Returns:
        The response body as a byte string with every "<!--" and "-->"
        removed and every run of two or more whitespace characters deleted.

    Raises:
        IOError: if no attempt produced a 200 response. Exceptions raised
            by the underlying request itself are not caught here.
    """
    from time import sleep  # local import keeps the block self-contained

    headers = {
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'
    }
    attempts = max(1, max_retries + 1)
    response = None
    for attempt in range(attempts):
        if attempt:
            print("%s %s" % ("Retrying page ", url))
            sleep(retry_delay)
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original -- confirm it is still required.
        response = get(url, verify=False, headers=headers, timeout=timeout)
        if response.status_code == 200:
            break
    else:
        raise IOError(
            "Giving up on %s after %d attempts (last HTTP status %s)"
            % (url, attempts, response.status_code)
        )

    # response.content is the raw body; byte-string literals make the cleanup
    # work both on Python 2 str and on Python 3 bytes.
    body = response.content.replace(b"<!--", b"").replace(b"-->", b"")
    return sub(br'\s\s+', b'', body)
def _quoteParam(value):
    """Percent-encode `value` so it is safe inside a URL query string."""
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3
    # Python 2's quote() cannot handle non-ASCII unicode objects; hand it
    # UTF-8 bytes instead. On Python 3 this branch is never taken.
    if isinstance(value, type(u'')) and not isinstance(value, str):
        value = value.encode('utf-8')
    return quote(value, safe='')


def _parseJobPage(html, jobUrl):
    """
    Build one result dict from the HTML of a single job posting page.

    Reads the JSON embedded in the <code id="decoratedJobPostingModule">
    and <code id="topCardV2Module"> elements. Returns None when the first
    JSON document is empty; raises KeyError/TypeError/ValueError/
    AttributeError when an expected element or key is missing.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    postingCode = soup.find('code', attrs={'id': 'decoratedJobPostingModule'})
    postingData = loads(postingCode.string)
    if not postingData:
        return None

    decorated = postingData['decoratedJobPosting']
    jobPosting = decorated['jobPosting']
    jobFunctions = decorated['formattedJobFunctions']
    industries = decorated['formattedIndustries']

    topCardCode = soup.find('code', attrs={'id': 'topCardV2Module'})
    topCard = loads(topCardCode.string)

    return {
        "postedTime": decorated['formattedPostedDaysAgo'],
        "jobDescription": jobPosting['description']['rawText'],
        "jobFunctions": ",".join(jobFunctions) if jobFunctions else None,
        "employmentType": decorated['formattedEmploymentStatus'],
        "experience": decorated['formattedExperience'],
        "location": decorated['formattedLocation'],
        "jobTitle": jobPosting['title'],
        "companyName": jobPosting['companyName'],
        "industries": ",".join(industries) if industries else None,
        "totalViews": topCard['viewCount'],
        # URL of this posting (the original stored the search URL, which was
        # identical for every row).
        "url": jobUrl,
    }


def getjobs(companyName, location):
    """
    Collect job postings for `companyName` near `location` into totalJobs.

    First resolves `location` to a locationId through the geo lookup
    endpoint, then requests the job search results and visits every
    posting's canonical URL. One dict per successfully parsed posting is
    appended to the module-level `totalJobs` list; nothing is returned.

    A posting that cannot be fetched or parsed is reported and skipped so
    that the remaining postings are still collected.
    """
    geoUrl = ('https://www.linkedin.com/ta/geo?query='
              + _quoteParam(location) + '&orig=jserp')
    geoData = loads(getPage(geoUrl))
    try:
        locationId = geoData['resultList'][0]['id']
    except (KeyError, IndexError, TypeError):
        print("No location id found for %s" % (location,))
        return

    searchUrl = (
        'https://www.linkedin.com/jobs/searchRefresh?keywords='
        + _quoteParam(companyName)
        + '&refreshType=fullpage&locationId=' + ('%s' % (locationId,))
        + '&trk=jobs_jserp_search_button_execute&searchOrigin=JSERP'
    )
    try:
        searchData = loads(getPage(searchUrl))
        postings = searchData['decoratedJobPostingsModule']['elements']
    except (IOError, ValueError, KeyError, TypeError) as err:
        print("Could not load job list for %s: %r" % (companyName, err))
        return

    for job in postings:
        try:
            jobUrl = job['viewJobCanonicalUrl']
            item = _parseJobPage(getPage(jobUrl), jobUrl)
        except (IOError, ValueError, KeyError, TypeError, AttributeError) as err:
            # A single malformed posting must not discard the rest.
            print("Skipping job posting: %r" % (err,))
            continue
        if item is not None:
            totalJobs.append(item)
if __name__ == '__main__':
    # Script entry point: provide the location and companyName here, scrape
    # the matching postings, then dump them to finalData.csv.
    location = "New York"
    companyName = "Airbnb"
    getjobs(companyName, location)
    print("%s %s" % ("Total jobs got ", len(totalJobs)))

    # Column order of the CSV. 'employmentType' used to be listed twice,
    # which produced a duplicated column.
    fields = [
        'jobTitle', 'companyName', 'location', 'postedTime', 'totalViews',
        'jobDescription', 'industries', 'employmentType', 'experience',
        'jobFunctions', 'url',
    ]
    # The with-block closes the file even if writing a row raises.
    with open('finalData.csv', 'wb') as outFile:
        writer = DictWriter(outFile, fieldnames=fields, quoting=QUOTE_ALL, encoding="utf-8")
        writer.writeheader()
        for job in totalJobs:
            writer.writerow(job)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment