"""
Code modified from https://jessesw.com/Data-Science-Skills/
Jeremy Karnowski August 30, 2015
"""
from bs4 import BeautifulSoup # For HTML parsing
import urllib2 # Website connections
import tldextract # Extracts domain information
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections

def cleanIndeedWebsite(website):
    # Check if the site is from indeed.com (or the formatting will be different)
    redirect = urllib2.urlopen(website)
    redirect_domain = tldextract.extract(redirect.geturl()).domain
    # If from indeed.com, grab the cleaned-up job posting
    if redirect_domain == 'indeed':
        site = redirect.read()
        soup_obj = BeautifulSoup(site)
        # Drop tags that carry no posting text
        for item in soup_obj(["script",
                              "style",
                              "head",
                              "noscript",
                              "separator_top"]):
            item.extract()
        # Drop Indeed's navigation, footer, and apply widgets by div id
        div_ids = ["g_nav", "footerWrapper", "indeed_apply", "emailApplyWrapper"]
        for item in div_ids:
            it = soup_obj.find("div", {"id": item})
            if it is not None:  # not every posting contains every element
                it.extract()
        div_classes = ["result-link-bar-container result-link-bar-viewjob",
                       "tab-container"]
        for item in div_classes:
            it = soup_obj.find("div", {"class": item})
            if it is not None:
                it.extract()
        table_classes = ["lnav"]
        for item in table_classes:
            it = soup_obj.find("table", {"class": item})
            if it is not None:
                it.extract()
        soup_obj = soup_obj.decode('unicode_escape').encode('ascii', 'ignore')
        return soup_obj
    else:
        return
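
# A minimal sketch of calling cleanIndeedWebsite on its own, kept commented out
# so importing this module has no side effects. The URL below is a hypothetical
# placeholder, not a real posting; anything that does not redirect to
# indeed.com returns None, so check before writing to disk.
#
# example_url = 'http://www.indeed.com/rc/clk?jk=0123456789abcdef'
# posting = cleanIndeedWebsite(example_url)
# if posting is not None:
#     with open('example_posting.txt', 'w') as f:
#         f.write(posting)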

def indeedSearch(city=None, state=None, final_job='data scientist', save_dir=None):
    '''
    This function takes a desired city/state and looks for all new job postings
    on Indeed.com. It crawls the job postings, cleans each one with
    cleanIndeedWebsite, and saves its text to disk.

    Inputs: The location's city and state and the desired job. These are optional.
    If no city/state is given, the function assumes a national search (this can
    take a while!!!). Pass the city/state as strings, such as
    indeedSearch('Chicago', 'IL'), using the two-letter abbreviation for the
    state. A multi-word job search needs plus signs between words in the URL;
    for example, a search for "data scientist" would use the search term
    "data+scientist".

    Output: Saved files containing the text of the indeed.com job postings.
    '''
    # final_job = final_job.replace(' ', '+')  # commented out to keep the exact phrase for now

    # Make sure a city with more than one word (such as San Francisco) is formatted properly
    if city is not None:
        final_city = city.split()
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                           '%2C+', state]  # Join the pieces so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']
    final_site = ''.join(final_site_list)  # Merge the URL pieces into one string

    base_url = 'http://www.indeed.com'

    try:
        html = urllib2.urlopen(final_site).read()  # Open the front page of the search first
    except:
        print 'That city/state combination did not have any jobs. Exiting . . .'  # In case the city is invalid
        return
    soup = BeautifulSoup(html)  # Parse the html of the first page

    # Now find out how many jobs there were
    num_jobs_area = soup.find(id='searchCount').string.encode('utf-8')  # The 'searchCount' element holds the total, e.g. 'Jobs 1 to 10 of 1,034'
    job_numbers = re.findall(r'\d+', num_jobs_area)  # Extract the numbers from the search count
    print job_numbers

    if len(job_numbers) > 3:  # More than 999 jobs, so the thousands comma split the total in two
        total_num_jobs = (int(job_numbers[2]) * 1000) + int(job_numbers[3])
    else:
        total_num_jobs = int(job_numbers[2])

    city_title = city
    if city is None:
        city_title = 'Nationwide'

    print 'There were', total_num_jobs, 'jobs found,', city_title  # Display how many jobs were found

    num_pages = total_num_jobs / 10  # How many search result pages there are to iterate over
    job_descriptions = []  # Store all the cleaned postings in this list

    # for i in xrange(1, num_pages + 1):  # Loop through all of the search result pages
    for i in xrange(1, 100):  # Just loop through the first 100 pages. After this, posts get old and aren't from indeed.com
        print 'Getting page', i
        start_num = str(i * 10)  # Assign the multiplier of 10 to view the page we want
        current_page = ''.join([final_site, '&start=', start_num])
        # Now that we can view the correct 10 job results, collect the text from each
        html_page = urllib2.urlopen(current_page).read()  # Get the page
        page_obj = BeautifulSoup(html_page)
        job_link_area = page_obj.find(id='resultsCol')  # The center column on the page where the job postings are
        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')
                    if link.get('href') is not None]  # Get the URLs on the page, skipping bare anchors
        job_URLS = filter(lambda x: 'clk' in x, job_URLS)  # Keep just the job-related URLs
        for j in xrange(0, len(job_URLS)):
            try:
                final_description = cleanIndeedWebsite(job_URLS[j])
            except:
                final_description = None
            if final_description:
                # For each job, save the text on our machine
                job_descriptions.append(final_description)
                tmp_file = open(save_dir + city_title + "_" + str(i) + "_" + str(j) + '.txt', 'w')
                tmp_file.write(final_description)
                tmp_file.close()
            sleep(1)  # So that we don't hammer the server. A fast connection could hit it very often!

    print 'Done with collecting the job postings!'
    print 'There were', len(job_descriptions), 'jobs successfully found.'
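

# A minimal sketch of running the scraper as a script, assuming the imports
# above are installed and that the directory 'indeed_postings/' already exists
# and is writable (the directory name is a hypothetical choice, not from the
# gist). Note that save_dir is joined to the filename by plain string
# concatenation, so it should end with a path separator.
if __name__ == '__main__':
    indeedSearch(city='Chicago', state='IL',
                 final_job='data scientist',
                 save_dir='indeed_postings/')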