"""
Code modified from https://jessesw.com/Data-Science-Skills/
Jeremy Karnowski August 30, 2015
"""
from bs4 import BeautifulSoup # For HTML parsing
import urllib2 # Website connections
import tldextract # Extracts domain information
import re # Regular expressions
from time import sleep # To prevent overwhelming the server between connections

def cleanIndeedWebsite(website):
    # Check if the site is from indeed.com (or the formatting will be different)
    redirect = urllib2.urlopen(website)
    redirect_domain = tldextract.extract(redirect.geturl()).domain
    # If from indeed.com, grab the cleaned-up job posting
    if redirect_domain == 'indeed':
        site = redirect.read()
        soup_obj = BeautifulSoup(site)
        # Drop tags that carry no posting text
        for item in soup_obj(["script",
                              "style",
                              "head",
                              "noscript",
                              "separator_top"]):
            item.extract()
        # Drop Indeed's navigation, footer, and apply widgets by div id
        div_ids = ["g_nav", "footerWrapper", "indeed_apply", "emailApplyWrapper"]
        for item in div_ids:
            it = soup_obj.find("div", {"id": item})
            if it is not None:  # not every posting contains every element
                it.extract()
        div_classes = ["result-link-bar-container result-link-bar-viewjob",
                       "tab-container"]
        for item in div_classes:
            it = soup_obj.find("div", {"class": item})
            if it is not None:
                it.extract()
        table_classes = ["lnav"]
        for item in table_classes:
            it = soup_obj.find("table", {"class": item})
            if it is not None:
                it.extract()
        soup_obj = soup_obj.decode('unicode_escape').encode('ascii', 'ignore')
        return soup_obj
    else:
        return
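
# A minimal sketch of calling cleanIndeedWebsite on its own, kept commented out
# so importing this module has no side effects. The URL below is a hypothetical
# placeholder, not a real posting; anything that does not redirect to
# indeed.com returns None, so check before writing to disk.
#
# example_url = 'http://www.indeed.com/rc/clk?jk=0123456789abcdef'
# posting = cleanIndeedWebsite(example_url)
# if posting is not None:
#     with open('example_posting.txt', 'w') as f:
#         f.write(posting)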

def indeedSearch(city=None, state=None, final_job='data scientist', save_dir=None):
    '''
    This function takes a desired city/state and looks for all new job postings
    on Indeed.com. It crawls the job postings, cleans each one with
    cleanIndeedWebsite, and saves its text to disk.

    Inputs: The location's city and state and the desired job. These are optional.
    If no city/state is given, the function assumes a national search (this can
    take a while!!!). Pass the city/state as strings, such as
    indeedSearch('Chicago', 'IL'), using the two-letter abbreviation for the
    state. A multi-word job search needs plus signs between words in the URL;
    for example, a search for "data scientist" would use the search term
    "data+scientist".

    Output: Saved files containing the text of the indeed.com job postings.
    '''
    # final_job = final_job.replace(' ', '+')  # commented out to keep the exact phrase for now

    # Make sure a city with more than one word (such as San Francisco) is formatted properly
    if city is not None:
        final_city = city.split()
        final_city = '+'.join(word for word in final_city)
        final_site_list = ['http://www.indeed.com/jobs?q=%22', final_job, '%22&l=', final_city,
                           '%2C+', state]  # Join the pieces so that indeed will search correctly
    else:
        final_site_list = ['http://www.indeed.com/jobs?q="', final_job, '"']
    final_site = ''.join(final_site_list)  # Merge the URL pieces into one string

    base_url = 'http://www.indeed.com'

    try:
        html = urllib2.urlopen(final_site).read()  # Open the front page of the search first
    except:
        print 'That city/state combination did not have any jobs. Exiting . . .'  # In case the city is invalid
        return
    soup = BeautifulSoup(html)  # Parse the html of the first page

    # Now find out how many jobs there were
    num_jobs_area = soup.find(id='searchCount').string.encode('utf-8')  # The 'searchCount' element holds the total, e.g. 'Jobs 1 to 10 of 1,034'
    job_numbers = re.findall(r'\d+', num_jobs_area)  # Extract the numbers from the search count
    print job_numbers

    if len(job_numbers) > 3:  # More than 999 jobs, so the thousands comma split the total in two
        total_num_jobs = (int(job_numbers[2]) * 1000) + int(job_numbers[3])
    else:
        total_num_jobs = int(job_numbers[2])

    city_title = city
    if city is None:
        city_title = 'Nationwide'

    print 'There were', total_num_jobs, 'jobs found,', city_title  # Display how many jobs were found

    num_pages = total_num_jobs / 10  # How many search result pages there are to iterate over
    job_descriptions = []  # Store all the cleaned postings in this list

    # for i in xrange(1, num_pages + 1):  # Loop through all of the search result pages
    for i in xrange(1, 100):  # Just loop through the first 100 pages. After this, posts get old and aren't from indeed.com
        print 'Getting page', i
        start_num = str(i * 10)  # Assign the multiplier of 10 to view the page we want
        current_page = ''.join([final_site, '&start=', start_num])
        # Now that we can view the correct 10 job results, collect the text from each
        html_page = urllib2.urlopen(current_page).read()  # Get the page
        page_obj = BeautifulSoup(html_page)
        job_link_area = page_obj.find(id='resultsCol')  # The center column on the page where the job postings are
        job_URLS = [base_url + link.get('href') for link in job_link_area.find_all('a')
                    if link.get('href') is not None]  # Get the URLs on the page, skipping bare anchors
        job_URLS = filter(lambda x: 'clk' in x, job_URLS)  # Keep just the job-related URLs
        for j in xrange(0, len(job_URLS)):
            try:
                final_description = cleanIndeedWebsite(job_URLS[j])
            except:
                final_description = None
            if final_description:
                # For each job, save the text on our machine
                job_descriptions.append(final_description)
                tmp_file = open(save_dir + city_title + "_" + str(i) + "_" + str(j) + '.txt', 'w')
                tmp_file.write(final_description)
                tmp_file.close()
            sleep(1)  # So that we don't hammer the server. A fast connection could hit it very often!

    print 'Done with collecting the job postings!'
    print 'There were', len(job_descriptions), 'jobs successfully found.'
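

# A minimal sketch of running the scraper as a script, assuming the imports
# above are installed and that the directory 'indeed_postings/' already exists
# and is writable (the directory name is a hypothetical choice, not from the
# gist). Note that save_dir is joined to the filename by plain string
# concatenation, so it should end with a path separator.
if __name__ == '__main__':
    indeedSearch(city='Chicago', state='IL',
                 final_job='data scientist',
                 save_dir='indeed_postings/')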