@majacaci00
Created November 2, 2016 06:39
In-class lab: put this file in the "spiders" folder of a Scrapy project. Make sure you set DOWNLOAD_DELAY to 4 seconds while you're testing your spider. Remove the delay once you've debugged your spider, then let it fly. Please avoid running your crawling processes at full speed more than necessary!
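For reference, the throttle mentioned above is Scrapy's standard DOWNLOAD_DELAY setting. A minimal sketch of the relevant lines in the project's settings.py:

# settings.py -- throttle requests while testing; remove or lower once debugged
DOWNLOAD_DELAY = 4
# If you prefer an adaptive delay, Scrapy can manage it for you instead:
# AUTOTHROTTLE_ENABLED = True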
## scrapy crawl indeed_base -o indeed_raw.json
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup

from indeed.items import IndeedItem


class IndeedSpider(CrawlSpider):
    name = "indeed_base"
    allowed_domains = ["indeed.com", "indeed.co.uk", "de.indeed.com",
                       "indeed.com.br", "indeed.es", "indeed.hk"]
    # Pass 301/302 responses through to the callback instead of letting the
    # redirect middleware swallow them
    handle_httpstatus_list = [301, 302]
    # Uncomment one (or more) of the search URLs below to pick the market to crawl
    start_urls = [
        # San Francisco
        # "http://www.indeed.com/jobs?q=data+scientist&l=San+Francisco%2C+CA",
        # New York
        # "http://www.indeed.com/jobs?q=data+science&l=New+York%2C+NY",
        # London
        # "http://www.indeed.co.uk/data-scientist-jobs-in-london",
        # Minneapolis
        # "http://www.indeed.com/jobs?q=data+scientist&l=Minneapolis%2C+MN",
        # Texas
        # "http://www.indeed.com/jobs?q=data+scientist&l=Texas",
        # Illinois
        # "http://www.indeed.com/jobs?q=data+scientist&l=Illinois",
        # Massachusetts
        # "http://www.indeed.com/jobs?q=data+scientist&l=Massachusetts",
        # Berlin
        # "http://de.indeed.com/Jobs?q=Data+Science&l=Berlin",
        # Brazil
        # "http://www.indeed.com.br/empregos?q=data+science&l=",
        # Spain
        # "http://www.indeed.es/ofertas?q=data+science&l=",
        # Hong Kong
        # "http://www.indeed.hk/jobs?q=data+science&l=",
    ]
    rules = (
        # Alternative: follow the "next" pagination button directly
        # Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
        #      callback="parse_indeed_results", follow=True),
        # Follow pagination links (Indeed pages its results with a "start"
        # query parameter, e.g. &start=10), but never the login page
        Rule(LinkExtractor(deny=('account/login',), allow=(),
                           restrict_xpaths=("//a[contains(@href, 'start')]",)),
             callback="parse_indeed_results", follow=True),
    )
    def parse_indeed_results(self, response):
        # To extract more fields, add their xpath queries here
        xpaths = {
            "title": './/a[@data-tn-element="jobTitle"]/@title',
            "summary": './/span[@class="summary"]',
        }
        # Each result row lives in a div under the results column
        for sel in response.xpath("//td[@id='resultsCol']/div"):
            item = IndeedItem()
            total_result_extracted = False
            # Run the xpath queries in sequence
            for key, xpath_query in xpaths.items():
                # Run the xpath query against the current result row
                extracted = sel.xpath(xpath_query).extract()
                # Make sure it found something
                if len(extracted) > 0:
                    # The summary is spread across nested spans, so use
                    # BeautifulSoup to pull everything out as plain text
                    # without complicated parsing or joining methods
                    if key == "summary":
                        soup = BeautifulSoup(extracted[0], 'html.parser')
                        item[key] = soup.get_text()
                    else:
                        item[key] = extracted[0]
                    # At least one extracted field qualifies the row
                    total_result_extracted = True
            # If we extracted at least one field for this result, yield it
            if total_result_extracted:
                yield item
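
The spider imports IndeedItem from indeed.items. A minimal sketch of that items.py, assuming only the two fields the spider actually populates (add a Field for each new xpath query you define):

# items.py -- minimal sketch; just the fields used by parse_indeed_results
import scrapy

class IndeedItem(scrapy.Item):
    title = scrapy.Field()
    summary = scrapy.Field()

With the item defined and a start URL uncommented, the command at the top of the file (scrapy crawl indeed_base -o indeed_raw.json) writes one JSON object per scraped result.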