@majacaci00
Created November 2, 2016 06:39
In-class lab: put this file in the "spiders" folder of a Scrapy project. Make sure you set DOWNLOAD_DELAY to 4 seconds while you're testing your spider. Remove the delay once you've debugged your spider, then let it fly. Please avoid running your crawling processes at full speed more than necessary!
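For reference, the throttle mentioned above is Scrapy's standard DOWNLOAD_DELAY setting. A minimal sketch of the relevant lines in the project's settings.py:

# settings.py -- throttle requests while testing; remove or lower once debugged
DOWNLOAD_DELAY = 4
# If you prefer an adaptive delay, Scrapy can manage it for you instead:
# AUTOTHROTTLE_ENABLED = True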
## scrapy crawl indeed_base -o indeed_raw.json
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from bs4 import BeautifulSoup

from indeed.items import IndeedItem


class IndeedSpider(CrawlSpider):
    name = "indeed_base"
    allowed_domains = ["indeed.com", "indeed.co.uk", "de.indeed.com",
                       "indeed.com.br", "indeed.es", "indeed.hk"]
    # Pass 301/302 responses through to the callback instead of letting the
    # redirect middleware swallow them
    handle_httpstatus_list = [301, 302]
    # Uncomment one (or more) of the search URLs below to pick the market to crawl
    start_urls = [
        # San Francisco
        # "http://www.indeed.com/jobs?q=data+scientist&l=San+Francisco%2C+CA",
        # New York
        # "http://www.indeed.com/jobs?q=data+science&l=New+York%2C+NY",
        # London
        # "http://www.indeed.co.uk/data-scientist-jobs-in-london",
        # Minneapolis
        # "http://www.indeed.com/jobs?q=data+scientist&l=Minneapolis%2C+MN",
        # Texas
        # "http://www.indeed.com/jobs?q=data+scientist&l=Texas",
        # Illinois
        # "http://www.indeed.com/jobs?q=data+scientist&l=Illinois",
        # Massachusetts
        # "http://www.indeed.com/jobs?q=data+scientist&l=Massachusetts",
        # Berlin
        # "http://de.indeed.com/Jobs?q=Data+Science&l=Berlin",
        # Brazil
        # "http://www.indeed.com.br/empregos?q=data+science&l=",
        # Spain
        # "http://www.indeed.es/ofertas?q=data+science&l=",
        # Hong Kong
        # "http://www.indeed.hk/jobs?q=data+science&l=",
    ]
    rules = (
        # Alternative: follow the "next" pagination button directly
        # Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
        #      callback="parse_indeed_results", follow=True),
        # Follow pagination links (Indeed pages its results with a "start"
        # query parameter, e.g. &start=10), but never the login page
        Rule(LinkExtractor(deny=('account/login',), allow=(),
                           restrict_xpaths=("//a[contains(@href, 'start')]",)),
             callback="parse_indeed_results", follow=True),
    )
    def parse_indeed_results(self, response):
        # To extract more fields, add their xpath queries here
        xpaths = {
            "title": './/a[@data-tn-element="jobTitle"]/@title',
            "summary": './/span[@class="summary"]',
        }
        # Each result row lives in a div under the results column
        for sel in response.xpath("//td[@id='resultsCol']/div"):
            item = IndeedItem()
            total_result_extracted = False
            # Run the xpath queries in sequence
            for key, xpath_query in xpaths.items():
                # Run the xpath query against the current result row
                extracted = sel.xpath(xpath_query).extract()
                # Make sure it found something
                if len(extracted) > 0:
                    # The summary is spread across nested spans, so use
                    # BeautifulSoup to pull everything out as plain text
                    # without complicated parsing or joining methods
                    if key == "summary":
                        soup = BeautifulSoup(extracted[0], 'html.parser')
                        item[key] = soup.get_text()
                    else:
                        item[key] = extracted[0]
                    # At least one extracted field qualifies the row
                    total_result_extracted = True
            # If we extracted at least one field for this result, yield it
            if total_result_extracted:
                yield item
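
The spider imports IndeedItem from indeed.items. A minimal sketch of that items.py, assuming only the two fields the spider actually populates (add a Field for each new xpath query you define):

# items.py -- minimal sketch; just the fields used by parse_indeed_results
import scrapy

class IndeedItem(scrapy.Item):
    title = scrapy.Field()
    summary = scrapy.Field()

With the item defined and a start URL uncommented, the command at the top of the file (scrapy crawl indeed_base -o indeed_raw.json) writes one JSON object per scraped result.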