
@hadoan
Created June 2, 2020 07:45
# -*- coding: utf-8 -*-
import scrapy
import logging
from datetime import date
import sys

# Write DEBUG-level logs to a dated file (the logs/ directory must already exist)
# and mirror everything to stdout.
logfile = "logs/log" + date.today().strftime("%Y%m%d") + ".txt"
logging.basicConfig(filename=logfile, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))


class StackoverflowSpider(scrapy.Spider):
    name = 'stackoverflow'
    allowed_domains = ['stackoverflow.com']
    start_urls = ['https://stackoverflow.com/jobs?id=395682&v=true']

    def parse(self, response):
        # Container that wraps all job listings on the page.
        listResults = response.xpath('//div[@class="listResults"]')

        # Company logo image URLs; ".//" keeps the query relative to listResults.
        companyLogoUrls = listResults.xpath(".//div[@class='grid']/img/@src")
        for logoUrl in companyLogoUrls:
            logging.info(logoUrl.get())

        # Job title links: log the href and the link text for each posting.
        jobs = listResults.xpath(".//div[@class='grid']/div/h2/a")
        for job in jobs:
            logging.info(job.xpath('@href').get())
            logging.info(job.xpath('text()').get())
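
A minimal sketch of running the spider programmatically with Scrapy's CrawlerProcess, assuming the gist is saved as stackoverflow_spider.py (the filename is an assumption, not part of the gist); it can also be run with the scrapy CLI instead.

# run_spider.py -- minimal sketch, assumes the spider above lives in stackoverflow_spider.py
from scrapy.crawler import CrawlerProcess
from stackoverflow_spider import StackoverflowSpider

process = CrawlerProcess()
process.crawl(StackoverflowSpider)
process.start()  # blocks until the crawl finishes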