Skip to content

Instantly share code, notes, and snippets.

@jluczak
Created August 21, 2017 11:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jluczak/dd179ba6715b0d0cfe198ce7e7668583 to your computer and use it in GitHub Desktop.
Save jluczak/dd179ba6715b0d0cfe198ce7e7668583 to your computer and use it in GitHub Desktop.
Scrape companies from Crossweb
import scrapy
class CrosswebItem(scrapy.Item):
    """Record holding the data scraped for one Crossweb company page."""

    # Descriptive fields filled in by the spider.
    name = scrapy.Field()
    city = scrapy.Field()
    description = scrapy.Field()
    topics = scrapy.Field()

    # File-download fields. ``file_urls`` is set by the spider; ``files`` is
    # presumably populated by the files pipeline after download -- confirm
    # against crossweb.pipelines.
    file_urls = scrapy.Field()
    files = scrapy.Field()
# -*- coding: utf-8 -*-
# Scrapy settings for the "crossweb" project.

# --- Project identity and spider discovery ---------------------------------
BOT_NAME = 'crossweb'
NEWSPIDER_MODULE = 'crossweb.spiders'
SPIDER_MODULES = ['crossweb.spiders']

# --- Crawling policy --------------------------------------------------------
# Respect the target site's robots.txt rules.
ROBOTSTXT_OBEY = True

# --- Item processing --------------------------------------------------------
# Directory used as the storage location for downloaded files.
FILES_STORE = 'crossweb_photos'

# Enabled item pipelines mapped to their order value.
ITEM_PIPELINES = {'crossweb.pipelines.CrosswebPipeline': 1}
import scrapy
from crossweb.items import CrosswebItem
class ResearchSpider(scrapy.Spider):
    """Scrape company profiles linked from the Crossweb job listing pages.

    The spider starts from the first two listing pages, follows every
    company link it finds and yields one ``CrosswebItem`` per company page.
    """

    name = 'crossweb'
    start_urls = [
        'https://crossweb.pl/job/',
        'https://crossweb.pl/job/?page=2',
    ]

    def parse(self, response):
        """Yield a request for every company link on a listing page.

        Args:
            response: The downloaded listing page.

        Yields:
            scrapy.Request: One request per company link, handled by
            ``parse_book``.
        """
        for link in response.css('.company a'):
            href = link.css('a::attr(href)').extract_first()
            if not href:
                # An anchor without an href has nothing to follow; joining a
                # missing href would just point back at the listing page.
                continue
            yield scrapy.Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Extract one company's details from its profile page.

        Args:
            response: The downloaded company page.

        Yields:
            CrosswebItem: The scraped company. Text fields are ``None`` when
            their selector matches nothing; ``topics`` and ``file_urls`` are
            lists (possibly empty).
        """
        name = response.css('#container > h1::text').extract_first()
        city = response.css(
            '#content > section:nth-child(3) > div.param > div:nth-child(1) > span::text'
        ).extract_first()
        # Only the first text node of the first matching paragraph is kept.
        description = response.css('#eventText > p::text').extract_first()
        topics = response.css(
            '#content > section:nth-child(5) > div.param > div:nth-child(1) > span::text'
        ).extract()

        photo_src = response.css(
            '#container > div.company-photo > img:nth-child(2)::attr(src)'
        ).extract_first()
        # Scrapy's files pipeline expects ``file_urls`` to be a list of
        # absolute URLs: a bare string would be iterated character by
        # character, and a relative ``src`` could not be downloaded.
        # NOTE(review): crossweb.pipelines.CrosswebPipeline is not visible
        # here -- confirm it follows the standard ``file_urls`` contract.
        file_urls = [response.urljoin(photo_src)] if photo_src else []

        yield CrosswebItem(
            name=name,
            city=city,
            description=description,
            topics=topics,
            file_urls=file_urls,
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment