Skip to content

Instantly share code, notes, and snippets.

Last active April 3, 2019 11:17
Show Gist options
  • Save candidosales/1a1044a6236e64323222cce7db90404f to your computer and use it in GitHub Desktop.
Save candidosales/1a1044a6236e64323222cce7db90404f to your computer and use it in GitHub Desktop.
Tutorial: building a crawler with Scrapy



Project Setup

First, let's create our new project with Poetry; let's call it `your-project`:

poetry new your-project

Add dependency

cd your-project 
poetry add scrapy

Install virtualenv

pip install virtualenv

Configure virtualenv

virtualenv --python='/usr/local/bin/python3' venv

Activate the Python 3 environment

source venv/bin/activate

Script to crawl a single page

import scrapy

class BrickSetSpider(scrapy.Spider):
    """Scrape the publication found on each page listed in ``start_urls``.

    One item is yielded per ``div#publication`` element in the response,
    with the keys ``title``, ``authors``, ``abstract``, ``categories`` and
    ``content``.
    """

    name = 'jusnavigandi'
    # TODO: the URL is blank in this tutorial; put the page to crawl here.
    start_urls = ['']

    # Per-spider settings: write the exported feed as UTF-8.
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    def parse(self, response):
        """Yield one item dict for every publication block in *response*.

        ``title``, ``abstract`` and ``content`` are cleaned with
        :meth:`clear_content` (and are ``None`` when nothing was matched);
        ``authors`` and ``categories`` are lists of the raw matched strings.
        """
        # The selectors do not depend on the loop variable, so define them once.
        SET_SELECTOR = 'div#publication'
        TITLE_SELECTOR = 'h1.titulo.entry-title::text'
        AUTHORS_SELECTOR = 'p.autor a::text'
        CATEGORIES_SELECTOR = '.catline ul li a::text'
        ABSTRACT_SELECTOR = 'div.abstract p::text'
        CONTENT_SELECTOR = 'div.jtext ::text'

        for brickset in response.css(SET_SELECTOR):
            # All text nodes under the content selector are joined with
            # single spaces before cleaning.
            content = ' '.join(brickset.css(CONTENT_SELECTOR).extract())

            yield {
                'title': self.clear_content(brickset.css(TITLE_SELECTOR).extract_first()),
                'authors': brickset.css(AUTHORS_SELECTOR).extract(),
                'abstract': self.clear_content(brickset.css(ABSTRACT_SELECTOR).extract_first()),
                'categories': brickset.css(CATEGORIES_SELECTOR).extract(),
                'content': self.clear_content(content),
            }

    def clear_content(self, content):
        """Return *content* with layout whitespace removed.

        Newlines, carriage returns and tabs are deleted, non-breaking spaces
        become regular spaces, and leading/trailing whitespace is stripped.

        Returns ``None`` when *content* is ``None`` or an empty string.
        """
        if not content:
            return None
        return (
            content.replace('\n', '')
            .replace('\r', '')
            .replace('\t', '')
            .replace('\xa0', ' ')
            .strip()
        )

Run the crawler (pass the path of the file where you saved the spider above)

scrapy runspider your_spider.py -o result.json

Script to crawl multiple pages

import scrapy
from scrapy.http.request import Request

class BrickSetSpider(scrapy.Spider):
    """Crawl a paginated listing and scrape every publication it links to.

    The initial requests are built in :meth:`start_requests`. Each response
    is handled by :meth:`parse`, which both follows the article links found
    on the page and yields an item for every ``div#publication`` element,
    with the keys ``title``, ``authors``, ``abstract``, ``categories`` and
    ``content``.
    """

    name = 'jusnavigandi'

    # Per-spider settings: write the exported feed as UTF-8 and set
    # Scrapy's DOWNLOAD_DELAY to 1.
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DOWNLOAD_DELAY': 1,
    }

    def start_requests(self):
        """Yield one request per URL returned by :meth:`get_urls`."""
        # TODO: the first-page URL is blank in this tutorial; fill it in.
        for url in self.get_urls(['']):
            yield Request(url, self.parse)

    def get_urls(self, urls):
        """Append the URLs of pages 2 through 68 to *urls* and return it.

        *urls* is extended in place and the same list object is returned.
        """
        # TODO: the URL prefix is blank in this tutorial; fill it in.
        urls.extend('' + str(page) for page in range(2, 69))
        return urls

    def parse(self, response):
        """Follow article links in *response* and yield publication items.

        ``title``, ``abstract`` and ``content`` are cleaned with
        :meth:`clear_content` (and are ``None`` when nothing was matched);
        ``authors`` and ``categories`` are lists of the raw matched strings.
        """
        LINK_SELECTOR = 'div.texts > ul > li.item > article > h4 a::attr(href)'
        for href in response.css(LINK_SELECTOR).extract():
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield scrapy.Request(
                response.urljoin(href), callback=self.parse, dont_filter=True
            )

        # The selectors do not depend on the loop variable, so define them once.
        SET_SELECTOR = 'div#publication'
        TITLE_SELECTOR = 'h1.titulo.entry-title::text'
        AUTHORS_SELECTOR = 'p.autor a::text'
        CATEGORIES_SELECTOR = '.catline ul li a::text'
        ABSTRACT_SELECTOR = 'div.abstract p::text'
        CONTENT_SELECTOR = 'div.jtext ::text'

        for brickset in response.css(SET_SELECTOR):
            # All text nodes under the content selector are joined with
            # single spaces before cleaning.
            content = ' '.join(brickset.css(CONTENT_SELECTOR).extract())

            yield {
                'title': self.clear_content(brickset.css(TITLE_SELECTOR).extract_first()),
                'authors': brickset.css(AUTHORS_SELECTOR).extract(),
                'abstract': self.clear_content(brickset.css(ABSTRACT_SELECTOR).extract_first()),
                'categories': brickset.css(CATEGORIES_SELECTOR).extract(),
                'content': self.clear_content(content),
            }

    def clear_content(self, content):
        """Return *content* with layout whitespace removed.

        Newlines, carriage returns and tabs are deleted, non-breaking spaces
        become regular spaces, and leading/trailing whitespace is stripped.

        Returns ``None`` when *content* is ``None`` or an empty string.
        """
        if not content:
            return None
        return (
            content.replace('\n', '')
            .replace('\r', '')
            .replace('\t', '')
            .replace('\xa0', ' ')
            .strip()
        )

Run the crawler (pass the path of the file where you saved the spider above)

scrapy runspider your_spider.py -o result.json


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment