@candidosales
Last active April 3, 2019 11:17
Tutorial: building a crawler with Scrapy

Requirements

Python 3 and Poetry.

Project Setup

First, let's create our new project; we'll call it your-project:

poetry new your-project

Add the Scrapy dependency

cd your-project 
poetry add scrapy
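
If you want to confirm that the dependency was added, Poetry can show details of the installed package (the exact version listed will depend on when you run this):

poetry show scrapy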

Install virtualenv

pip install virtualenv

Configure virtualenv

virtualenv --python='/usr/local/bin/python3' venv

Activate the Python 3 environment

source venv/bin/activate
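
With the environment active, you can check that Scrapy is available (if the command is not found, make sure Scrapy is installed inside this environment):

scrapy version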

Script to crawl a single page

crawler_item.py

import scrapy

class BrickSetSpider(scrapy.Spider):
    name = 'jusnavigandi'
    start_urls = ['https://jus.com.br/peticoes/70262/modelo-de-acao-de-obrigacao-de-fazer-com-dano-moral-dano-material-e-pedido-de-tutela-antecipada']

    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    def parse(self, response):
        # The whole publication lives inside the div#publication container
        SET_SELECTOR = 'div#publication'
        for brickset in response.css(SET_SELECTOR):
            TITLE_SELECTOR = 'h1.titulo.entry-title::text'
            AUTHORS_SELECTOR = 'p.autor a::text'
            CATEGORIES_SELECTOR = '.catline ul li a::text'
            ABSTRACT_SELECTOR = 'div.abstract p::text'
            CONTENT_SELECTOR = 'div.jtext ::text'

            yield {
                'title': self.clear_content(brickset.css(TITLE_SELECTOR).extract_first()),
                'authors': brickset.css(AUTHORS_SELECTOR).extract(),
                'abstract': self.clear_content(brickset.css(ABSTRACT_SELECTOR).extract_first()),
                'categories': brickset.css(CATEGORIES_SELECTOR).extract(),
                'content': self.clear_content(' '.join(brickset.css(CONTENT_SELECTOR).extract()))
            }

    def clear_content(self, content):
        # Remove line breaks and tabs, and turn non-breaking spaces into plain spaces
        if content:
            return content.replace('\n', '').replace('\r', '').replace('\t', '').replace(u'\xa0', u' ').strip()
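
The clear_content helper only normalizes whitespace in the extracted strings. A quick illustration, run from a Python shell in the same directory (it assumes crawler_item.py is importable from there):

from crawler_item import BrickSetSpider

spider = BrickSetSpider()
# Line breaks, tabs and non-breaking spaces are removed before the item is stored
print(spider.clear_content('\n\tModelo de ação\xa0de obrigação de fazer \n'))
# Modelo de ação de obrigação de fazer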

Run the crawler

scrapy runspider crawler_item.py -o result.json
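
To take a quick look at the scraped item, the exported JSON feed can be loaded back into Python (a minimal sketch, assuming the result.json produced by the command above):

import json

# Load the items exported by the spider and print the title of each one
with open('result.json', encoding='utf-8') as f:
    items = json.load(f)

for item in items:
    print(item['title'])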

Script to crawl multiple pages

crawler_page.py

import scrapy
from scrapy.http.request import Request

class BrickSetSpider(scrapy.Spider):
    name = 'jusnavigandi'    

    # start_url = 'https://jus.com.br/peticoes'
    # start_urls = get_urls(['https://jus.com.br/peticoes'])

    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DOWNLOAD_DELAY': 1,
    }

    def start_requests(self):
        # Schedule a request for every listing page built by get_urls
        urls = self.get_urls(['https://jus.com.br/peticoes'])
        for url in urls:
            yield Request(url, self.parse)

    def get_urls(self, urls):
        # Listing pages 2 through 68 live under /peticoes/p/<page number>
        for x in range(2, 69):
            urls.append('https://jus.com.br/peticoes/p/' + str(x))
        return urls

    def parse(self, response):
        # Follow every petition link on the listing page; the petition pages are
        # handled by this same callback and matched by div#publication below
        for next_page in response.css('div.texts > ul > li.item > article > h4 a::attr(href)').extract():
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse, dont_filter=True)

        SET_SELECTOR = 'div#publication'
        for brickset in response.css(SET_SELECTOR):
            TITLE_SELECTOR = 'h1.titulo.entry-title::text'
            AUTHORS_SELECTOR = 'p.autor a::text'
            CATEGORIES_SELECTOR = '.catline ul li a::text'
            ABSTRACT_SELECTOR = 'div.abstract p::text'
            CONTENT_SELECTOR = 'div.jtext ::text'

            yield {
                'title': self.clear_content(brickset.css(TITLE_SELECTOR).extract_first()),
                'authors': brickset.css(AUTHORS_SELECTOR).extract(),
                'abstract': self.clear_content(brickset.css(ABSTRACT_SELECTOR).extract_first()),
                'categories': brickset.css(CATEGORIES_SELECTOR).extract(),
                'content': self.clear_content(' '.join(brickset.css(CONTENT_SELECTOR).extract()))
            }

    def clear_content(self, content):
        # Remove line breaks and tabs, and turn non-breaking spaces into plain spaces
        if content:
            return content.replace('\n', '').replace('\r', '').replace('\t', '').replace(u'\xa0', u' ').strip()

Run the crawler

scrapy runspider crawler_page.py -o result.json
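
As an alternative to runspider, the spider can also be launched from a small Python script with Scrapy's CrawlerProcess (a sketch, assuming crawler_page.py sits next to this script; the feed settings mirror the command-line options above):

from scrapy.crawler import CrawlerProcess

from crawler_page import BrickSetSpider

# Write the items to result.json as UTF-8 JSON, like the CLI command above
process = CrawlerProcess(settings={
    'FEED_FORMAT': 'json',
    'FEED_URI': 'result.json',
    'FEED_EXPORT_ENCODING': 'utf-8',
})
process.crawl(BrickSetSpider)
process.start()  # blocks until the crawl is finished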

