Tutorial: crawling with Scrapy

Requirements

This tutorial assumes Python 3, pip, and Poetry (https://python-poetry.org) are installed.

Project Setup

First, let's create our new project; let's call it your-project:

poetry new your-project
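poetry new scaffolds a standard package layout. The exact files vary by Poetry version, but it looks roughly like this:

your-project
├── pyproject.toml
├── README.md
├── your_project
│   └── __init__.py
└── tests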

Add the Scrapy dependency

cd your-project 
poetry add scrapy
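poetry add records the dependency in pyproject.toml. The resulting entry looks roughly like the following (the version constraints here are illustrative; Poetry pins whatever release is current):

[tool.poetry.dependencies]
python = "^3.7"
scrapy = "^1.6"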

Install virtualenv

pip install virtualenv

Configure virtualenv

Create an environment pinned to Python 3 (the interpreter path below may differ on your machine; which python3 prints yours):

virtualenv --python='/usr/local/bin/python3' venv

Activate the Python 3 environment

source venv/bin/activate
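With the environment active, the shell should resolve python to the venv's interpreter; you can verify it (output varies by machine):

which python
python --version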

Script to crawl a single page

crawler_item.py

import scrapy


class BrickSetSpider(scrapy.Spider):
    name = 'jusnavigandi'
    start_urls = ['https://jus.com.br/peticoes/70262/modelo-de-acao-de-obrigacao-de-fazer-com-dano-moral-dano-material-e-pedido-de-tutela-antecipada']

    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    def parse(self, response):
        # The whole publication lives inside a single <div id="publication"> block.
        SET_SELECTOR = 'div#publication'
        for brickset in response.css(SET_SELECTOR):
            TITLE_SELECTOR = 'h1.titulo.entry-title::text'
            AUTHORS_SELECTOR = 'p.autor a::text'
            CATEGORIES_SELECTOR = '.catline ul li a::text'
            ABSTRACT_SELECTOR = 'div.abstract p::text'
            # 'div.jtext ::text' collects the text of every descendant node.
            CONTENT_SELECTOR = 'div.jtext ::text'

            yield {
                'title': self.clear_content(brickset.css(TITLE_SELECTOR).extract_first()),
                'authors': brickset.css(AUTHORS_SELECTOR).extract(),
                'abstract': self.clear_content(brickset.css(ABSTRACT_SELECTOR).extract_first()),
                'categories': brickset.css(CATEGORIES_SELECTOR).extract(),
                'content': self.clear_content(' '.join(brickset.css(CONTENT_SELECTOR).extract()))
            }

    def clear_content(self, content):
        # Strip line breaks, tabs and non-breaking spaces; returns None for empty input.
        if content:
            return content.replace('\n', '').replace('\r', '').replace('\t', '').replace(u'\xa0', u' ').strip()
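To sanity-check the CSS selectors without hitting the live site, you can run Scrapy's Selector against a static HTML snippet, e.g. in a Python shell. A minimal sketch (the HTML fragment below is invented for illustration; it only mirrors the structure the spider expects):

from scrapy.selector import Selector

# Invented HTML fragment with the same structure the selectors target.
html = '''
<div id="publication">
  <h1 class="titulo entry-title">Modelo de ação</h1>
  <p class="autor"><a href="#">Fulano de Tal</a></p>
  <div class="abstract"><p>Resumo da petição.</p></div>
</div>
'''

sel = Selector(text=html)
print(sel.css('h1.titulo.entry-title::text').extract_first())  # Modelo de ação
print(sel.css('p.autor a::text').extract())                    # ['Fulano de Tal']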

Run the crawler

scrapy runspider crawler_item.py -o result.json
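Note that -o appends to result.json if the file already exists, which produces invalid JSON, so delete it between runs; on Scrapy 2.0+ you can overwrite instead:

scrapy runspider crawler_item.py -O result.json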

Script to crawl multiple pages

crawler_page.py

import scrapy
from scrapy.http.request import Request


class BrickSetSpider(scrapy.Spider):
    name = 'jusnavigandi'

    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
        # Wait one second between requests to be polite to the server.
        'DOWNLOAD_DELAY': 1,
    }

    def start_requests(self):
        # Crawl every listing page instead of a single start URL.
        urls = self.get_urls(['https://jus.com.br/peticoes'])
        for url in urls:
            yield Request(url, self.parse)

    def get_urls(self, urls):
        # The listing is paginated as /peticoes/p/2 .. /peticoes/p/68;
        # page 1 is the base URL passed in.
        for x in range(2, 69):
            urls.append('https://jus.com.br/peticoes/p/' + str(x))
        return urls

    def parse(self, response):
        # Follow every article link found on a listing page.
        for next_page in response.css('div.texts > ul > li.item > article > h4 a::attr(href)').extract():
            next_page = response.urljoin(next_page)
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield scrapy.Request(next_page, callback=self.parse, dont_filter=True)

        # Only article pages contain <div id="publication">, so this loop
        # yields nothing on the listing pages themselves.
        SET_SELECTOR = 'div#publication'
        for brickset in response.css(SET_SELECTOR):
            TITLE_SELECTOR = 'h1.titulo.entry-title::text'
            AUTHORS_SELECTOR = 'p.autor a::text'
            CATEGORIES_SELECTOR = '.catline ul li a::text'
            ABSTRACT_SELECTOR = 'div.abstract p::text'
            CONTENT_SELECTOR = 'div.jtext ::text'

            yield {
                'title': self.clear_content(brickset.css(TITLE_SELECTOR).extract_first()),
                'authors': brickset.css(AUTHORS_SELECTOR).extract(),
                'abstract': self.clear_content(brickset.css(ABSTRACT_SELECTOR).extract_first()),
                'categories': brickset.css(CATEGORIES_SELECTOR).extract(),
                'content': self.clear_content(' '.join(brickset.css(CONTENT_SELECTOR).extract()))
            }

    def clear_content(self, content):
        # Strip line breaks, tabs and non-breaking spaces; returns None for empty input.
        if content:
            return content.replace('\n', '').replace('\r', '').replace('\t', '').replace(u'\xa0', u' ').strip()
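Hard-coding 68 listing pages breaks as soon as the site grows; an alternative is to follow the listing's own "next page" link. A sketch of that variant, assuming the listing exposes a rel="next" link (the selector is an assumption, not verified against the site):

import scrapy


class PaginatedSpider(scrapy.Spider):
    name = 'jusnavigandi_paginated'
    start_urls = ['https://jus.com.br/peticoes']

    def parse(self, response):
        # Queue every article link on the current listing page.
        ARTICLE_LINKS = 'div.texts > ul > li.item > article > h4 a::attr(href)'
        for href in response.css(ARTICLE_LINKS).extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_article)

        # Follow the "next" link if the listing exposes one (selector assumed).
        next_href = response.css('a[rel="next"]::attr(href)').extract_first()
        if next_href:
            yield scrapy.Request(response.urljoin(next_href), callback=self.parse)

    def parse_article(self, response):
        # Extract the fields exactly as in BrickSetSpider.parse above.
        ...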

Run the crawler

scrapy runspider crawler_page.py -o result.json
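The export is a JSON array with one object per crawled article; a quick way to inspect it:

import json

with open('result.json', encoding='utf-8') as f:
    items = json.load(f)

print(len(items))
print(items[0]['title'])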

