- Python 3.7+
- virtualenv
- Poetry (https://poetry.eustace.io/), a package and dependency manager
First, let's create the new project; here we'll call it `your-project`:

```sh
poetry new your-project
```
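Poetry generates a project skeleton similar to the following (the exact files vary by Poetry version, so treat this layout as approximate):

```
your-project
├── pyproject.toml
├── README.rst
├── your_project
│   └── __init__.py
└── tests
    ├── __init__.py
    └── test_your_project.py
```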
Add the Scrapy dependency:

```sh
cd your-project
poetry add scrapy
```
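`poetry add scrapy` also records the dependency in `pyproject.toml`. The entry looks roughly like this; the caret version shown here is illustrative and will match whatever Poetry actually resolves:

```toml
[tool.poetry.dependencies]
python = "^3.7"
scrapy = "^2.5"
```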
Install virtualenv:

```sh
pip install virtualenv
```
Create the virtual environment:

```sh
virtualenv --python='/usr/local/bin/python3' venv
```
Activate the Python 3 environment:

```sh
source venv/bin/activate
```
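As a quick sanity check (not part of the original steps), confirm the activated interpreter is the one inside `venv`:

```sh
which python   # should point into venv/bin
python --version
```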
Create the file `crawler_item.py`, a spider that scrapes a single petition page:
```python
import scrapy


class BrickSetSpider(scrapy.Spider):
    name = 'jusnavigandi'
    start_urls = ['https://jus.com.br/peticoes/70262/modelo-de-acao-de-obrigacao-de-fazer-com-dano-moral-dano-material-e-pedido-de-tutela-antecipada']
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    def parse(self, response):
        # Extract the publication fields from the page.
        SET_SELECTOR = 'div#publication'
        for brickset in response.css(SET_SELECTOR):
            TITLE_SELECTOR = 'h1.titulo.entry-title::text'
            AUTHORS_SELECTOR = 'p.autor a::text'
            CATEGORIES_SELECTOR = '.catline ul li a::text'
            ABSTRACT_SELECTOR = 'div.abstract p::text'
            CONTENT_SELECTOR = 'div.jtext ::text'
            yield {
                'title': self.clear_content(brickset.css(TITLE_SELECTOR).extract_first()),
                'authors': brickset.css(AUTHORS_SELECTOR).extract(),
                'abstract': self.clear_content(brickset.css(ABSTRACT_SELECTOR).extract_first()),
                'categories': brickset.css(CATEGORIES_SELECTOR).extract(),
                'content': self.clear_content(' '.join(brickset.css(CONTENT_SELECTOR).extract())),
            }

    def clear_content(self, content):
        # Strip line breaks, tabs, and non-breaking spaces from extracted text.
        if content:
            return content.replace('\n', '').replace('\r', '').replace('\t', '').replace('\xa0', ' ').strip()
        return content
```
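For clarity, this is what `clear_content` does to a raw extracted string; the sample text below is made up for illustration:

```python
text = '\n\tModelo de petição\xa0inicial\r'
cleaned = text.replace('\n', '').replace('\r', '').replace('\t', '').replace('\xa0', ' ').strip()
print(cleaned)  # -> 'Modelo de petição inicial'
```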
Run the crawler and export the items to JSON:

```sh
scrapy runspider crawler_item.py -o result.json
```
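With `-o result.json`, Scrapy writes the scraped items as a JSON array, so you can inspect the output with a few lines of Python (the field names come from the spider above):

```python
import json

with open('result.json', encoding='utf-8') as f:
    items = json.load(f)

print(len(items))              # number of scraped publications
print(items[0]['title'])       # title of the first item
print(items[0]['categories'])  # its category list
```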
Create the file `crawler_page.py`, which walks the paginated listing, follows every petition link, and scrapes each petition page:
```python
import scrapy
from scrapy.http.request import Request


class BrickSetSpider(scrapy.Spider):
    name = 'jusnavigandi'
    # start_url = 'https://jus.com.br/peticoes'
    # start_urls = get_urls(['https://jus.com.br/peticoes'])
    custom_settings = {
        'FEED_EXPORT_ENCODING': 'utf-8',
        'DOWNLOAD_DELAY': 1,  # be polite: wait 1 second between requests
    }

    def start_requests(self):
        urls = self.get_urls(['https://jus.com.br/peticoes'])
        for url in urls:
            yield Request(url, self.parse)

    def get_urls(self, urls):
        # The listing is paginated; pages 2 through 68 live under /peticoes/p/<n>.
        for x in range(2, 69):
            urls.append('https://jus.com.br/peticoes/p/' + str(x))
        return urls

    def parse(self, response):
        # Follow each petition link found on a listing page.
        for next_page in response.css('div.texts > ul > li.item > article > h4 a::attr(href)').extract():
            if next_page is not None:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse, dont_filter=True)

        # On a petition page, extract the publication fields.
        SET_SELECTOR = 'div#publication'
        for brickset in response.css(SET_SELECTOR):
            TITLE_SELECTOR = 'h1.titulo.entry-title::text'
            AUTHORS_SELECTOR = 'p.autor a::text'
            CATEGORIES_SELECTOR = '.catline ul li a::text'
            ABSTRACT_SELECTOR = 'div.abstract p::text'
            CONTENT_SELECTOR = 'div.jtext ::text'
            yield {
                'title': self.clear_content(brickset.css(TITLE_SELECTOR).extract_first()),
                'authors': brickset.css(AUTHORS_SELECTOR).extract(),
                'abstract': self.clear_content(brickset.css(ABSTRACT_SELECTOR).extract_first()),
                'categories': brickset.css(CATEGORIES_SELECTOR).extract(),
                'content': self.clear_content(' '.join(brickset.css(CONTENT_SELECTOR).extract())),
            }

    def clear_content(self, content):
        # Strip line breaks, tabs, and non-breaking spaces from extracted text.
        if content:
            return content.replace('\n', '').replace('\r', '').replace('\t', '').replace('\xa0', ' ').strip()
        return content
```
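For reference, `get_urls` simply expands the listing into one URL per page; here is a standalone sketch of the same expansion:

```python
urls = ['https://jus.com.br/peticoes']
urls += ['https://jus.com.br/peticoes/p/' + str(x) for x in range(2, 69)]
print(urls[:3])
# ['https://jus.com.br/peticoes', 'https://jus.com.br/peticoes/p/2', 'https://jus.com.br/peticoes/p/3']
```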
Run the crawler:

```sh
scrapy runspider crawler_page.py -o result.json
```
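Because the spider passes `dont_filter=True`, Scrapy's duplicate-request filter is disabled and the same petition can be scraped more than once. A quick post-processing pass (illustrative, not part of the spider) deduplicates the output by title:

```python
import json

with open('result.json', encoding='utf-8') as f:
    items = json.load(f)

# Keep one item per title; items without a title are dropped here.
unique = {item['title']: item for item in items if item.get('title')}
print(len(items), 'scraped,', len(unique), 'unique')
```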
- [Crawler] https://www.digitalocean.com/community/tutorials/como-fazer-crawling-em-uma-pagina-web-com-scrapy-e-python-3-pt
- [Crawler] https://docs.scrapy.org/en/latest/topics/spiders.html#crawlspider-example
- [Crawler] http://pythonclub.com.br/material-do-tutorial-web-scraping-na-nuvem.html
- [Crawler] https://mikulskibartosz.name/how-to-use-scrapy-to-follow-links-on-the-scraped-pages-774fb2146047