Created
May 17, 2018 13:20
-
-
Save JBPressac/3efe40df51ddb1e05427dee3cbdc791f to your computer and use it in GitHub Desktop.
Spider d'extraction des premières pages du forum Python pour le tutoriel Scrapy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf8 | |
import scrapy | |
class DeveloppezSpyder(scrapy.Spider):
    """Scrape thread listings from the developpez.net general Python forum.

    The crawl starts at the forum's first listing page and follows the
    ``rel=next`` pagination link until ``max_pages`` listing pages have
    been requested.

    Each yielded item is a dict with the keys:

    * ``'titre'``  -- text of the thread title link (or ``None``).
    * ``'auteur'`` -- text of the ``a.username`` link (or ``None``).
    * ``'date'``   -- list of the date tokens found in the author line
      (``dd/mm/yyyy``, ``Hier`` or ``Aujourd'hui``).
    """

    name = 'forum-python-developpez'
    start_urls = ['https://www.developpez.net/forums/f1155/autres-langages/python-zope/general-python/']

    # Number of listing pages requested so far; the start URL counts as 1.
    # ``self.counter += 1`` rebinds it on the instance, so the class-level
    # default itself is never modified.
    counter = 1

    # Maximum number of listing pages to crawl. May be overridden in a
    # subclass or from the command line (``-a max_pages=5``); spider
    # arguments arrive as strings, hence the ``int()`` in ``parse``.
    max_pages = 2

    def parse(self, response):
        """Extract one item per thread, then follow the next-page link.

        Args:
            response: The Scrapy response for a forum listing page.

        Yields:
            One dict per element matching ``#threads .inner``, followed by
            at most one ``scrapy.Request`` for the next listing page while
            the page limit has not been reached.
        """
        for thread in response.css('#threads .inner'):
            yield {
                'titre': thread.css('.threadtitle a::text').extract_first(),
                'auteur': thread.css('a.username::text').extract_first(),
                'date': thread.css('.author span::text').re(r'\d{2}/\d{2}/\d{4}|Hier|Aujourd\'hui'),
            }

        next_page = response.css('[rel=next]::attr(href)').extract_first()
        if next_page is not None and self.counter < int(self.max_pages):
            self.counter += 1
            # The href may be relative; resolve it against the current page.
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment