Skip to content

Instantly share code, notes, and snippets.

@alexteusz
Last active April 9, 2019 08:29
Show Gist options
  • Save alexteusz/dda6b57c71dbd2303a410763a1546906 to your computer and use it in GitHub Desktop.
Python Scrapy Webcrawler
import scrapy
class NewsSpider(scrapy.Spider):
    """Crawl a set of German news sites and yield metadata for German pages.

    Starting from the configured front pages, the spider follows every
    on-page link recursively. For each page whose ``<html lang>`` attribute
    is exactly ``"de"``, it yields one item with URL, meta-tag fields,
    title, a leading <strong> abstract, and the page's paragraph text.
    """

    name = "news_de_DE"
    start_urls = [
        'http://www.spiegel.de/',
        'https://www.tagesschau.de/',
        'https://www1.wdr.de/',
    ]

    def parse(self, response):
        """Extract metadata from a German-language page and follow its links.

        Yields
        ------
        dict
            One item per page declared ``lang="de"``.
        scrapy.Request
            One follow-up request per ``<a href>`` on the page (Scrapy's
            default dupefilter prevents revisiting the same URL).
        """
        for data in response.css('html'):
            # Guard clause: skip any page not explicitly declared German.
            if data.css('html::attr(lang)').get() != "de":
                continue
            yield {
                'url': response.url,
                'meta': {
                    'language': data.css('html::attr(lang)').get(),
                    # The leading letter is dropped from each meta name so the
                    # substring match catches both lower- and capitalized
                    # variants ("keywords" / "Keywords", etc.).
                    'keywords': data.css('meta[name*=eywords]::attr(content)').get(),
                    'author': data.css("meta[name*=uthor]::attr(content)").get(),
                    'publisher': data.css("meta[name*=ublisher]::attr(content)").get(),
                    'desc': data.css("meta[name*=escription]::attr(content)").get(),
                    'date': data.css("meta[name*=ate]::attr(content)").get(),
                },
                'title': data.css('title::text').get(),
                'abstract': data.css('strong::text').get(),
                # BUG FIX: the original referenced an undefined name
                # `preprocessedText`, so every yield raised NameError.
                # Extract the page's visible paragraph text instead.
                'text': ' '.join(data.css('p::text').getall()),
            }
        # Recurse into every link on the page.
        for a in response.css('a::attr(href)'):
            yield response.follow(a, callback=self.parse)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment