Skip to content

Instantly share code, notes, and snippets.

@santhoshtr
Created October 26, 2019 07:27
Show Gist options
  • Save santhoshtr/15407f8ab5f3c777deab06f5be94fc95 to your computer and use it in GitHub Desktop.
Save santhoshtr/15407f8ab5f3c777deab06f5be94fc95 to your computer and use it in GitHub Desktop.
Crawl the Kerala PRD website and download all content to JSON
import scrapy
from scrapy.http import Request
class HeadlineCatcher(scrapy.Spider):
    """Spider that walks the Kerala PRD press-release listing.

    Starting from the listing page, it follows every article link found
    under ``.post-title`` and every pagination link found under
    ``.pager__item``, and yields one ``{'title', 'content'}`` item per
    article page.
    """

    name = "headlinecatcher"
    start_urls = ["http://www.prd.kerala.gov.in/pressrelease"]
    custom_settings = {
        # Export the feed as UTF-8 so non-ASCII text is written as-is
        # instead of as \uXXXX escape sequences in the JSON output.
        'FEED_EXPORT_ENCODING': 'utf-8',
    }

    def parse(self, response):
        """Handle a listing page.

        Yields a request for each article linked from the page (handled by
        ``parse_news_page``) and a request for each pager link (handled by
        this method again).
        """
        self.logger.info('Visiting %s', response.url)

        # Article links on the current listing page.
        for news_link in response.css(".post-title a::attr(href)"):
            yield response.follow(news_link, callback=self.parse_news_page)

        # Pagination links. The same pager URLs show up on many pages;
        # presumably Scrapy's default duplicate filter drops the repeats
        # (not configured here -- verify if the dupefilter is overridden).
        for href in response.css('.pager__item a::attr(href)'):
            yield response.follow(href, callback=self.parse)

    def parse_news_page(self, response):
        """Handle a single article page.

        Yields a dict with:
            title: text of the first ``h1 span`` node, or ``None`` if absent.
            content: text nodes of every ``<p>`` under ``.node__content``
                joined with newlines, or ``None`` if there are none.
        """
        self.logger.info('Visiting %s', response.url)
        title = response.css('h1 span::text').get()

        # .get() would return only the first text node and silently drop
        # the rest of the article, so collect every match instead.
        paragraphs = response.css('.node__content p::text').getall()
        content = '\n'.join(paragraphs) if paragraphs else None

        yield {'title': title, 'content': content}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment