# santhoshtr/KeralaPRDHeadlinesCrawler.py

## KeralaPRDHeadlinesCrawler.py
import scrapy
from scrapy.http import Request


class HeadlineCatcher(scrapy.Spider):
    """Spider that crawls the Kerala PRD press-release listing.

    Starting from the press-release index, it follows every link matched by
    ``.post-title a`` to an article page and every link matched by
    ``.pager__item a`` back into the listing parser. Each article page
    produces one item with ``title`` and ``content`` keys.
    """

    name = "headlinecatcher"
    start_urls = ["http://www.prd.kerala.gov.in/pressrelease"]
    # Feed exports are written as UTF-8.
    custom_settings = {'FEED_EXPORT_ENCODING': 'utf-8'}

    def parse(self, response):
        """Yield requests for the articles and pager links on a listing page."""
        self.logger.info('Visiting %s', response.url)

        # Article links are handed to the article parser.
        article_links = response.css(".post-title a::attr(href)")
        yield from (
            response.follow(link, callback=self.parse_news_page)
            for link in article_links
        )

        # Pager links are fed back into this same method.
        pager_links = response.css('.pager__item a::attr(href)')
        yield from (
            response.follow(link, callback=self.parse)
            for link in pager_links
        )

    def parse_news_page(self, response):
        """Yield a single ``{'title', 'content'}`` item for an article page.

        Both values are the first match of their selector, or ``None`` when
        the selector matches nothing.
        """
        self.logger.info('Visiting %s', response.url)
        headline = response.css('h1 span::text').get()
        first_paragraph = response.css('.node__content p::text').get()
        item = {'title': headline, 'content': first_paragraph}
        yield item
	import scrapy
	from scrapy.http import Request


	class HeadlineCatcher(scrapy.Spider):
		"""Spider that crawls the Kerala PRD press-release listing.

		Follows links matched by ``.post-title a`` to article pages and links
		matched by ``.pager__item a`` back into the listing parser. Each
		article page produces one item with ``title`` and ``content`` keys.
		"""

		name = "headlinecatcher"
		start_urls = ["http://www.prd.kerala.gov.in/pressrelease"]
		# Feed exports are written as UTF-8.
		custom_settings = {
			'FEED_EXPORT_ENCODING': 'utf-8',
		}

		def parse(self, response):
			"""Yield requests for the articles and pager links on a listing page."""
			self.logger.info('Visiting %s', response.url)
			# Article links are handed to the article parser.
			for news_link in response.css(".post-title a::attr(href)"):
				yield response.follow(news_link, callback=self.parse_news_page)
			# Pager links are fed back into this same method.
			for href in response.css('.pager__item a::attr(href)'):
				yield response.follow(href, callback=self.parse)

		def parse_news_page(self, response):
			"""Yield a single ``{'title', 'content'}`` item for an article page.

			Both values are the first match of their selector, or ``None``
			when the selector matches nothing.
			"""
			self.logger.info('Visiting %s', response.url)
			title = response.css('h1 span::text').get()
			content = response.css('.node__content p::text').get()
			yield {'title': title, 'content': content}