
@raphapassini
Created May 17, 2016 18:53
Extract news from BBC given a set of keywords
#!/usr/bin/python
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor

from keywords import ocean_keywords as keywords


class BBCSpider(scrapy.Spider):
    name = 'bbcnews'
    allowed_domains = ['bbc.co.uk']
    start_urls = [
        "http://www.bbc.co.uk",
    ]

    def parse(self, response):
        # BBC story URLs end in a numeric id, e.g. .../world-36310000
        news_le = LinkExtractor(allow=r'-\d+$')
        links = news_le.extract_links(response)
        for link in links:
            yield Request(link.url, callback=self.parse_story)

    def parse_story(self, response):
        # the <article> element seems to always hold the story content
        text = ''.join(response.css('article p::text').extract())
        # build a set of lowercase words found inside the text
        all_words = set(t.lower() for t in text.split(' '))
        tags = all_words.intersection(keywords)
        return {
            'url': response.url,
            'headline': response.xpath("//title/text()").extract_first(),
            'body': text,
            'tags': list(tags),
        }
@raphapassini (Author) commented:
Instead of this in your original code:

    container = response.css("div.container")
    urls = container.css("a ::attr(href)").extract()

    # prefix the site's own links with the domain
    for url in urls:
        if url[0] == '/':
            next_url = start_urls[0] + url
        else:
            next_url = url

I'm using LinkExtractors (http://doc.scrapy.org/en/latest/topics/link-extractors.html).
If you ever need to join URLs, use response.urljoin('path/to/something') instead.
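Scrapy's response.urljoin is essentially the standard library's urljoin applied with response.url as the base (HtmlResponse also honours a page's &lt;base&gt; tag), so the joining behaviour can be sketched with the stdlib alone — the base URL here is a hypothetical response.url:

```python
from urllib.parse import urljoin

# hypothetical response.url of a crawled page
base = "http://www.bbc.co.uk/news"

# relative paths are resolved against the base...
print(urljoin(base, "/sport/football-36310000"))
# ...while absolute URLs pass through unchanged
print(urljoin(base, "http://example.com/story"))
```

This is why urljoin is safer than prepending start_urls[0] by hand: it handles absolute links, root-relative links, and relative links uniformly.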

Instead of this in your original code:

    tags = []
    for keyword in keywords:
        try:
            if keyword in text:
                tags.append(keyword)
        except:
            if str(keyword) in text:
                tags.append(keyword)

I'm using Python's set data structure. Set membership tests and intersections are way faster than scanning the text once per keyword, and that matters in a crawler project :)
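A minimal sketch of the set-based tagging — the keyword set and text here are made up for illustration:

```python
keywords = {"ocean", "coral", "reef"}  # hypothetical keyword set
text = "Scientists warn the Ocean and its coral are warming fast"

# lowercase every word once, then intersect; set lookups are O(1)
# on average, so this beats scanning the text once per keyword
all_words = set(word.lower() for word in text.split(' '))
tags = all_words.intersection(keywords)
print(sorted(tags))  # ['coral', 'ocean']
```

One pass over the words plus one intersection replaces the keyword-by-keyword substring search of the original loop.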

I'm sure you'll have to improve this line a lot: text = ''.join(response.css('article p::text').extract()) — both to fulfil your requirements and to make sure all of the story content actually gets extracted.

Also, instead of using the print function you should use self.logger.info/debug/error/exception. Please refer to the Python logging package; Scrapy uses it under the hood - http://doc.scrapy.org/en/latest/topics/logging.html
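Scrapy names each spider's logger after the spider, so inside the spider you just call self.logger; outside Scrapy the same idea looks like this plain-logging sketch:

```python
import logging

# Scrapy's self.logger inside BBCSpider is logging.getLogger('bbcnews')
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('bbcnews')

logger.info('parsed %d links', 42)         # instead of print(...)
logger.debug('raw body length: %d', 1024)  # silenced unless DEBUG is on
try:
    1 / 0
except ZeroDivisionError:
    logger.exception('extraction failed')  # logs the message + traceback
```

Unlike print, log records carry a level and the spider's name, so you can filter them per spider with LOG_LEVEL or a logging config.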
