jorgeas80/buzzfeedNews_spider.py

## buzzfeedNews_spider.py
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from buzzLinks.items import BuzzlinksItem
from urlparse import urlparse

# Spider
class DmozSpider(CrawlSpider):
  """Crawl buzzfeed.com and record the outbound links found in article bodies.

  Pages selected by ``rules`` are passed to ``parse_item``, which returns one
  ``BuzzlinksItem`` per qualifying ``href`` on the page.
  """

  name = "buzzfeedNews"
  allowed_domains = ["buzzfeed.com"]
  start_urls = [
    "http://www.buzzfeed.com/"
  ]
  # Not referenced anywhere else in this class; kept for external readers.
  page_count = 0

  # Determines what pages the spider should crawl: links on buzzfeed.com are
  # extracted and each fetched page is passed to parse_item.
  # NOTE(review): no ``follow`` argument is given; Scrapy's Rule defaults it to
  # False when a callback is set, so links on parsed pages are not followed
  # further -- confirm this is intended.
  rules = (
    Rule(LxmlLinkExtractor(allow_domains=('buzzfeed.com',)), callback='parse_item'),
  )

  # Method to process each page
  def parse_item(self, response):
    """Build one ``BuzzlinksItem`` per outbound link on the crawled page.

    Increments the ``pages_crawled`` stat, then collects every ``href`` inside
    ``div#buzz_sub_buzz`` that is not in a share box, is not marked
    ``rel="nofollow"``, does not contain "buzzfeed"/"buzzfed", and is not the
    ``javascript:;`` placeholder.

    :param response: the page fetched by the crawler.
    :returns: list of ``BuzzlinksItem``; empty when no link qualifies.
    """
    self.crawler.stats.inc_value('pages_crawled')
    page_id = self.crawler.stats.get_value('pages_crawled')

    depth = response.meta["depth"]
    referring_url = response.request.headers.get('Referer', None)
    current_url = response.url
    # extract() returns a list of strings, so the title is stored as a list.
    title = response.xpath('//div[@id="buzz_header"]//h1/text()').extract()

    # The predicate on @href tests the attribute's own string value (".").
    # An attribute node has no text() children, so a text() test never
    # matches and would let every BuzzFeed link through.
    link_xpath = (
      '//div[@id="buzz_sub_buzz"]'
      '//div[not(contains(@class,"share-box"))]'
      '//a[not(@rel="nofollow")]'
      '/@href[not(contains(.,"buzzfeed") or contains(.,"buzzfed"))]'
    )

    items = []
    for link in response.xpath(link_xpath):
      href = link.extract()
      # Compare the value directly: wrapping it in str() raises
      # UnicodeEncodeError under Python 2 for hrefs with non-ASCII characters.
      if href == "javascript:;":
        continue

      # NOTE: a relative href has neither scheme nor netloc, so link_domain
      # becomes ':///' for such links.
      parsed_uri = urlparse(href)

      item = BuzzlinksItem()
      item["depth"] = depth
      item["current_url"] = current_url
      item["referring_url"] = referring_url
      item["link"] = href
      item["article_title"] = title
      item["link_domain"] = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
      item["page_id"] = page_id
      items.append(item)
    return items


## items.py
import scrapy
from scrapy.item import Item, Field

# Item
class BuzzlinksItem(scrapy.Item):
  """One outbound link found on a crawled page, plus where it was found."""

  # --- the link itself ---
  link = scrapy.Field()           # href value as extracted from the page
  link_domain = scrapy.Field()    # "<scheme>://<netloc>/" derived from the link

  # --- the page the link was found on ---
  article_title = scrapy.Field()  # extracted <h1> text of the article header
  current_url = scrapy.Field()    # URL of the crawled page
  referring_url = scrapy.Field()  # Referer header of the request, if any
  depth = scrapy.Field()          # "depth" value from the response meta
  page_id = scrapy.Field()        # 'pages_crawled' stat when the page was parsed

## pipelines.py
from firebase import firebase
import json

# Pipeline
class BuzzlinksPipeline(object):
  """Item pipeline that posts every scraped link record to Firebase."""

  # Firebase database the records are written to, and the node they go under.
  FIREBASE_URL = 'https://torid-fire-7900.firebaseio.com'
  FIREBASE_PATH = '/buzzfeedLinks'

  # Item fields stored as text; 'depth' is stored unchanged.
  TEXT_FIELDS = (
    'article_title',
    'link',
    'current_url',
    'referring_url',
    'link_domain',
  )

  # Firebase client; created on first use and then reused for every item
  # instead of being rebuilt per item.
  _client = None

  @staticmethod
  def _as_text(value):
    """Return ``str(value)``, falling back to UTF-8 bytes when that fails.

    Under Python 2, ``str()`` on a unicode string with non-ASCII characters
    raises UnicodeEncodeError; such values are encoded as UTF-8 instead so a
    single non-ASCII URL does not abort the item.
    """
    try:
      return str(value)
    except UnicodeEncodeError:
      return value.encode('utf-8')

  def process_item(self, item, spider):
    """Post the item's fields to Firebase and pass the item on.

    :param item: mapping with the keys in ``TEXT_FIELDS`` plus ``'depth'``.
    :param spider: the spider that produced the item (unused).
    :returns: the same ``item``, unchanged.
    :raises KeyError: if ``item`` lacks one of the expected keys.
    """
    record = dict(
      (field, self._as_text(item[field])) for field in self.TEXT_FIELDS
    )
    record['depth'] = item['depth']

    if self._client is None:
      self._client = firebase.FirebaseApplication(self.FIREBASE_URL, None)
    self._client.post(self.FIREBASE_PATH, record)
    return item

## settings.py
# Name of this Scrapy project's bot.
BOT_NAME = 'buzzLinks'

# Module that holds the project's spiders (also used for newly generated ones).
SPIDER_MODULES = ['buzzLinks.spiders']
NEWSPIDER_MODULE = 'buzzLinks.spiders'

# Item pipelines to run, keyed by class path; the value is the order number.
ITEM_PIPELINES = {
    'buzzLinks.pipelines.BuzzlinksPipeline': 1,
}

# Extensions to load, keyed by class path; all share the order number 500.
EXTENSIONS = {
    'scrapy.contrib.corestats.CoreStats': 500,   # crawl statistics
    'scrapy.webservice.WebService': 500,         # web service
    'scrapy.telnet.TelnetConsole': 500,          # telnet console
}
	import scrapy
	from scrapy.contrib.spiders import CrawlSpider, Rule
	from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
	from buzzLinks.items import BuzzlinksItem
	from urlparse import urlparse

	# Spider
	class DmozSpider(CrawlSpider):
		"""Crawl buzzfeed.com and record the outbound links found in article bodies.

		Pages selected by ``rules`` are passed to ``parse_item``, which returns
		one ``BuzzlinksItem`` per qualifying ``href`` on the page.
		"""

		name = "buzzfeedNews"
		allowed_domains = ["buzzfeed.com"]
		start_urls = [
			"http://www.buzzfeed.com/"
		]
		# Not referenced anywhere else in this class; kept for external readers.
		page_count = 0

		# Determines what pages the spider should crawl: links on buzzfeed.com
		# are extracted and each fetched page is passed to parse_item.
		# NOTE(review): no ``follow`` argument is given; Scrapy's Rule defaults
		# it to False when a callback is set, so links on parsed pages are not
		# followed further -- confirm this is intended.
		rules = (
			Rule(LxmlLinkExtractor(allow_domains=('buzzfeed.com',)), callback='parse_item'),
		)

		# Method to process each page
		def parse_item(self, response):
			"""Build one ``BuzzlinksItem`` per outbound link on the crawled page.

			Increments the ``pages_crawled`` stat, then collects every ``href``
			inside ``div#buzz_sub_buzz`` that is not in a share box, is not
			marked ``rel="nofollow"``, does not contain "buzzfeed"/"buzzfed",
			and is not the ``javascript:;`` placeholder.

			:param response: the page fetched by the crawler.
			:returns: list of ``BuzzlinksItem``; empty when no link qualifies.
			"""
			self.crawler.stats.inc_value('pages_crawled')
			page_id = self.crawler.stats.get_value('pages_crawled')

			depth = response.meta["depth"]
			referring_url = response.request.headers.get('Referer', None)
			current_url = response.url
			# extract() returns a list of strings, so the title is stored as a list.
			title = response.xpath('//div[@id="buzz_header"]//h1/text()').extract()

			# The predicate on @href tests the attribute's own string value (".").
			# An attribute node has no text() children, so a text() test never
			# matches and would let every BuzzFeed link through.
			link_xpath = (
				'//div[@id="buzz_sub_buzz"]'
				'//div[not(contains(@class,"share-box"))]'
				'//a[not(@rel="nofollow")]'
				'/@href[not(contains(.,"buzzfeed") or contains(.,"buzzfed"))]'
			)

			items = []
			for link in response.xpath(link_xpath):
				href = link.extract()
				# Compare the value directly: wrapping it in str() raises
				# UnicodeEncodeError under Python 2 for non-ASCII hrefs.
				if href == "javascript:;":
					continue

				# NOTE: a relative href has neither scheme nor netloc, so
				# link_domain becomes ':///' for such links.
				parsed_uri = urlparse(href)

				item = BuzzlinksItem()
				item["depth"] = depth
				item["current_url"] = current_url
				item["referring_url"] = referring_url
				item["link"] = href
				item["article_title"] = title
				item["link_domain"] = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
				item["page_id"] = page_id
				items.append(item)
			return items
	import scrapy
	from scrapy.item import Item, Field

	# Item
	class BuzzlinksItem(scrapy.Item):
		"""One outbound link found on a crawled page, plus where it was found."""

		# --- the link itself ---
		link = scrapy.Field()           # href value as extracted from the page
		link_domain = scrapy.Field()    # "<scheme>://<netloc>/" derived from the link

		# --- the page the link was found on ---
		article_title = scrapy.Field()  # extracted <h1> text of the article header
		current_url = scrapy.Field()    # URL of the crawled page
		referring_url = scrapy.Field()  # Referer header of the request, if any
		depth = scrapy.Field()          # "depth" value from the response meta
		page_id = scrapy.Field()        # 'pages_crawled' stat when the page was parsed
	from firebase import firebase
	import json

	# Pipeline
	class BuzzlinksPipeline(object):
		"""Item pipeline that posts every scraped link record to Firebase."""

		# Firebase database the records are written to, and the node they go under.
		FIREBASE_URL = 'https://torid-fire-7900.firebaseio.com'
		FIREBASE_PATH = '/buzzfeedLinks'

		# Item fields stored as text; 'depth' is stored unchanged.
		TEXT_FIELDS = (
			'article_title',
			'link',
			'current_url',
			'referring_url',
			'link_domain',
		)

		# Firebase client; created on first use and then reused for every item
		# instead of being rebuilt per item.
		_client = None

		@staticmethod
		def _as_text(value):
			"""Return ``str(value)``, falling back to UTF-8 bytes when that fails.

			Under Python 2, ``str()`` on a unicode string with non-ASCII
			characters raises UnicodeEncodeError; such values are encoded as
			UTF-8 instead so a single non-ASCII URL does not abort the item.
			"""
			try:
				return str(value)
			except UnicodeEncodeError:
				return value.encode('utf-8')

		def process_item(self, item, spider):
			"""Post the item's fields to Firebase and pass the item on.

			:param item: mapping with the keys in ``TEXT_FIELDS`` plus ``'depth'``.
			:param spider: the spider that produced the item (unused).
			:returns: the same ``item``, unchanged.
			:raises KeyError: if ``item`` lacks one of the expected keys.
			"""
			record = dict(
				(field, self._as_text(item[field])) for field in self.TEXT_FIELDS
			)
			record['depth'] = item['depth']

			if self._client is None:
				self._client = firebase.FirebaseApplication(self.FIREBASE_URL, None)
			self._client.post(self.FIREBASE_PATH, record)
			return item
	# Name of this Scrapy project's bot.
	BOT_NAME = 'buzzLinks'

	# Module that holds the project's spiders (also used for newly generated ones).
	SPIDER_MODULES = ['buzzLinks.spiders']
	NEWSPIDER_MODULE = 'buzzLinks.spiders'

	# Item pipelines to run, keyed by class path; the value is the order number.
	ITEM_PIPELINES = {
		'buzzLinks.pipelines.BuzzlinksPipeline': 1,
	}

	# Extensions to load, keyed by class path; all share the order number 500.
	EXTENSIONS = {
		'scrapy.contrib.corestats.CoreStats': 500,   # crawl statistics
		'scrapy.webservice.WebService': 500,         # web service
		'scrapy.telnet.TelnetConsole': 500,          # telnet console
	}