@nikhilpi-zz
Created April 2, 2015 01:46
Scrapy Scraper for Buzzfeed
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from buzzLinks.items import BuzzlinksItem
from urlparse import urlparse


# Spider
class DmozSpider(CrawlSpider):
    name = "buzzfeedNews"
    allowed_domains = ["buzzfeed.com"]
    start_urls = [
        "http://www.buzzfeed.com/",
    ]
    page_count = 0

    # Determines what pages the spider should crawl
    rules = (
        Rule(LxmlLinkExtractor(allow_domains=('buzzfeed.com',)), callback='parse_item'),
    )

    # Method to process each page
    def parse_item(self, response):
        self.crawler.stats.inc_value('pages_crawled')
        items = []
        depth = response.meta["depth"]
        referring_url = response.request.headers.get('Referer', None)
        current_url = response.url
        title = response.xpath('//div[@id="buzz_header"]//h1/text()').extract()
        # Outbound links in the article body: skip share widgets, nofollow links,
        # and hrefs that point back to buzzfeed.com (or the "buzzfed" typo domain).
        for link in response.xpath('//div[@id="buzz_sub_buzz"]'
                                   '//div[not(contains(@class,"share-box"))]'
                                   '//a[not(@rel="nofollow")]'
                                   '/@href[not(contains(.,"buzzfeed") or contains(.,"buzzfed"))]'):
            l = link.extract()
            if l != "javascript:;":
                item = BuzzlinksItem()
                item["depth"] = depth
                item["current_url"] = current_url
                item["referring_url"] = referring_url
                item["link"] = l
                item["article_title"] = title
                parsed_uri = urlparse(l)
                domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                item["link_domain"] = domain
                item["page_id"] = self.crawler.stats.get_value('pages_crawled')
                items.append(item)
        return items
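A quick way to sanity-check the two XPath expressions before running a full crawl is to load a single article page into a Scrapy Selector. This is a sketch, not part of the original gist; article.html stands in for any locally saved BuzzFeed post:

from scrapy.selector import Selector

html = open('article.html').read()  # hypothetical local copy of a BuzzFeed article
sel = Selector(text=html)
print(sel.xpath('//div[@id="buzz_header"]//h1/text()').extract())
print(sel.xpath('//div[@id="buzz_sub_buzz"]//a[not(@rel="nofollow")]/@href').extract())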
import scrapy


# Item
class BuzzlinksItem(scrapy.Item):
    article_title = scrapy.Field()
    link = scrapy.Field()
    depth = scrapy.Field()
    current_url = scrapy.Field()
    referring_url = scrapy.Field()
    link_domain = scrapy.Field()
    page_id = scrapy.Field()
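For reference, a declared Item behaves like a dict keyed by the fields above; the URL here is just a placeholder:

item = BuzzlinksItem()
item['link'] = 'http://example.com/'
item['depth'] = 1
print(dict(item))  # -> {'depth': 1, 'link': 'http://example.com/'} (key order may vary)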
from firebase import firebase


# Pipeline
class BuzzlinksPipeline(object):
    def process_item(self, item, spider):
        # Flatten the item into plain strings before posting to Firebase.
        d = {}
        d['article_title'] = str(item['article_title'])
        d['link'] = str(item['link'])
        d['current_url'] = str(item['current_url'])
        d['referring_url'] = str(item['referring_url'])
        d['link_domain'] = str(item['link_domain'])
        d['depth'] = item['depth']
        fb = firebase.FirebaseApplication('https://torid-fire-7900.firebaseio.com', None)
        fb.post('/buzzfeedLinks', d)
        return item
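As written, process_item opens a new Firebase connection for every item. A common variation, shown here as a sketch and not part of the original gist, is to create the connection once per crawl in the pipeline's open_spider hook and post each item's fields as-is:

from firebase import firebase


class BuzzlinksFirebasePipeline(object):
    def open_spider(self, spider):
        # One connection reused for the whole crawl.
        self.fb = firebase.FirebaseApplication('https://torid-fire-7900.firebaseio.com', None)

    def process_item(self, item, spider):
        # Post the raw field values; they are serialized to JSON on the way out.
        self.fb.post('/buzzfeedLinks', dict(item))
        return item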
# Settings
BOT_NAME = 'buzzLinks'

SPIDER_MODULES = ['buzzLinks.spiders']
NEWSPIDER_MODULE = 'buzzLinks.spiders'

ITEM_PIPELINES = {'buzzLinks.pipelines.BuzzlinksPipeline': 1}

EXTENSIONS = {
    'scrapy.contrib.corestats.CoreStats': 500,
    'scrapy.webservice.WebService': 500,
    'scrapy.telnet.TelnetConsole': 500,
}
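With the project laid out this way, the crawl would normally be started from the project directory with Scrapy's standard command; the optional setting below is one way to keep a test run small. Both lines are usage notes, not part of the original gist:

scrapy crawl buzzfeedNews

# optional, in settings.py: stop a test crawl after roughly 100 pages
CLOSESPIDER_PAGECOUNT = 100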