Skip to content

Instantly share code, notes, and snippets.

@jorgeas80
Forked from nikhilpi-zz/buzzfeedNews_spider.py
Created December 30, 2016 18:41
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jorgeas80/f97986a111667e539dedf703405033ac to your computer and use it in GitHub Desktop.
Save jorgeas80/f97986a111667e539dedf703405033ac to your computer and use it in GitHub Desktop.
Scrapy Scraper for Buzzfeed
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
from buzzLinks.items import BuzzlinksItem
from urlparse import urlparse
# Spider
class DmozSpider(CrawlSpider):
    """Crawl buzzfeed.com and record the outbound links found in each article.

    Every page handled by ``parse_item`` yields one ``BuzzlinksItem`` per
    qualifying ``<a href>`` inside the article body.
    """

    name = "buzzfeedNews"
    allowed_domains = ["buzzfeed.com"]
    start_urls = [
        "http://www.buzzfeed.com/"
    ]
    page_count = 0  # not referenced inside this class; kept for compatibility

    # Determines what pages the spider should crawl
    # NOTE(review): a Rule with a callback and no explicit ``follow=`` does not
    # follow links from the pages it handles -- confirm the shallow crawl is
    # intended.
    rules = (
        Rule(LxmlLinkExtractor(allow_domains=('buzzfeed.com',)), callback='parse_item'),
    )

    # Hrefs of article-body anchors that are outside any "share-box" div, are
    # not rel="nofollow", and do not mention buzzfeed/buzzfed.
    # The final predicate tests "." (the attribute's own value). The previous
    # form used text(), which selects nothing on an attribute node, so the
    # buzzfeed exclusion never took effect.
    _OUTBOUND_HREF_XPATH = (
        '//div[@id="buzz_sub_buzz"]'
        '//div[not(contains(@class,"share-box"))]'
        '//a[not(@rel="nofollow")]'
        '/@href[not(contains(.,"buzzfeed") or contains(.,"buzzfed"))]'
    )

    # Method to process each page
    def parse_item(self, response):
        """Build one item per outbound link found on ``response``.

        Args:
            response: the Scrapy response for a crawled page.

        Returns:
            A list of ``BuzzlinksItem`` objects (possibly empty).
        """
        stats = self.crawler.stats
        stats.inc_value('pages_crawled')
        # Read the counter once: every link on this page shares the same id.
        page_id = stats.get_value('pages_crawled')

        depth = response.meta["depth"]
        referring_url = response.request.headers.get('Referer', None)
        current_url = response.url
        title = response.xpath('//div[@id="buzz_header"]//h1/text()').extract()

        items = []
        for href in response.xpath(self._OUTBOUND_HREF_XPATH).extract():
            # Compare the text directly; wrapping it in str() fails on
            # non-ASCII hrefs under Python 2.
            if href == "javascript:;":
                continue

            parsed_uri = urlparse(href)
            item = BuzzlinksItem()
            item["depth"] = depth
            item["current_url"] = current_url
            item["referring_url"] = referring_url
            item["link"] = href
            item["article_title"] = title
            item["link_domain"] = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            item["page_id"] = page_id
            items.append(item)
        return items
import scrapy
from scrapy.item import Item, Field
# Item
class BuzzlinksItem(scrapy.Item):
    """A single link found on a crawled page, plus details of that page."""

    # Title text extracted from the page's header (stored as extracted).
    article_title = scrapy.Field()

    # The href that was found, and the scheme://host/ prefix derived from it.
    link = scrapy.Field()

    # Crawl depth taken from the response metadata.
    depth = scrapy.Field()

    # URL of the page containing the link, and the Referer it was reached from.
    current_url = scrapy.Field()
    referring_url = scrapy.Field()

    link_domain = scrapy.Field()

    # Value of the 'pages_crawled' stat when the page was parsed.
    page_id = scrapy.Field()
from firebase import firebase
import json
# Pipeline
class BuzzlinksPipeline(object):
    """Item pipeline that posts every scraped link record to Firebase."""

    FIREBASE_URL = 'https://torid-fire-7900.firebaseio.com'
    FIREBASE_PATH = '/buzzfeedLinks'

    # Item fields that are stored in their string form.
    _TEXT_FIELDS = (
        'article_title',
        'link',
        'current_url',
        'referring_url',
        'link_domain',
    )

    # Created on first use and then shared by every item, instead of building
    # a new client per item.
    _client = None

    def _get_client(self):
        """Return the Firebase client, creating it on first use."""
        if self._client is None:
            self._client = firebase.FirebaseApplication(self.FIREBASE_URL, None)
        return self._client

    @staticmethod
    def _to_text(value):
        """Return ``str(value)``, or ``value`` itself if it cannot be encoded.

        Under Python 2, ``str()`` on text containing non-ASCII characters
        raises ``UnicodeEncodeError``; such a value is already text and is
        passed through unchanged.
        """
        try:
            return str(value)
        except UnicodeEncodeError:
            return value

    def process_item(self, item, spider):
        """Post the item's fields to Firebase and pass the item along.

        Args:
            item: mapping with the keys listed in ``_TEXT_FIELDS`` plus
                ``'depth'``.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged.
        """
        # NOTE(review): 'page_id' is not part of the posted record -- confirm
        # that omission is intended.
        payload = {}
        for field in self._TEXT_FIELDS:
            payload[field] = self._to_text(item[field])
        payload['depth'] = item['depth']

        self._get_client().post(self.FIREBASE_PATH, payload)
        return item
# Scrapy settings for the buzzLinks project.
BOT_NAME = 'buzzLinks'

# Spider package; the same module is used for both settings.
NEWSPIDER_MODULE = 'buzzLinks.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Pipelines applied to every scraped item, keyed by import path.
ITEM_PIPELINES = {
    'buzzLinks.pipelines.BuzzlinksPipeline': 1,
}

# Extensions to enable; all three share the same order value.
EXTENSIONS = dict.fromkeys(
    (
        'scrapy.contrib.corestats.CoreStats',
        'scrapy.webservice.WebService',
        'scrapy.telnet.TelnetConsole',
    ),
    500,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment