Skip to content

Instantly share code, notes, and snippets.

@pije76
Created April 5, 2014 08:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pije76/9989045 to your computer and use it in GitHub Desktop.
Save pije76/9989045 to your computer and use it in GitHub Desktop.
import csv
from cStringIO import StringIO
from scrapy.http import Request
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy_tests.items import CrunchfeedItem
class Crunchfeed(BaseSpider):
    """Spider that reads target URLs from a local CSV file and scrapes page titles.

    The start URL points at a CSV file on disk. Each row is read through
    ``csv.DictReader`` and its ``url`` and ``blog`` columns are requested;
    every fetched page is turned into a ``CrunchfeedItem`` holding the
    text of its ``<title>`` element.
    """

    name = "crunchfeed"
    start_urls = ["file:///tmp/items.csv"]

    # CSV columns whose values are requested for every row.
    url_columns = ("url", "blog")

    def parse(self, response):
        """Yield one request per non-empty URL cell of the CSV response.

        Requests originating from the same row carry the same ``cookiejar``
        meta value (derived from the row's ``url`` column), which is the
        key Scrapy's cookies middleware uses to keep cookie sessions apart.

        Blank or missing cells are skipped: building a ``Request`` from an
        empty string raises ``ValueError``, which would terminate this
        generator and silently drop every remaining row.
        """
        for row in csv.DictReader(StringIO(response.body)):
            cookiejar = "%s" % hash(row["url"])
            for column in self.url_columns:
                target = row.get(column)
                if not target or not target.strip():
                    # e.g. a site without a blog; nothing to fetch.
                    continue
                yield Request(
                    target,
                    callback=self.parse_site,
                    meta={"cookiejar": cookiejar},
                )

    def parse_site(self, response):
        """Return a ``CrunchfeedItem`` whose ``title`` is the page's title text.

        All text nodes matched by ``//title/text()`` are concatenated, so a
        page without a ``<title>`` yields an empty string.
        """
        hxs = HtmlXPathSelector(response)
        title_parts = hxs.select("//title/text()").extract()
        return CrunchfeedItem(
            title=''.join(title_parts)
        )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment