Skip to content

Instantly share code, notes, and snippets.

@eliasdorneles
Created June 23, 2014 02:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save eliasdorneles/25267ccf50fba1462b0d to your computer and use it in GitHub Desktop.
Save eliasdorneles/25267ccf50fba1462b0d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import scrapy
import parslepy
from collections import defaultdict
import StringIO
from scrapy.contrib.loader.processor import TakeFirst
class AnyItem(scrapy.item.Item):
    """Schema-less item: any field name is accepted.

    ``fields`` is a ``defaultdict``, so looking up a name that has not been
    declared creates a fresh ``scrapy.item.Field`` for it on the spot.
    """

    # NOTE(review): relies on scrapy's Item honouring a class-level ``fields``
    # mapping -- confirm against the installed Scrapy version.
    fields = defaultdict(scrapy.item.Field)
class ParsleyItemsLoader(object):
    """Build scrapy items from the records a parselet extracts from a response.

    Parameters
    ----------
    parselet : object exposing ``parse(file_like)``
        Used to extract data from the response body.
    itemkey : key looked up in the extraction result
        The value stored under this key is iterated, one item per element.
    response : object exposing ``body``
        The response whose body is parsed.
    """

    def __init__(self, parselet, itemkey, response):
        """Store the parselet, the result key and the response for later use."""
        self.parselet = parselet
        self.itemkey = itemkey
        self.response = response

    def iteritems(self):
        """Yield one loaded ``AnyItem`` per record found under ``itemkey``.

        The raw extraction result is kept on ``self.extracted`` as a side
        effect. If ``itemkey`` is absent from that result (or maps to an
        empty value), nothing is yielded.
        """
        body_stream = StringIO.StringIO(self.response.body)
        self.extracted = self.parselet.parse(body_stream)

        # ``.get`` returns None when the key is missing; fall back to an empty
        # tuple so the loop is skipped instead of raising TypeError.
        records = self.extracted.get(self.itemkey) or ()

        for item_value in records:
            loader = scrapy.contrib.loader.ItemLoader(AnyItem())
            # Collapse each field's collected values to the first one.
            loader.default_output_processor = TakeFirst()
            # A field name of None makes the loader take the field names
            # from the keys of ``item_value`` itself.
            loader.add_value(None, item_value)
            yield loader.load_item()
class StackoverflowSpider(scrapy.spider.Spider):
    """Spider that extracts question titles from the Stack Overflow start page.

    Extraction is delegated to a parslepy ``Parselet`` built once in the
    constructor; ``parse`` wraps each extracted record in a scrapy item.
    """

    name = 'SO'
    start_urls = ['http://stackoverflow.com']

    def __init__(self, *args, **kwargs):
        """Initialise the base spider and compile the extraction rules.

        Positional and keyword arguments are forwarded untouched to the base
        ``Spider`` constructor, so spider arguments supplied by the framework
        are accepted instead of raising ``TypeError``.
        """
        super(StackoverflowSpider, self).__init__(*args, **kwargs)
        # The key names the output field ("questions") followed by the
        # selector in parentheses; the one-element list asks for one record
        # per matched element, each carrying a "title" taken from the XPath.
        rules = {
            'questions(#question-mini-list .question-summary)': [{
                'title': ".//h3/a/text()"
            }]
        }
        self.parselet = parslepy.Parselet(rules)

    def parse(self, response):
        """Return an iterator of items built from the "questions" records."""
        loader = ParsleyItemsLoader(self.parselet, "questions", response)
        return loader.iteritems()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment