Skip to content

Instantly share code, notes, and snippets.

@dangra
Created December 26, 2011 01:57
Show Gist options
  • Save dangra/1520383 to your computer and use it in GitHub Desktop.
Save dangra/1520383 to your computer and use it in GitHub Desktop.
Scrapy's pydaybot at ScraperWiki
import os, sys
import scraperwiki
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import MapCompose, Join
from w3lib.html import remove_tags, unquote_markup
from scrapy.item import Item, Field
class SpeakerItem(Item):
    """Scrapy item holding the data scraped for a single speaker.

    All three fields are plain ``Field()`` declarations with no
    per-field metadata or processors attached.
    """

    # Filled from the "speaker_name" div; also used as the unique key
    # when the script saves items to sqlite.
    name = Field()
    # Filled from the child elements of the "speaker_description" div.
    description = Field()
    # Filled from the ``src`` attribute of the speaker's <img> element.
    image = Field()
class SpeakerLoader(XPathItemLoader):
    """Item loader that populates ``SpeakerItem`` objects from XPath results.

    The same input and output processors apply to every field, because
    only the loader-wide defaults are set here.
    """

    # Item class instantiated when the loader is created without an item.
    default_item_class = SpeakerItem

    # Applied to each extracted value, in this order: w3lib's
    # ``remove_tags``, then ``unquote_markup``, then ``unicode.strip``
    # to trim surrounding whitespace (Python 2 ``unicode`` type).
    default_input_processor = MapCompose(
        remove_tags,
        unquote_markup,
        unicode.strip,
    )

    # Collapses the list of processed values for a field into one string
    # (``Join()`` is used with its default separator).
    default_output_processor = Join()
class SpeakersSpider(BaseSpider):
    """Spider that scrapes the speakers page of Python Day Uruguay 2011."""

    name = "speakers"
    allowed_domains = ["eventioz.com"]
    # Single entry point: the event's speakers listing.
    start_urls = (
        'https://eventioz.com/events/python-day-uruguay-2011/speakers',
    )

    def parse(self, response):
        """Yield one loaded speaker item per speaker container in *response*.

        Each ``<div class="speaker_container">`` is handed to its own
        ``SpeakerLoader``, so the field XPaths below are evaluated relative
        to that container.
        """
        # (field name, XPath relative to the speaker container), in the
        # order the values are added to the loader.
        field_xpaths = (
            ('name', './/div[@class="speaker_name"]/text()'),
            ('image', './/div[@class="speaker_image"]/img/@src'),
            ('description', './/div[@class="speaker_description"]/*'),
        )

        page = HtmlXPathSelector(response=response)
        containers = page.select('//div[@class="speaker_container"]')
        for container in containers:
            loader = SpeakerLoader(selector=container)
            for field_name, xpath in field_xpaths:
                loader.add_xpath(field_name, xpath)
            yield loader.load_item()
# NOTE(review): 'scraper' is presumably the module name ScraperWiki assigns
# to the script when it runs it -- confirm before changing to '__main__'.
if __name__ == 'scraper':
    from scrapy import signals
    from scrapy.cmdline import execute
    from scrapy.xlib.pydispatch import dispatcher

    def _item_scraped(item, spider):
        """Persist one scraped item to the ScraperWiki sqlite store.

        The item is converted to a plain dict and saved with ``name`` as
        the unique key; ``spider`` is accepted because the signal passes
        it, but it is not used.
        """
        record = dict(item)
        scraperwiki.sqlite.save(['name'], data=record, verbose=0)

    # Hook the saver up before the crawl starts so every scraped item is
    # stored. NOTE(review): pydispatch appears to hold receivers by weak
    # reference by default, so keep the handler as a named function at
    # this level rather than an inline lambda -- confirm.
    dispatcher.connect(_item_scraped, signal=signals.item_scraped)

    # Run the spider through Scrapy's `runspider` command.
    # NOTE(review): 'script.py' is presumably the file name this script is
    # stored under on ScraperWiki -- confirm.
    execute(['scrapy', 'runspider', 'script.py'])
@humoyun
Copy link

humoyun commented Mar 1, 2017

good job

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment