Skip to content

Instantly share code, notes, and snippets.

@dreyescat
Created October 23, 2014 21:36
Show Gist options
  • Save dreyescat/562520fceb1569d8389d to your computer and use it in GitHub Desktop.
Save dreyescat/562520fceb1569d8389d to your computer and use it in GitHub Desktop.
Sample Scrapy spider that crawls Hacker News using a CrawlSpider
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
class HackerNewsItem(scrapy.Item):
    """Scraped data for one page visited by ``HackerNewsSpider``.

    Both fields are filled in by ``HackerNewsSpider.parse_item``.
    """

    # Text of the links found under elements whose class contains "title".
    title = scrapy.Field()

    # Text taken from the first element whose class contains "comment".
    comment = scrapy.Field()
class HackerNewsSpider(CrawlSpider):
    """Crawl spider that starts at the Hacker News home page.

    Links extracted from crawled pages whose URL matches the ``item.*``
    pattern are requested, and each response is handed to ``parse_item``.
    """

    name = 'hackernews'
    allowed_domains = ['news.ycombinator.com']
    start_urls = ['https://news.ycombinator.com/']

    # A single rule: every link matching the pattern goes to ``parse_item``.
    rules = (
        Rule(
            LinkExtractor(allow=('item.*', )),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Build a ``HackerNewsItem`` from one followed page.

        Args:
            response: The response for a link matched by ``rules``.

        Returns:
            A ``HackerNewsItem`` whose ``title`` and ``comment`` fields hold
            whatever ``.extract()`` yields for the corresponding XPath query.
        """
        # Text of every <a> directly under an element whose class contains
        # "title".
        titles = response.xpath(
            '//*[contains(@class, "title")]/a/text()'
        ).extract()

        # Only the first element whose class contains "comment" is queried;
        # the text is read from its <font> children.
        first_comment = response.xpath(
            '(//*[contains(@class, "comment")])[1]/font/text()'
        ).extract()

        return HackerNewsItem(title=titles, comment=first_comment)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment