Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wenting-zhao/e3ac88519dd09d2f0183b90c9c8778a4 to your computer and use it in GitHub Desktop.
Save wenting-zhao/e3ac88519dd09d2f0183b90c9c8778a4 to your computer and use it in GitHub Desktop.
Scrapy spider crawling Stack Overflow
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item, Field
import urllib
class Question(Item):
tags = Field()
answers = Field()
votes = Field()
date = Field()
link = Field()
class ArgSpider(CrawlSpider):
"""
Scrapes first 15 stackoverflow.com questions containing "query" within a given "tag" and
displays links, number of votes etc in the terminal.
Usage:
~: scrapy crawl StackSpider -a tag=[your tag] -a query=[your query]
For example
~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"
"""
name = "StackSpider"
def __init__(self,tag=None,query=None,*args,**kwargs):
super(ArgSpider,self).__init__(*args,**kwargs)
self.start_urls = []
urlTemplate = "http://stackoverflow.com/search?q=%5B{tag}%5D{query}"
query = urllib.quote(query)
self.start_urls.append(urlTemplate.format(tag=tag,query=query))
def parse(self,response):
"""
@url http://stackoverflow.com/search?q=%5Bpython%5Dfiltering"
@returns items 15
@returns requests 0 1
@scrapes votes answers date link
"""
sel = Selector(response)
elems = sel.css('.question-summary')
results = []
for elem in elems:
item = Question()
item["tags"] = elem.css('.post-tag::text').extract()
item["votes"] = elem.css('.vote-count-post').xpath('.//strong/text()').extract()
item["answers"] = elem.css('.status').xpath('.//strong/text()').extract()
item["date"] = elem.css('.relativetime').xpath('.//@title').extract()
link = elem.css('.result-link').xpath('.//a/@href').extract()
item["link"] = link
results.append(item)
return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment