Skip to content

Instantly share code, notes, and snippets.

@pawelmhm
Last active January 23, 2024 15:03
Show Gist options
  • Star 7 You must be signed in to star a gist
  • Fork 9 You must be signed in to fork a gist
  • Save pawelmhm/8917867 to your computer and use it in GitHub Desktop.
Save pawelmhm/8917867 to your computer and use it in GitHub Desktop.
Scrapy spider crawling Stack Overflow
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item, Field
import urllib
class Question(Item):
tags = Field()
answers = Field()
votes = Field()
date = Field()
link = Field()
class ArgSpider(CrawlSpider):
"""
Scrapes first 15 stackoverflow.com questions containing "query" within a given "tag" and
displays links, number of votes etc in the terminal.
Usage:
~: scrapy crawl StackSpider -a tag=[your tag] -a query=[your query]
For example
~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"
"""
name = "StackSpider"
def __init__(self,tag=None,query=None,*args,**kwargs):
super(ArgSpider,self).__init__(*args,**kwargs)
self.start_urls = []
urlTemplate = "http://stackoverflow.com/search?q=%5B{tag}%5D{query}"
query = urllib.quote(query)
self.start_urls.append(urlTemplate.format(tag=tag,query=query))
def parse(self,response):
"""
@url http://stackoverflow.com/search?q=%5Bpython%5Dfiltering"
@returns items 15
@returns requests 0 1
@scrapes votes answers date link
"""
sel = Selector(response)
elems = sel.css('.question-summary')
results = []
for elem in elems:
item = Question()
item["tags"] = elem.css('.post-tag::text').extract()
item["votes"] = elem.css('.vote-count-post').xpath('.//strong/text()').extract()
item["answers"] = elem.css('.status').xpath('.//strong/text()').extract()
item["date"] = elem.css('.relativetime').xpath('.//@title').extract()
link = elem.css('.result-link').xpath('.//a/@href').extract()
item["link"] = link
results.append(item)
return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment