Skip to content

Instantly share code, notes, and snippets.

@jumbophp
Forked from pawelmhm/gist:8917867
Created November 7, 2019 15:57
Show Gist options
  • Save jumbophp/797e505e37bced49ddbc511d1dd331e9 to your computer and use it in GitHub Desktop.
Save jumbophp/797e505e37bced49ddbc511d1dd331e9 to your computer and use it in GitHub Desktop.
Scrapy spider crawling Stack Overflow
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from scrapy.item import Item, Field
import urllib
class Question(Item):
    """Item holding the data scraped from one Stack Overflow search result.

    ArgSpider.parse assigns every field the list returned by a selector's
    ``extract()`` call, so each value is a list of strings rather than a
    single scalar.
    """

    # Text of the result's ``.post-tag`` elements.
    tags = Field()
    # ``<strong>`` text found under the result's ``.status`` element.
    answers = Field()
    # ``<strong>`` text found under the result's ``.vote-count-post`` element.
    votes = Field()
    # ``title`` attribute of the result's ``.relativetime`` element.
    date = Field()
    # ``href`` of the anchor(s) under the result's ``.result-link`` element.
    link = Field()
class ArgSpider(CrawlSpider):
    """
    Scrapes the stackoverflow.com search results (the first 15 questions,
    per the contract on ``parse``) containing "query" within a given "tag"
    and displays links, number of votes etc in the terminal.

    Usage:
    ~: scrapy crawl StackSpider -a tag=[your tag] -a query=[your query]
    For example
    ~: scrapy crawl StackSpider -a tag=python -a query="crawling a website"

    Both arguments are optional. With only ``tag`` the search is restricted
    to that tag; with only ``query`` it is a plain text search; with neither,
    ``start_urls`` is left empty and nothing is requested.
    """

    name = "StackSpider"

    # Base of the search URL; the percent-encoded search terms are appended.
    _SEARCH_URL = "http://stackoverflow.com/search?q="

    def __init__(self, tag=None, query=None, *args, **kwargs):
        """Build ``start_urls`` from the ``-a tag=...`` / ``-a query=...`` options.

        Args:
            tag: Stack Overflow tag to restrict the search to, or None/"" for
                no tag restriction.
            query: Free-text search terms, or None/"" for none.
            *args, **kwargs: Passed through unchanged to ``CrawlSpider``.
        """
        super(ArgSpider, self).__init__(*args, **kwargs)

        # ``quote`` lives in ``urllib`` on Python 2 and ``urllib.parse`` on
        # Python 3; importing it here keeps the method working on both.
        try:
            from urllib.parse import quote
        except ImportError:
            from urllib import quote

        search_terms = ""
        if tag:
            # "%5B" / "%5D" are the percent-encoded "[" and "]" of the
            # "[tag]" search syntax. The tag itself is encoded too, so that
            # characters such as "+" or "#" (e.g. "c++", "c#") survive.
            search_terms += "%5B{0}%5D".format(quote(tag))
        if query:
            search_terms += quote(query)

        # Previously a missing query raised TypeError inside quote() and a
        # missing tag searched for the literal "[None]".
        self.start_urls = [self._SEARCH_URL + search_terms] if search_terms else []

    def parse(self, response):
        """
        Build one Question item per ``.question-summary`` element of the page.

        Every field is stored as the raw list returned by ``extract()``.
        Returns the list of items; no follow-up requests are produced here.

        NOTE(review): this class defines no ``rules``, and the Scrapy docs
        advise against overriding ``parse`` on a CrawlSpider because the
        rule-following logic is implemented there -- confirm that a plain
        Spider was not intended.

        @url http://stackoverflow.com/search?q=%5Bpython%5Dfiltering
        @returns items 15
        @returns requests 0 1
        @scrapes votes answers date link
        """
        sel = Selector(response)
        results = []
        for summary in sel.css('.question-summary'):
            item = Question()
            item["tags"] = summary.css('.post-tag::text').extract()
            item["votes"] = summary.css('.vote-count-post').xpath('.//strong/text()').extract()
            item["answers"] = summary.css('.status').xpath('.//strong/text()').extract()
            item["date"] = summary.css('.relativetime').xpath('.//@title').extract()
            item["link"] = summary.css('.result-link').xpath('.//a/@href').extract()
            results.append(item)
        return results
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment