Skip to content

Instantly share code, notes, and snippets.

@tlyng
Created January 21, 2013 09:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tlyng/4584834 to your computer and use it in GitHub Desktop.
Save tlyng/4584834 to your computer and use it in GitHub Desktop.
from urlparse import urlparse
from twisted.internet import reactor
from scrapy.http import Request, HtmlResponse
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy import log
class Page(Item):
    """Item emitted once per crawled response, summarizing what was fetched."""
    url = Field()         # absolute URL of the response
    title = Field()       # <title> text for HTML pages, empty string otherwise
    size = Field()        # response body length in bytes, as a string
    referer = Field()     # value of the request's Referer header, if any
    newcookies = Field()  # Set-Cookie values not seen on any earlier response
class FollowAllSpider(BaseSpider):
    """Crawl an entire site, following every extracted link.

    Yields one ``Page`` item per response (URL, title, size, referer and any
    newly-seen cookies) plus a follow-up ``Request`` for each link found.

    Keyword args:
        url / domain: starting URL; a scheme-less value gets ``https://``
            prepended. Defaults to ``http://www.digipub.no``.
    """

    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://www.digipub.no'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'https://%s/' % url
        self.url = url
        # BUG FIX: hostname.lstrip('www.') strips the *character set*
        # {'w', '.'}, so e.g. 'web.example.com' became 'eb.example.com'.
        # Strip the literal 'www.' prefix instead.
        hostname = urlparse(url).hostname
        if hostname.startswith('www.'):
            hostname = hostname[len('www.'):]
        self.allowed_domains = [hostname]
        self.link_extractor = SgmlLinkExtractor()
        # Cookies already reported on an earlier response (see _set_new_cookies).
        self.cookies_seen = set()

    def start_requests(self):
        """Seed the crawl with the single configured start URL."""
        return [Request(self.url, callback=self.parse), ]

    def parse(self, response):
        """Return the Page item for this response plus follow-up Requests."""
        page = self._get_item(response)
        r = [page, ]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        """Build a Page item describing *response* and log what was found."""
        item = Page(url=response.url, size=str(len(response.body)),
                    referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        if item['title']:
            log.msg("Found page: %s" % (item['title'],), level=log.INFO)
        else:
            log.msg("Found url: %s" % (item['url'],), level=log.INFO)
        return item

    def _extract_requests(self, response):
        """Return follow-up Requests for every link in an HTML response."""
        r = []
        # Non-HTML responses (images, PDFs, ...) have no links to extract.
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        """Set page['title'] from the first <title> text node (HTML only)."""
        if isinstance(response, HtmlResponse):
            title = HtmlXPathSelector(response).select("//title/text()").extract()
            if title:
                page['title'] = title[0]
            else:
                page['title'] = u""

    def _set_new_cookies(self, page, response):
        """Record Set-Cookie values never seen before on this crawl."""
        cookies = []
        # Keep only the 'name=value' part, dropping attributes after ';'.
        for cookie in [x.split(';', 1)[0] for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        # BUG FIX: was 'if cookie:' — testing the last loop variable instead
        # of the accumulated list, so a trailing empty header suppressed the
        # field and an empty list could still be assigned.
        if cookies:
            page['newcookies'] = cookies
def main():
    """Crawl both target sites concurrently, one Crawler each, then run the reactor.

    Blocks until the Twisted reactor stops.
    """
    # Identical setup for each site — loop instead of copy-pasting the
    # spider/crawler wiring twice.
    for domain in ('http://www.digipub.no', 'http://www.tmpnorge.no'):
        spider = FollowAllSpider(domain=domain)
        crawler = Crawler(Settings())
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
    log.start()
    reactor.run()  # blocks until all crawls finish / reactor is stopped


if __name__ == '__main__':
    # Entry-point guard so importing this module does not start a crawl.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment