Created
January 21, 2013 09:28
-
-
Save tlyng/4584834 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urlparse import urlparse | |
from twisted.internet import reactor | |
from scrapy.http import Request, HtmlResponse | |
from scrapy.crawler import Crawler | |
from scrapy.settings import Settings | |
from scrapy.spider import BaseSpider | |
from scrapy.selector import HtmlXPathSelector | |
from scrapy.item import Item, Field | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy import log | |
class Page(Item):
    """Scrapy item describing one crawled page."""

    url = Field()          # absolute URL of the response
    title = Field()        # <title> text for HTML responses (may be unset otherwise)
    size = Field()         # response body length, stored as a string
    referer = Field()      # value of the request's Referer header, if any
    newcookies = Field()   # Set-Cookie values not seen on earlier pages
class FollowAllSpider(BaseSpider):
    """Spider that crawls a single site exhaustively.

    Yields one ``Page`` item per response and schedules a request for every
    link extracted from HTML pages. The target site is given via the ``url``
    or ``domain`` keyword argument.
    """

    name = 'followall'

    def __init__(self, **kw):
        """Accept ``url`` or ``domain``; fall back to http://www.digipub.no.

        A bare domain (no scheme) is turned into an https:// URL.
        """
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://www.digipub.no'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'https://%s/' % url
        self.url = url
        # BUG FIX: the original used hostname.lstrip('www.'), but str.lstrip
        # strips any run of the characters 'w' and '.' (e.g. 'web.example.com'
        # -> 'eb.example.com'). Remove the literal 'www.' prefix instead.
        hostname = urlparse(url).hostname
        if hostname.startswith('www.'):
            hostname = hostname[len('www.'):]
        self.allowed_domains = [hostname]
        self.link_extractor = SgmlLinkExtractor()
        # Cookies already reported in an earlier Page item.
        self.cookies_seen = set()

    def start_requests(self):
        """Seed the crawl with the configured start URL."""
        return [Request(self.url, callback=self.parse), ]

    def parse(self, response):
        """Return the Page item for this response plus follow-up requests."""
        page = self._get_item(response)
        r = [page, ]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        """Build a Page item (url, size, referer, title, new cookies)."""
        item = Page(url=response.url, size=str(len(response.body)),
                    referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        # BUG FIX: item['title'] raises KeyError on non-HTML responses, where
        # _set_title never populates the field; use .get() instead.
        if item.get('title'):
            log.msg("Found page: %s" % (item['title'],), level=log.INFO)
        else:
            log.msg("Found url: %s" % (item['url'],), level=log.INFO)
        return item

    def _extract_requests(self, response):
        """Return a Request for every link found on an HTML response."""
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        """Set page['title'] from <title> for HTML responses ('' if absent)."""
        if isinstance(response, HtmlResponse):
            title = HtmlXPathSelector(response).select("//title/text()").extract()
            if title:
                page['title'] = title[0]
            else:
                page['title'] = u""

    def _set_new_cookies(self, page, response):
        """Record previously unseen Set-Cookie values in page['newcookies']."""
        cookies = []
        # Keep only the 'name=value' part of each Set-Cookie header.
        for cookie in [x.split(';', 1)[0] for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        # BUG FIX: the original tested the loop variable ('if cookie:') and
        # assigned the field inside the loop; set it once, after the loop, and
        # only when at least one new cookie was collected.
        if cookies:
            page['newcookies'] = cookies
def main():
    """Crawl two sites concurrently under one Twisted reactor.

    Blocks until the reactor is stopped.
    """
    # The original duplicated this setup verbatim for each site
    # (spider/spider2, crawler/crawler2); one Crawler drives exactly one
    # spider, so build each pair in a loop instead.
    for domain in ('http://www.digipub.no', 'http://www.tmpnorge.no'):
        spider = FollowAllSpider(domain=domain)
        crawler = Crawler(Settings())
        crawler.configure()
        crawler.crawl(spider)
        crawler.start()
    # Enable Scrapy logging before the reactor starts processing events.
    log.start()
    reactor.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment