@MostAwesomeDude
Created January 21, 2017 19:36
#!/usr/bin/env nix-shell
#! nix-shell -i bash -p python27Packages.scrapy jq
# Seed the URL list with the site root, then crawl the domain and append
# every discovered URL (one per line) for later load testing.
echo "https://$1/" > urls.txt
scrapy runspider siege.py -a domain="$1" -t json -o - | jq -r '.[] | .url' >> urls.txt
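
A likely invocation looks like the following. This is a sketch: the script name crawl.sh is assumed, as is the idea that urls.txt is meant to feed the siege load-testing tool (which is not part of the gist).

# Hypothetical usage: crawl example.com, then hammer the collected URLs with siege.
./crawl.sh example.com
siege -c 25 -t 1M -f urls.txt   # -f reads one URL per line from urls.txt

The spider that does the crawling lives in siege.py: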
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor


class SiegeSpider(scrapy.Spider):
    name = "siege"

    def __init__(self, domain, *args, **kwargs):
        super(SiegeSpider, self).__init__(*args, **kwargs)
        # Only follow HTTPS links that stay on the target domain.
        urlBase = 'https://%s/' % domain
        self.le = LinkExtractor(allow=[urlBase])
        self.allowed_domains = [domain]
        self.start_urls = (urlBase,)

    def parse(self, response):
        # Emit every extracted URL as an item and also queue it for crawling.
        links = self.le.extract_links(response)
        for link in links:
            url = link.url
            yield {"url": url}
            yield scrapy.Request(url, callback=self.parse)
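
Scrapy's built-in duplicate filter keeps the recursive parse() from re-visiting pages, but on a large site the crawl can still run for a long time. If that is a concern, a depth cap can be passed from the command line (DEPTH_LIMIT is a standard Scrapy setting; the value of 2 here is just an illustrative choice):

# Optional: stop after following links two levels deep from the start URL.
scrapy runspider siege.py -a domain=example.com -s DEPTH_LIMIT=2 -t json -o -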