# 404 link detector with scrapy

List all the broken links on your website

Requirements:

Python 3 and Scrapy (`pip install scrapy`)

Usage:

1. `scrapy runspider -o items.csv -a site="https://yoursite.org" 1spider.py`
2. `python3 2format_results.py`
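
The crawl writes one CSV row per broken link, with the columns the spider yields (`url`, `prev_page`, `prev_link_url`, `prev_link_text`, `status`). A hypothetical row, with purely illustrative values, would look like:

```csv
url,prev_page,prev_link_url,prev_link_text,status
https://yoursite.org/missing,https://yoursite.org/,/missing,Read more,404
```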
1spider.py:

```python
import scrapy


class BrokenLinksSpider(scrapy.Spider):
    name = 'brokenlink-checker'
    # Let 404/500 responses through to parse() instead of being filtered out
    handle_httpstatus_list = [404, 500]

    def __init__(self, site, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [site]
        self.DOMAIN = site.split('//')[1]

    def parse(self, response):
        if response.status in (404, 500):
            item = {}
            item['url'] = response.url
            # .get() because the start URL has no referring page in its meta
            item['prev_page'] = response.meta.get('prev_url')
            item['prev_link_url'] = response.meta.get('prev_href')
            item['prev_link_text'] = response.meta.get('prev_link_text')
            item['status'] = response.status
            yield item

        # Follow links only from pages on our own domain: external pages
        # get status-checked but are not crawled further
        if self.DOMAIN in response.url:
            for link in response.css('a'):
                href = link.xpath('@href').extract_first()
                text = link.xpath('text()').extract_first()
                if href:  # maybe should show an error if no href
                    yield response.follow(link, self.parse, meta={
                        'prev_link_text': text,
                        'prev_href': href,
                        'prev_url': response.url,
                    })
```
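
If you would rather launch the crawl from Python than from the `scrapy` CLI, here is a minimal sketch using Scrapy's `CrawlerProcess`. It assumes the spider class above is importable (`1spider.py` is not a valid module name, so you would need to rename the file, e.g. to `spider.py`):

```python
from scrapy.crawler import CrawlerProcess

# from spider import BrokenLinksSpider  # after renaming 1spider.py

process = CrawlerProcess(settings={
    'FEED_FORMAT': 'csv',    # same effect as the -o items.csv flag
    'FEED_URI': 'items.csv',
})
process.crawl(BrokenLinksSpider, site='https://yoursite.org')
process.start()  # blocks until the crawl finishes
```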
2format_results.py:

```python
import csv
import itertools

# itertools.groupby only groups *consecutive* rows, and the crawl emits
# broken links in discovery order, so sort by referring page first
with open('items.csv') as f:
    items = sorted(csv.DictReader(f), key=lambda item: item['prev_page'])

for page, links in itertools.groupby(items, lambda item: item['prev_page']):
    if page:
        print('PAGE:', page)
        for line in links:
            print('  LINK TEXT:', line['prev_link_text'])
            print('  LINK URL:', line['prev_link_url'])
            print()
        print()
```
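
An alternative sketch that buckets rows in a dict instead of relying on sort-then-`groupby`; it produces the same report, and links keep their discovery order within each page:

```python
import csv
from collections import defaultdict

# Bucket broken-link rows by the page that linked to them
by_page = defaultdict(list)
with open('items.csv') as f:
    for row in csv.DictReader(f):
        if row['prev_page']:  # skip the start URL row, which has no referrer
            by_page[row['prev_page']].append(row)

for page, links in by_page.items():
    print('PAGE:', page)
    for line in links:
        print('  LINK TEXT:', line['prev_link_text'])
        print('  LINK URL:', line['prev_link_url'])
        print()
    print()
```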