Skip to content

Instantly share code, notes, and snippets.



Forked from mdamien/
Created Jul 7, 2017
What would you like to do?
404 link detector with scrapy

List all the broken links on your website


python3 and scrapy (pip install scrapy)


  • scrapy runspider spider.py -o items.csv -a site="https://yoursite.com"
  • python3 report.py
import scrapy


class BrokenLinksSpider(scrapy.Spider):
    """Crawl a site and yield one item per link that responds with 404 or 500.

    Each yielded item records the broken URL, its HTTP status, and the page,
    href, and anchor text of the link that led to it.
    """

    name = 'brokenlink-checker'
    # Let these error statuses reach parse() instead of being filtered
    # out by scrapy's default HttpErrorMiddleware.
    handle_httpstatus_list = [404, 500]

    def __init__(self, site, *args, **kwargs):
        """``site`` is the start URL, e.g. "https://example.com"."""
        super().__init__(*args, **kwargs)
        self.start_urls = [site]
        # Domain part of the start URL ("example.com" from
        # "https://example.com") — used to stay on the target site.
        self.DOMAIN = site.split('//')[1]

    def parse(self, response):
        if response.status in (404, 500):
            # .get(): the start URL carries no prev_* meta, so indexing
            # with [] would raise KeyError if the start page itself errors.
            yield {
                'url': response.url,
                'prev_page': response.meta.get('prev_url'),
                'prev_link_url': response.meta.get('prev_href'),
                'prev_link_text': response.meta.get('prev_link_text'),
                'status': response.status,
            }
        # Only follow links found on pages of the target domain; off-site
        # pages are checked for their status but not crawled further.
        if self.DOMAIN in response.url:
            for link in response.css('a'):
                href = link.xpath('@href').extract()
                text = link.xpath('text()').extract()
                if href:  # maybe should show an error if no href
                    yield response.follow(link, self.parse, meta={
                        'prev_link_text': text,
                        'prev_href': href,
                        'prev_url': response.url,
                    })
import csv
import itertools

# Print a per-page report of the broken links collected in items.csv.
#
# groupby only merges *adjacent* rows with the same key, and the crawler
# writes rows in crawl order, so sort by source page first — otherwise a
# page whose broken links are interleaved with others would be printed as
# several fragmented groups.
with open('items.csv', newline='') as f:
    items = sorted(csv.DictReader(f), key=lambda item: item['prev_page'] or '')

for page, links in itertools.groupby(items, lambda item: item['prev_page']):
    if page:  # skip rows with no source page (e.g. the start URL itself)
        print('PAGE:', page)
        for line in links:
            print(' LINK TEXT:', line['prev_link_text'])
            print(' LINK URL:', line['prev_link_url'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment