Skip to content

Instantly share code, notes, and snippets.

@adamstraube
Last active August 29, 2015 14:16
Show Gist options
  • Save adamstraube/7328a78431d8fcbddaa0 to your computer and use it in GitHub Desktop.
Save adamstraube/7328a78431d8fcbddaa0 to your computer and use it in GitHub Desktop.
Check .htaccess Redirect and RedirectMatch directives for broken links (404)
from bs4 import BeautifulSoup # For processing HTML
from bs4 import BeautifulStoneSoup # For processing XML
from mechanize import Browser
from urllib2 import HTTPError
# Hardcoded settings: the site prepended to relative redirect paths, and the
# file that is scanned (despite its name, dir_with_htaccess holds the path of
# the .htaccess file itself, not of a directory).
base_url = "http://www.test.com"
dir_with_htaccess = "/var/www/html/.htaccess"

# Running totals, updated through ``global`` statements in the functions
# below: URLs submitted for checking, and requests that ended in an error.
counter = bad_count = 0
def scan_for_404(url):
    """Request ``url`` and report whether it could be fetched.

    One line describing the outcome is printed. When the request fails, the
    module-level ``bad_count`` is incremented.

    Args:
        url: URL to open with a fresh mechanize ``Browser``.

    Returns:
        True if the request succeeded; False if the server answered with an
        HTTP error status or the request could not be completed at all.
    """
    global bad_count

    # Imported here because the module header only brings in HTTPError.
    from urllib2 import URLError

    browser = Browser()
    browser.set_handle_equiv(True)
    # robots.txt handling is switched off so every redirect URL is requested.
    browser.set_handle_robots(False)

    try:
        browser.open(url)
    except HTTPError as e:
        # The doubled spaces reproduce the output of the previous
        # comma-separated print statement exactly.
        print("Error code:  %s  on url:  %s" % (e.code, url))
        bad_count += 1
        return False
    except URLError as e:
        # HTTPError subclasses URLError, so this branch only sees failures
        # without an HTTP status (DNS errors, refused connections, ...).
        # Counting them as failed links keeps one unreachable host from
        # aborting the whole scan.
        print("Error: %s on url: %s" % (e.reason, url))
        bad_count += 1
        return False

    print("Successful access:  %s" % (url,))
    return True
# scan the line for the required pattern
def check_line(line):
    """Check both URLs of one permanent ``Redirect``/``RedirectMatch`` line.

    The line is split on whitespace. It is processed only when its first
    field is the ``Redirect`` or ``RedirectMatch`` directive and its second
    field is the ``permanent`` status keyword (both compared
    case-insensitively); every other line, including commented-out
    directives such as ``#Redirect ...``, is ignored.

    For a matching line the third field (source) is prefixed with
    ``base_url`` and the last field (target) is prefixed with ``base_url``
    unless it already starts with ``http://`` or ``https://``. Both URLs are
    passed to ``scan_for_404`` and the module-level ``counter`` grows by 2.

    NOTE(review): for ``RedirectMatch`` the source field is a regular
    expression according to the Apache documentation; it is requested
    literally here, exactly as before -- confirm that this is intended.

    Args:
        line: One raw line read from the .htaccess file.
    """
    global counter

    fields = line.split()
    # Directive, status, source and target must all be present; shorter
    # lines are skipped before anything is counted.
    if len(fields) < 4:
        return
    if fields[0].lower() not in ('redirect', 'redirectmatch'):
        return
    if fields[1].lower() != 'permanent':
        return

    source = base_url + fields[2]
    target = fields[-1]
    # Test the scheme prefix rather than searching for "http" anywhere, so a
    # relative path such as /docs/http-guide is still treated as relative.
    if not target.lower().startswith(('http://', 'https://')):
        target = base_url + target

    counter += 2
    scan_for_404(source)
    scan_for_404(target)
# Feed every line of the configured .htaccess file through check_line().
with open(dir_with_htaccess, 'r') as htaccess_file:
    for htaccess_line in htaccess_file:
        check_line(htaccess_line)

# Summary, built only after the scan so the totals are final.
summary_lines = (
    "-------------------------------",
    str(counter) + " web links checked",
    str(bad_count) + " web links failed with a 404 error",
)
for summary_line in summary_lines:
    print(summary_line)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment