Last active
August 29, 2015 14:16
-
-
Save adamstraube/7328a78431d8fcbddaa0 to your computer and use it in GitHub Desktop.
Check .htaccess Redirect and RedirectMatch directives for broken links (404)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup # For processing HTML | |
from bs4 import BeautifulStoneSoup # For processing XML | |
from mechanize import Browser | |
from urllib2 import HTTPError | |
# Hardcoded settings: the site whose URLs are probed and the .htaccess
# file whose redirect lines are read.
base_url = "http://www.test.com"
dir_with_htaccess = "/var/www/html/.htaccess"

# Running totals: URLs probed so far and URLs that came back with an error.
counter = bad_count = 0
def scan_for_404(url):
    """Open *url* and report whether it could be fetched.

    A fresh mechanize ``Browser`` is used for every call, with http-equiv
    handling enabled and robots.txt handling disabled.  The outcome is
    printed, and every failure increments the module-level ``bad_count``.

    Args:
        url: Absolute URL to request.

    Returns:
        True when the URL opened without error, False when the server
        answered with an HTTP error status or could not be reached at all.
    """
    # URLError is the parent class of HTTPError; catching it keeps one
    # unreachable host from aborting the whole scan.
    from urllib2 import URLError

    global bad_count

    br = Browser()
    br.set_handle_equiv(True)
    br.set_handle_robots(False)

    # Only the network call lives inside the try so unrelated errors are
    # not mistaken for a failed link.
    try:
        response = br.open(url)
    except HTTPError as e:
        # The double spaces reproduce the output of the original
        # comma-separated print statement.
        print("Error code:  %s  on url:  %s" % (e.code, url))
        bad_count += 1
        return False
    except URLError as e:
        print("Connection failed:  %s  on url:  %s" % (e.reason, url))
        bad_count += 1
        return False

    # The body is never read, so release the connection right away.
    response.close()
    print("Successful access:  " + url)
    return True
# scan the line for the required pattern | |
def check_line(line):
    """Probe both URLs of a permanent redirect found on one .htaccess line.

    The line is split on whitespace.  It is acted on only when the first
    token is the ``Redirect`` or ``RedirectMatch`` directive and the second
    token is the ``permanent`` status keyword (or its numeric form ``301``);
    both are compared case-insensitively.  The third token is taken as the
    redirect source and the last token as the redirect target.  Every other
    line, including blank and too-short ones, is ignored.

    For a matching line the module-level ``counter`` grows by two and
    ``scan_for_404`` is called once for the source and once for the target.

    Args:
        line: One raw line of the .htaccess file.
    """
    global counter

    tokens = line.split()
    # Directive, status and source are the minimum needed; this also skips
    # blank lines without relying on an IndexError.
    if len(tokens) < 3:
        return

    # Exact comparison, so "#Redirect ..." comments and other directives
    # that merely contain the word "Redirect" are not picked up.
    if tokens[0].lower() not in ('redirect', 'redirectmatch'):
        return
    if tokens[1].lower() not in ('permanent', '301'):
        return

    redir_from = tokens[2]
    redir_to = tokens[-1]

    # A target is absolute only when it *starts* with a scheme; a path that
    # merely contains "http" still needs the site prefix.
    if not redir_to.lower().startswith(('http://', 'https://')):
        redir_to = base_url + redir_to
    # NOTE(review): for RedirectMatch the source is a regular expression,
    # so the URL probed here is the literal pattern text - confirm whether
    # such sources should be probed at all.
    redir_from = base_url + redir_from

    # Count only once both URLs are known, so a malformed line can no
    # longer inflate the total.
    counter += 2
    scan_for_404(redir_from)
    scan_for_404(redir_to)
# Feed every line of the configured .htaccess file through the checker.
with open(dir_with_htaccess, 'r') as htaccess_file:
    for raw_line in htaccess_file:
        check_line(raw_line)

# Summary: how many URLs were probed and how many of them failed.
print("-------------------------------")
print("%d web links checked" % counter)
print("%d web links failed with a 404 error" % bad_count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment