Skip to content

Instantly share code, notes, and snippets.

@nst
Created March 5, 2014 14:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nst/9368839 to your computer and use it in GitHub Desktop.
Save nst/9368839 to your computer and use it in GitHub Desktop.
Test dead links from a web page list. Written to help someone on a newsgroup.
#!/usr/bin/python
"""Test dead links from a web page list.

Written to help someone on a newsgroup.

Usage::

    $ cat links.txt
    http://seriot.ch/index.php
    http://apple.com
    $ python checkurl.py links.txt
    http://seriot.ch/index.php
      http://www.hjkbkhjdsf.com/ <- dead link
    http://apple.com

Each page URL from the list is printed; links found on that page that
cannot be opened are printed indented below it.
"""
# The docstring must come first (and use exactly three quotes) to be picked
# up as the module's __doc__; the metadata dunders follow it.
__version__ = "$Revision: 0.2 $"
__author__ = "Nicolas Seriot"
__date__ = "2005-07-20"
import sys
from sets import *
from urllib2 import *
from re import *
def urls(base_url, html):
    """Return the set of checkable absolute URLs linked from a page.

    Scans ``html`` for ``<a href="...">`` anchors (double-quoted ``href``
    immediately following ``<a``), skips empty hrefs and pure fragment links
    (``#...``), and resolves every remaining link against ``base_url``.

    Args:
        base_url: URL of the page ``html`` was fetched from; relative links
            are resolved against it. It is never modified.
        html: Page markup, as text or as raw bytes (bytes are decoded as
            UTF-8 with undecodable bytes replaced).

    Returns:
        A builtin ``set`` of absolute URL strings whose scheme is ``http``,
        ``https`` or ``ftp``. Links such as ``mailto:`` or ``javascript:``
        are left out because they cannot be fetched and would otherwise be
        reported as dead.
    """
    # Imported locally so the function does not depend on the module's
    # star-imports and works on both Python 2 and Python 3.
    import re
    try:
        from urllib.parse import urljoin, urlsplit  # Python 3
    except ImportError:
        from urlparse import urljoin, urlsplit  # Python 2

    # On Python 3 a fetched page arrives as bytes; on Python 2 ``bytes`` is
    # ``str`` and the input is used unchanged.
    if isinstance(html, bytes) and not isinstance(html, str):
        html = html.decode("utf-8", "replace")

    # ``[^#"]`` rejects both fragment-only links and empty hrefs; ``[^"]*``
    # stops at the closing quote, so a match can never run into later markup.
    hrefs = re.findall(r'<a href="([^#"][^"]*)"', html)

    checkable_schemes = ("http", "https", "ftp")
    links = set()
    for href in hrefs:
        # urljoin handles "./x", "../x", "/x", "x" and already-absolute
        # links according to the standard URL resolution rules.
        absolute = urljoin(base_url, href.strip())
        if urlsplit(absolute).scheme in checkable_schemes:
            links.add(absolute)
    return links
def main(argv=None):
    """Check every page listed in a text file and print its dead links.

    Args:
        argv: Command-line arguments; defaults to ``sys.argv``. ``argv[1]``
            names a text file holding one page URL per line.

    Each non-blank page URL is printed on its own line. Every link returned
    by ``urls()`` for that page is then requested with a ``Referer`` header
    set to the page URL; links that cannot be opened are printed indented
    by two spaces.

    Returns:
        The process exit status: 1 when the file argument is missing
        (a usage line is printed), 0 otherwise.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("USAGE: %s file.txt" % argv[0])
        return 1

    # ``with`` guarantees the list file is closed even if checking aborts.
    with open(argv[1]) as listing:
        for line in listing:
            # Strip first: a raw line still carries its newline, so testing
            # it before stripping would never detect a blank line.
            base_url = line.strip()
            if not base_url:
                continue
            print(base_url)

            try:
                page = urlopen(base_url)
                try:
                    html = page.read()
                finally:
                    page.close()
            except Exception:
                # Best effort: an unreachable page has no links to test.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                sys.stderr.write("cannot open %s\n" % base_url)
                continue

            for url in urls(base_url, html):
                try:
                    req = Request(url)
                    req.add_header("Referer", base_url)
                    # Only reachability matters; release the connection.
                    urlopen(req).close()
                except Exception:
                    print("  %s" % url)
    return 0


# Run only when executed as a script, so importing the module no longer
# triggers the usage check and sys.exit().
if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment