Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
A small script that scans the files under a directory for URLs and reports the HTTP status of each one.
# for the love of bacon!
# by Ian Mariano @ianmariano
# Free and provided AS-IS with no warranty. No license required.
import os
import mimetypes
import re
import sys
import urllib
if len(sys.argv) < 2:
    # Without a start directory there is nothing to walk. Stop here instead
    # of falling through to an IndexError when sys.argv[1] is read later.
    print("You must specify the start directory!")
    sys.exit(1)
# regex to detect .dir in path
# Matches a '/' followed by '.' and then any run of non-'/' characters, i.e.
# a path component whose name starts with a dot. should_ignore() applies it
# to the directory path of each file.
igregex = re.compile(r'\/\.[^\/]*', re.IGNORECASE)
# simple URL match regex
# Optional 'http:' / 'https:' scheme, then '//', then one or more
# dot-terminated labels (alphanumeric runs joined by single hyphens) and a
# final label of at least two letters. Only the scheme and host are matched;
# any path or query string after the host is not part of the match.
# The outermost group (index 0 of each findall() tuple) is the whole match.
urlregex = re.compile(r'((http[s]?:)?\/\/([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,})', re.IGNORECASE)
# explicit extensions to ignore
# Compared against the lower-cased result of os.path.splitext(name)[1], so
# entries are expected to be lower-case and to include the leading dot.
# NOTE(review): the entries and the closing bracket of this set literal are
# not present in this copy of the file -- restore them from the original.
igexts = set([
# explicit mime types to ignore
# Compared against the lower-cased type returned by mimetypes.guess_type().
# NOTE(review): this set literal is also truncated in this copy.
igmimes = set([
def should_ignore(r, f):
    """Return True when file `f` in directory `r` should not be scanned.

    A file is skipped when any of the following holds:

    * the directory path `r` contains a component starting with a dot
      (``igregex`` matches it),
    * the file name itself starts with a dot,
    * its lower-cased extension is listed in ``igexts``,
    * its guessed MIME type starts with ``image/``, ``audio/``, ``video/``
      or ``application/vnd.``, or is listed in ``igmimes``.

    Files that are kept are echoed to stdout together with their guessed
    MIME type (``None`` when no type could be guessed).

    Args:
        r: directory path that contains the file.
        f: bare file name, without any directory part.

    Returns:
        True to skip the file, False to scan it.
    """
    # search() stops at the first hit; collecting every match with findall()
    # just to test for "at least one" is unnecessary work.
    if igregex.search(r):
        return True
    if f.startswith('.'):
        return True
    if os.path.splitext(f)[1].lower() in igexts:
        return True

    m = mimetypes.guess_type(f)[0]
    if m:
        m = m.lower()
        # These MIME families are rejected wholesale; startswith() accepts a
        # tuple, so one call covers all prefixes.
        if m.startswith(('image/', 'application/vnd.', 'audio/', 'video/')):
            return True
        # simple case
        if m in igmimes:
            return True

    # Single-argument parenthesised print behaves identically on Python 2.
    print("{}: {}".format(f, m))
    return False
# Directory tree to scan, taken from the first command-line argument.
start = sys.argv[1]

# Collect the full path of every file under `start` that is worth scanning.
files = []
for root, dirs, filenames in os.walk(start):
    for f in filenames:
        # should_ignore() returns True for files that must be skipped, so
        # only the files it does NOT flag are queued.
        if not should_ignore(root, f):
            files.append(os.path.join(root, f))

print("Scanning {} files...".format(len(files)))
# URLs that have already been requested. A set gives O(1) membership tests,
# and every URL is recorded the first time it is seen so repeats are not
# fetched again.
cache = set()

for path in files:
    print(path)
    with open(path, 'r') as f:
        # i is the 1-based line number reported next to each URL.
        for i, line in enumerate(f, 1):
            # Each findall() result is a tuple of groups; group 0 is the
            # complete matched URL.
            for u in urlregex.findall(line):
                url = u[0]
                if url in cache:
                    print(" {}: {} (cached)".format(i, url))
                    continue
                cache.add(url)

                # Protocol-relative URLs need a scheme before being fetched.
                if url.startswith('//'):
                    url = 'http:' + url

                # urllib.urlopen is the Python 2 API this script targets.
                try:
                    response = urllib.urlopen(url)
                    try:
                        code = response.getcode()
                    finally:
                        # Always release the connection, even if reading
                        # the status fails.
                        response.close()
                except IOError as e:
                    print(" {}: {} ERROR {}".format(i, url, e.strerror))
                else:
                    print(" {}: {} {}".format(i, url, code))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.