erhangundogan/gist:5932474

## gistfile1.py
from httplib import BadStatusLine
import re
import threading
import urllib2
import Queue

# Matches <a href="http..."> or <a href='http...'> and captures the absolute
# URL. Relative URLs are not extracted.
_LINK_PATTERN = re.compile(r"""<a href=["'](http.*?)["'].*?>""")

# GetLinks runs on several worker threads at once; this lock makes the
# "not seen yet -> mark as seen" step on the shared ``s`` set atomic so the
# same URL is never queued twice.
_SEEN_LOCK = threading.Lock()


def GetLinks(url):
    """Fetch ``url`` and queue every absolute link on it that is not yet seen.

    The page is requested with a browser-like User-Agent. Each ``http...``
    link found in an ``<a href=...>`` tag is compared (case-insensitively)
    against the module-level set ``s``; unseen links are recorded in ``s``,
    printed, and put on the module-level queue ``q`` for the workers.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        None. Results are delivered through ``s``, ``q`` and stdout.

    Raises:
        Nothing for ``urllib2.HTTPError``, ``urllib2.URLError`` or
        ``BadStatusLine`` -- such pages are skipped. Any other exception
        propagates to the caller.
    """
    # agent info
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    request = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(request)
        try:
            response_url = response.geturl()
            response_headers = response.info()
            content = response.read()
        finally:
            # Release the connection even if reading fails part-way.
            response.close()
    except (urllib2.HTTPError, urllib2.URLError, BadStatusLine):
        # Best effort: an unreachable or malformed page is simply skipped.
        return

    if not content:
        return

    links = set(_LINK_PATTERN.findall(content))
    print("\nThread: {0}\nRequest Addr: {1}\nResponse Addr: {2}\nTotal Link: {3}\nHeaders: {4}".format(
        threading.currentThread().getName(), url, response_url, len(links), response_headers))

    for link in links:
        # De-duplicate on the lowercased form, but fetch the link exactly as
        # written: URL paths are case-sensitive, so requesting a lowercased
        # URL can hit a different (or missing) resource.
        key = link.lower()
        with _SEEN_LOCK:
            if key in s:
                continue
            s.add(key)
        print("\t" + link)
        q.put(link)

def worker():
    """Thread target: crawl URLs taken from the shared queue ``q`` forever.

    Each address is passed to ``GetLinks``. The queue item is always marked
    done, even when crawling fails, because ``q.join()`` only returns once
    every item that was put on the queue has been acknowledged.
    """
    while True:
        address = q.get()
        try:
            GetLinks(address)
        except Exception as error:
            # Thread boundary: one bad page must not kill the worker, or the
            # pool shrinks and queued URLs are never processed.
            print("Skipping {0}: {1!r}".format(address, error))
        finally:
            q.task_done()

# Lowercased URLs that have already been handed to the queue (shared with
# GetLinks, which looks links up here in lowercased form).
s = set()
address = raw_input("Starting address to crawl:")
# Store the seed lowercased as well; otherwise a page linking back to a
# mixed-case start address would not be recognised as already seen.
s.add(address.lower())
# Work queue of URLs still to be fetched; consumed by worker().
q = Queue.Queue()
q.put(address)
for _ in range(10):
    t = threading.Thread(target=worker)
    # Workers loop forever, so they must be daemon threads for the process to
    # exit once the main thread finishes.
    t.daemon = True
    t.start()

# Block until every queued URL has been marked done by a worker.
q.join()
	from httplib import BadStatusLine
	import re
	import threading
	import urllib2
	import Queue

	# Matches <a href="http..."> or <a href='http...'> and captures the absolute
	# URL. Relative URLs are not extracted.
	_LINK_PATTERN = re.compile(r"""<a href=["'](http.*?)["'].*?>""")

	# GetLinks runs on several worker threads at once; this lock makes the
	# "not seen yet -> mark as seen" step on the shared ``s`` set atomic so the
	# same URL is never queued twice.
	_SEEN_LOCK = threading.Lock()


	def GetLinks(url):
	    """Fetch ``url`` and queue every absolute link on it that is not yet seen.

	    The page is requested with a browser-like User-Agent. Each ``http...``
	    link found in an ``<a href=...>`` tag is compared (case-insensitively)
	    against the module-level set ``s``; unseen links are recorded in ``s``,
	    printed, and put on the module-level queue ``q`` for the workers.

	    Args:
	        url: Absolute URL of the page to fetch.

	    Returns:
	        None. Results are delivered through ``s``, ``q`` and stdout.

	    Raises:
	        Nothing for ``urllib2.HTTPError``, ``urllib2.URLError`` or
	        ``BadStatusLine`` -- such pages are skipped. Any other exception
	        propagates to the caller.
	    """
	    # agent info
	    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
	    request = urllib2.Request(url, headers=headers)
	    try:
	        response = urllib2.urlopen(request)
	        try:
	            response_url = response.geturl()
	            response_headers = response.info()
	            content = response.read()
	        finally:
	            # Release the connection even if reading fails part-way.
	            response.close()
	    except (urllib2.HTTPError, urllib2.URLError, BadStatusLine):
	        # Best effort: an unreachable or malformed page is simply skipped.
	        return

	    if not content:
	        return

	    links = set(_LINK_PATTERN.findall(content))
	    print("\nThread: {0}\nRequest Addr: {1}\nResponse Addr: {2}\nTotal Link: {3}\nHeaders: {4}".format(
	        threading.currentThread().getName(), url, response_url, len(links), response_headers))

	    for link in links:
	        # De-duplicate on the lowercased form, but fetch the link exactly as
	        # written: URL paths are case-sensitive, so requesting a lowercased
	        # URL can hit a different (or missing) resource.
	        key = link.lower()
	        with _SEEN_LOCK:
	            if key in s:
	                continue
	            s.add(key)
	        print("\t" + link)
	        q.put(link)

	def worker():
	    """Thread target: crawl URLs taken from the shared queue ``q`` forever.

	    Each address is passed to ``GetLinks``. The queue item is always marked
	    done, even when crawling fails, because ``q.join()`` only returns once
	    every item that was put on the queue has been acknowledged.
	    """
	    while True:
	        address = q.get()
	        try:
	            GetLinks(address)
	        except Exception as error:
	            # Thread boundary: one bad page must not kill the worker, or the
	            # pool shrinks and queued URLs are never processed.
	            print("Skipping {0}: {1!r}".format(address, error))
	        finally:
	            q.task_done()

	# Lowercased URLs that have already been handed to the queue (shared with
	# GetLinks, which looks links up here in lowercased form).
	s = set()
	address = raw_input("Starting address to crawl:")
	# Store the seed lowercased as well; otherwise a page linking back to a
	# mixed-case start address would not be recognised as already seen.
	s.add(address.lower())
	# Work queue of URLs still to be fetched; consumed by worker().
	q = Queue.Queue()
	q.put(address)
	for _ in range(10):
	    t = threading.Thread(target=worker)
	    # Workers loop forever, so they must be daemon threads for the process to
	    # exit once the main thread finishes.
	    t.daemon = True
	    t.start()

	# Block until every queued URL has been marked done by a worker.
	q.join()