@erhangundogan
Created July 5, 2013 06:50
# Simple multithreaded crawler (Python 2): fetches pages, extracts
# absolute links with a regex, and feeds new ones back into a work queue.
from httplib import BadStatusLine
import re
import threading
import urllib2
import Queue
def GetLinks(url):
    try:
        # Spoof a browser user agent; some sites refuse unknown clients.
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        response_url = response.geturl()
        response_headers = response.info()
        content = response.read()
        if content:
            # Absolute links only; relative url extraction still needed.
            links = set(re.findall("<a href=[\"'](http.*?)[\"'].*?>", content))
            print "\nThread: {0}\nRequest Addr: {1}\nResponse Addr: {2}\nTotal Link: {3}\nHeaders: {4}"\
                .format(threading.currentThread().getName(), url, response_url, len(links), response_headers)
            for link in links:
                linklower = link.lower()
                # Note: this check-then-add on the shared set is not atomic,
                # so two threads may occasionally enqueue the same url.
                if linklower not in s:
                    print "\t" + linklower
                    s.add(linklower)
                    q.put(linklower)
    except (urllib2.HTTPError, urllib2.URLError, BadStatusLine):
        pass
def worker():
    # Pull addresses off the queue forever; daemon threads die with main.
    while True:
        address = q.get()
        try:
            GetLinks(address)
        except TypeError:
            pass
        q.task_done()
# Module-level shared state: visited set and work queue.
s = set()
address = raw_input("Starting address to crawl: ")
s.add(address)
q = Queue.Queue()
q.put(address)

# Ten daemon workers; q.join() blocks until every queued url is processed.
for i in range(10):
    t = threading.Thread(target=worker)
    t.daemon = True
    t.start()
q.join()
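
The comment in GetLinks notes that relative url extraction is still needed. A minimal sketch of how that could be bolted on, assuming the regex is loosened to capture every href value (not just ones starting with http) and using urlparse.urljoin from the Python 2 standard library; resolve_links is a hypothetical helper name, not part of the gist:

import urlparse

def resolve_links(base_url, hrefs):
    # Hypothetical helper: join each extracted href against the page url
    # so relative links ("/about", "page.html") become absolute before
    # they are queued. urljoin leaves already-absolute urls untouched.
    absolute = set()
    for href in hrefs:
        absolute.add(urlparse.urljoin(base_url, href))
    return absolute

Inside GetLinks this would sit between the re.findall call and the dedupe loop: resolve the raw matches against response_url first, then check each result against s and enqueue it.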