Skip to content

Instantly share code, notes, and snippets.

@sirwart
Last active February 20, 2016 23:23
Show Gist options
  • Save sirwart/10cb713e88e2873d86fc to your computer and use it in GitHub Desktop.
Save sirwart/10cb713e88e2873d86fc to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import collections
import httplib
import re
import threading
import traceback
from urlparse import urlsplit
class ConcurrentWorker(object):
    """Run a growing set of tasks on a fixed pool of worker threads.

    A task is a zero-argument callable that returns an iterable of follow-up
    tasks (an empty iterable or ``None`` when there are none).  Creating an
    instance starts the workers, seeds the queue with ``first_task`` and
    blocks until the queue is empty and no task is running any more.

    Args:
        num_threads: Number of worker threads to start; must be at least 1.
        first_task: The task used to seed the queue.

    Raises:
        ValueError: If ``num_threads`` is smaller than 1 (no worker could
            ever run ``first_task``, so waiting would never end).
    """

    def __init__(self, num_threads, first_task):
        if num_threads < 1:
            raise ValueError('num_threads must be at least 1')

        self.task_queue = collections.deque([first_task])
        # Guards task_queue and running_count and signals every change.
        self.lock = threading.Condition()
        self.running_count = 0

        workers = [threading.Thread(target=self.thread_loop)
                   for _ in range(num_threads)]
        for worker in workers:
            worker.start()

        # Block the caller until all work has been processed.
        with self.lock:
            while not self.is_done():
                self.lock.wait()

        # Every worker leaves its loop once is_done() holds; wait for that so
        # no worker is still alive when the constructor returns.
        for worker in workers:
            worker.join()

    def is_done(self):
        """Return True when no task is queued and none is running.

        Callers must hold ``self.lock``.
        """
        return not self.task_queue and self.running_count == 0

    def thread_loop(self):
        """Worker body: take tasks until the whole job is finished."""
        while True:
            with self.lock:
                # The condition wraps a re-entrant lock, so it is acquired
                # exactly once per iteration (by the ``with``): acquiring it
                # again after wait() would leave it held after one release.
                while not self.task_queue:
                    if self.running_count == 0:
                        return
                    self.lock.wait()
                task = self.task_queue.popleft()
                self.running_count += 1

            additional_tasks = []
            try:
                additional_tasks = list(task() or ())
            except Exception:
                # A failing task must not kill the worker or leave
                # running_count stuck above zero; report it and carry on.
                traceback.print_exc()
            finally:
                with self.lock:
                    self.running_count -= 1
                    self.task_queue.extend(additional_tasks)
                    self.lock.notify_all()
# Case-insensitive pattern for URL-like text.  A match starts with a
# "scheme:" prefix, a "www." style prefix or a "domain.tld/" prefix, may
# contain parenthesised groups (nested up to two levels), and must end either
# with such a group or with a character that is not whitespace, punctuation
# or a closing quote.
# NOTE(review): the ``ur''`` literal prefix is Python 2 only.
link_regex = re.compile(ur'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')
def find_links(url):
    """Fetch ``url`` and return the links in its body that stay on its host.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A list with every substring of the response body that matches
        ``link_regex`` and whose network location equals that of ``url``.
        The list is empty when the response declares a content type that
        does not start with ``text/html``.

    Raises:
        Whatever ``httplib`` raises for connection or protocol errors.
    """
    url_obj = urlsplit(url)
    if url_obj.scheme == 'https':
        conn = httplib.HTTPSConnection(url_obj.netloc)
    else:
        conn = httplib.HTTPConnection(url_obj.netloc)

    # An empty path is not a valid request target; ask for the root instead,
    # and keep the query string so the right resource is requested.
    target = url_obj.path or '/'
    if url_obj.query:
        target += '?' + url_obj.query

    try:
        conn.request('GET', target)
        response = conn.getresponse()
        content_type = response.getheader('content-type')
        # A response without a content type is still scanned.
        if content_type is not None and not content_type.startswith('text/html'):
            return []
        contents = response.read()
    finally:
        # Release the socket on every path, including the early return.
        conn.close()

    same_host_links = []
    for match in link_regex.finditer(contents):
        found_url = match.group()
        try:
            found_netloc = urlsplit(found_url).netloc
        except ValueError:
            # urlsplit rejects some malformed candidates (for example a
            # broken IPv6 literal); such text is not a usable link.
            continue
        if found_netloc == url_obj.netloc:
            same_host_links.append(found_url)
    return same_host_links
links_found = set()
links_found_lock = threading.Lock()
def create_find_links_task(url):
def task():
print 'Finding links in', url, 'on thread', threading.current_thread()
res = find_links(url)
continue_searching = True
links_found_lock.acquire()
task_res = []
for found_url in res:
if not links_found.issuperset([found_url]):
links_found.update([found_url])
new_task = create_find_links_task(found_url)
task_res.append(new_task)
if len(links_found) > 50:
continue_searching = False
links_found_lock.release()
if continue_searching:
return task_res
else: return []
return task
if __name__ == '__main__':
root_url = 'http://justin.harmonize.fm'
links_found.update([root_url])
first_task = create_find_links_task(root_url)
ConcurrentWorker(2, first_task)
print 'links_found=', links_found
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment