Last active
February 20, 2016 23:23
-
-
Save sirwart/10cb713e88e2873d86fc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import httplib
import re
import threading
from urlparse import urlsplit
class ConcurrentWorker(object):
    """Run tasks from a shared queue on a pool of worker threads.

    Each task is a callable that returns a list of follow-up tasks; the
    follow-ups are appended to the queue. The constructor blocks until the
    queue is empty and no task is still running.
    """

    def __init__(self, num_threads, first_task):
        # Pending task callables; guarded by self.lock.
        self.task_queue = [first_task]
        # One Condition guards both task_queue and running_count.
        self.lock = threading.Condition()
        # Number of tasks currently executing outside the lock.
        self.running_count = 0
        for _ in range(num_threads):
            thread = threading.Thread(target=self.thread_loop)
            thread.start()
        # Block the caller until all work has drained.
        self.lock.acquire()
        try:
            while not self.is_done():
                self.lock.wait()
        finally:
            self.lock.release()

    def is_done(self):
        # Caller must hold self.lock.
        return len(self.task_queue) == 0 and self.running_count == 0

    def thread_loop(self):
        # BUG FIX: the original re-acquired the (recursive) condition lock at
        # the top of every loop iteration but only released that acquisition
        # on the task path, so any thread that passed through wait() exited
        # the loop still holding the lock and deadlocked everyone else.
        # Acquire exactly once here; release exactly once in the finally.
        self.lock.acquire()
        try:
            while True:
                if self.is_done():
                    break
                elif len(self.task_queue) == 0:
                    self.lock.wait()
                else:
                    task = self.task_queue.pop(0)
                    self.running_count += 1
                    # Run the task without holding the lock so other
                    # workers can make progress concurrently.
                    self.lock.release()
                    additional_tasks = task()
                    self.lock.acquire()
                    self.running_count -= 1
                    self.task_queue.extend(additional_tasks)
                    # Wake idle workers and the waiting constructor.
                    self.lock.notify_all()
        finally:
            self.lock.release()
# Liberal URL-matching pattern (John Gruber's well-known "liberal URL regex"
# style): matches scheme://, www., or bare-domain links while excluding
# trailing punctuation and unbalanced parentheses.  The ur'' prefix is a
# Python 2 raw unicode literal (needed for the non-ASCII quote characters
# in the trailing-punctuation class).
link_regex = re.compile(ur'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')
def find_links(url):
    """Fetch `url` over HTTP and return the links on the page that share
    its host (netloc).

    Returns an empty list when the response's Content-Type is not
    text/html.  Only same-host links are kept, so the crawl never leaves
    the starting site.
    """
    url_obj = urlsplit(url)
    conn = httplib.HTTPConnection(url_obj.netloc)
    try:
        # BUG FIX: an empty path (e.g. 'http://host') must be requested
        # as '/'; httplib rejects an empty request target.
        conn.request('GET', url_obj.path or '/')
        response = conn.getresponse()
        # httplib lower-cases header names in getheaders().
        for header_name, header_value in response.getheaders():
            if header_name == 'content-type':
                if not header_value.startswith('text/html'):
                    return []
                break
        contents = response.read()
    finally:
        # BUG FIX: the original never closed the connection (socket leak).
        conn.close()
    links_found = []
    for match in link_regex.finditer(contents):
        found_url = match.group()
        if urlsplit(found_url).netloc == url_obj.netloc:
            links_found.append(found_url)
    return links_found
# URLs discovered so far, shared by all worker threads.
links_found = set()
# Guards links_found; ConcurrentWorker's condition only guards its own queue.
links_found_lock = threading.Lock()
def create_find_links_task(url): | |
def task(): | |
print 'Finding links in', url, 'on thread', threading.current_thread() | |
res = find_links(url) | |
continue_searching = True | |
links_found_lock.acquire() | |
task_res = [] | |
for found_url in res: | |
if not links_found.issuperset([found_url]): | |
links_found.update([found_url]) | |
new_task = create_find_links_task(found_url) | |
task_res.append(new_task) | |
if len(links_found) > 50: | |
continue_searching = False | |
links_found_lock.release() | |
if continue_searching: | |
return task_res | |
else: return [] | |
return task | |
if __name__ == '__main__':
    root_url = 'http://justin.harmonize.fm'
    # Pre-mark the root as found so it is never re-queued by a worker.
    links_found.update([root_url])
    first_task = create_find_links_task(root_url)
    # The constructor blocks until the task queue drains, so this line
    # does not return until the crawl is finished.
    ConcurrentWorker(2, first_task)
    print 'links_found=', links_found
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment