Created
June 16, 2013 16:34
-
-
Save homm/5792571 to your computer and use it in GitHub Desktop.
Python thread test
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib
import thread
import time
import urllib2

from bs4 import BeautifulSoup
from yurl import URL
class Spider(object):
    """Crawl a single site with a bounded number of worker threads.

    The spider keeps a FIFO list of URLs still to fetch (``tasks``) and a
    set of every URL that was ever queued (``visited``).  ``run`` hands
    queued URLs to short-lived threads; each thread downloads one page,
    extracts its links and queues the ones that start with ``base_url``.
    """

    def __init__(self, base_url, max_threads=50):
        """Prepare a crawl rooted at *base_url*.

        Args:
            base_url: Start URL; also the prefix a discovered link must
                have in order to be queued.
            max_threads: Upper bound on concurrently running fetch threads.
        """
        self.base_url = base_url
        self.max_threads = max_threads
        # Number of fetch threads currently alive.
        self.current_threads = 0
        # URLs waiting to be fetched, oldest first.
        self.tasks = [base_url]
        # Every URL ever queued (pending, in flight, or finished).
        self.visited = {base_url}
        # Guards tasks, visited and current_threads.
        self._lock = thread.allocate_lock()

    def run(self):
        """Dispatch queued URLs to threads until nothing is left to do.

        Prints a statistics line on every pass, starts as many threads as
        the limit allows, and polls every 0.5 s.  Returns once no thread is
        running and the queue is empty.
        """
        while True:
            print('>>> STATS now: {} todo: {} done: {}'.format(
                self.current_threads, len(self.tasks),
                len(self.visited) - len(self.tasks) - self.current_threads))
            # Only this method removes from self.tasks, so testing it
            # outside the lock cannot race with another consumer.
            while self.tasks and self.current_threads < self.max_threads:
                with self._lock:
                    task = self.tasks.pop(0)
                    self.current_threads += 1
                thread.start_new_thread(self.new_thread, (task,))
            # Workers queue their links before decrementing the counter,
            # so "no threads and no tasks" means the crawl is finished.
            if not self.current_threads and not self.tasks:
                break
            time.sleep(0.5)
        print("All done.")

    def add_task(self, task):
        """Queue *task* unless it has been queued before."""
        with self._lock:
            if task not in self.visited:
                self.visited.add(task)
                self.tasks.append(task)

    def new_thread(self, task):
        """Thread body: fetch *task* and queue the in-site links it holds.

        The running-thread counter is decremented on every exit path so
        that ``run`` can always terminate.
        """
        try:
            start = time.time()
            print("Task {} running.".format(task))
            try:
                response = urllib2.urlopen(task)
            except urllib2.URLError as e:
                # URLError is the parent of HTTPError, so this also covers
                # DNS and connection failures, not only HTTP status errors.
                print("Eception {}".format(e))
                return
            # Close the connection as soon as the body has been read.
            with contextlib.closing(response):
                # geturl() is the final URL after redirects; relative
                # links must be resolved against it, not the site root.
                page_url = URL(response.geturl())
                page = BeautifulSoup(response.read())
            # Anchors without an href carry no link to follow.
            for a in page.find_all('a', href=True):
                link = page_url + URL(a['href'])
                # Drop the fragment so "#section" variants count as one page.
                link = str(link.replace(fragment=''))
                # NOTE(review): a plain prefix test also accepts hosts such
                # as "<base_url>.example.org" -- confirm this is acceptable.
                if link.startswith(self.base_url):
                    self.add_task(link)
            print("Task {} done in {} sec.".format(task, time.time() - start))
        finally:
            with self._lock:
                self.current_threads -= 1
if __name__ == '__main__':
    # Script entry point: crawl http://site.com with at most 5 fetch threads.
    Spider('http://site.com', 5).run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment