Skip to content

Instantly share code, notes, and snippets.

@homm
Created June 16, 2013 16:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save homm/5792571 to your computer and use it in GitHub Desktop.
Save homm/5792571 to your computer and use it in GitHub Desktop.
Python thread test
import time
import thread
import urllib2
from yurl import URL
from bs4 import BeautifulSoup
class Spider(object):
    """Crawl a single site, fetching pages in parallel worker threads.

    Starting from ``base_url``, every fetched page is scanned for ``<a>``
    links. A link is queued (once) when its absolute form, with the
    fragment removed, starts with ``base_url``. Each queued URL is fetched
    in its own thread, with at most ``max_threads`` fetches in flight.
    """

    def __init__(self, base_url, max_threads=50):
        """Set up the crawl state.

        :param base_url: URL to start from; also the prefix a discovered
            link must have in order to be followed.
        :param max_threads: upper bound on concurrently running fetches.
        """
        self.base_url = base_url
        self.max_threads = max_threads
        # Number of worker threads currently running.
        self.current_threads = 0
        # FIFO queue of URLs still waiting to be fetched.
        self.tasks = [base_url]
        # Every URL that was ever queued, so nothing is scheduled twice.
        self.visited = set([base_url])
        # Guards tasks, visited and current_threads.
        self._lock = thread.allocate_lock()

    def run(self):
        """Hand queued URLs to worker threads until the crawl is finished.

        Prints a progress line on every pass, sleeps half a second between
        passes, and returns once no worker is running and the queue is
        empty.
        """
        while True:
            # The counters are read without the lock; the line is purely
            # informational, so a momentarily stale value is acceptable.
            print('>>> STATS now: {} todo: {} done: {}'.format(
                self.current_threads, len(self.tasks),
                len(self.visited) - len(self.tasks) - self.current_threads))
            while self.tasks and self.max_threads > self.current_threads:
                # Count the worker *before* starting it, so that a fast
                # worker cannot decrement the counter first.
                with self._lock:
                    task = self.tasks.pop(0)
                    self.current_threads += 1
                thread.start_new_thread(self.new_thread, (task,))
            # Order matters: workers add tasks before they decrement the
            # counter, so once the counter reads zero the queue is stable.
            if not self.current_threads and not self.tasks:
                break
            time.sleep(0.5)
        print("All done.")

    def add_task(self, task):
        """Queue ``task`` for fetching unless it has been queued before.

        :param task: absolute URL to fetch.
        """
        # ``with`` releases the lock even if the body raises.
        with self._lock:
            if task not in self.visited:
                self.visited.add(task)
                self.tasks.append(task)

    def new_thread(self, task):
        """Fetch one URL and queue the same-site links found on the page.

        Runs inside a worker thread started by ``run``. The running-worker
        counter is always decremented on exit, whether the fetch succeeded,
        failed, or raised.

        :param task: absolute URL to fetch.
        """
        try:
            start = time.time()
            print("Task {} running.".format(task))
            try:
                response = urllib2.urlopen(task)
            except urllib2.URLError as e:
                # URLError is the base class of HTTPError, so this covers
                # both HTTP error statuses and connection-level failures.
                print("Eception {}".format(e))
                return
            try:
                page = BeautifulSoup(response.read())
            finally:
                # Release the connection as soon as the body is read.
                response.close()
            for a in page.find_all('a'):
                href = a.get('href')
                if not href:
                    # Anchors without an href have no target to follow.
                    continue
                link = URL(self.base_url) + URL(href)
                # Drop the fragment so "#section" variants of one page
                # are treated as the same URL.
                link = str(link.replace(fragment=''))
                # NOTE(review): a plain prefix test also accepts hosts that
                # merely begin with base_url (e.g. "http://site.com.x.org")
                # -- confirm whether a host comparison is wanted here.
                if link.startswith(self.base_url):
                    self.add_task(link)
            print("Task {} done in {} sec.".format(task, time.time() - start))
        finally:
            with self._lock:
                self.current_threads -= 1
if __name__ == '__main__':
    # Script entry point: crawl the example site with at most five
    # fetches running at the same time.
    Spider('http://site.com', max_threads=5).run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment