@jjjake
Forked from mikemccabe/parallel_md_get.py
Created September 25, 2012 22:31
Parallel archive.org metadata fetching using Python and gevent
# This demonstrates doing multiple metadata fetches in parallel.
# It seems to be fast enough that the json decoding cost becomes
# a significant proportion of the execution time.
# It requires gevent; see http://www.gevent.org/intro.html#installation
# To make this do something useful, modify do_work().
import gevent
import httplib
import urllib
import json
from gevent import monkey
monkey.patch_all()
from gevent import socket
from gevent.pool import Pool
from gevent import queue as g_queue
input_queue = g_queue.JoinableQueue(1000)
def queue_ids(ids, start_index=0, count=0):
    # Push (identifier, index) pairs onto the queue, honoring an optional
    # start index and count so only a slice of the input is processed.
    limit_index = start_index + count
    for i, id in enumerate(ids):
        if i < start_index:
            continue
        if limit_index != 0 and i >= limit_index:
            break
        id = id.strip()
        input_queue.put((id, i))

def get_url(url):
    f = urllib.urlopen(url)
    c = f.read()
    f.close()
    return c

def worker():
    # Each worker greenlet pulls IDs off the queue until the program exits.
    while True:
        id, i = input_queue.get()
        try:
            do_work(id, i)
        finally:
            input_queue.task_done()

def do_work(id, i):
    j = get_url("http://archive.org/metadata/" + id)
    o = json.loads(j)
    # n.b.: o might be empty if the ID couldn't be fetched for whatever reason
    print "%s %s %s %s %s" % (i, id, o.get('server', ''),
                              o.get('dir', ''), o.get('item_size', ''))
for i in range(20):  # 20 seems like a reasonable number - don't go nuts!
    gevent.spawn(worker)

queue_ids(open('ids.txt'), 0, 1000)  # just do 1000 lines, starting with the first
input_queue.join()
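
The header comment suggests modifying do_work() to make the script do something useful. Below is a minimal sketch of one such variant that writes each item's raw metadata JSON to a local file instead of printing a summary. The metadata/ output directory and the skip-on-empty behaviour are assumptions, not part of the original gist; it reuses get_url() and the json module already imported above.

import os

def do_work(id, i):
    # Sketch only: fetch the metadata and save the raw JSON to metadata/<id>.json.
    j = get_url("http://archive.org/metadata/" + id)
    o = json.loads(j)
    if not o:
        # As noted above, o can be empty if the ID couldn't be fetched.
        print "%s %s: no metadata returned, skipping" % (i, id)
        return
    if not os.path.isdir("metadata"):
        os.makedirs("metadata")
    with open(os.path.join("metadata", id + ".json"), "w") as f:
        f.write(j)
    print "%s %s saved (%d bytes)" % (i, id, len(j))

Input is expected to be one archive.org identifier per line in ids.txt. The script is written for Python 2 (print statements, urllib.urlopen), so run it with a gevent-enabled Python 2 interpreter, e.g. python parallel_md_get.py.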