@jjjake
Forked from mikemccabe/parallel_md_get.py
Created September 25, 2012 22:31
Parallel archive.org metadata fetching using Python and gevent
# This demonstrates doing multiple metadata fetches in parallel.
# It seems to be fast enough that the json decoding cost becomes
# a significant proportion of the execution time.
# It requires gevent; see http://www.gevent.org/intro.html#installation
# To make this do something useful, modify do_work().
import gevent
import httplib
import urllib
import json
from gevent import monkey
monkey.patch_all()
from gevent import socket
from gevent.pool import Pool
from gevent import queue as g_queue
input_queue = g_queue.JoinableQueue(1000)
def queue_ids(ids, start_index=0, count=0):
    # Push (identifier, index) pairs onto the queue, honoring an optional
    # start index and count so only a slice of the input is processed.
    limit_index = start_index + count
    for i, id in enumerate(ids):
        if i < start_index:
            continue
        if limit_index != 0 and i >= limit_index:
            break
        id = id.strip()
        input_queue.put((id, i))

def get_url(url):
    f = urllib.urlopen(url)
    c = f.read()
    f.close()
    return c

def worker():
    # Each worker greenlet pulls IDs off the queue until the program exits.
    while True:
        id, i = input_queue.get()
        try:
            do_work(id, i)
        finally:
            input_queue.task_done()

def do_work(id, i):
    j = get_url("http://archive.org/metadata/" + id)
    o = json.loads(j)
    # n.b.: o might be empty if the ID couldn't be fetched for whatever reason
    print "%s %s %s %s %s" % (i, id, o.get('server', ''),
                              o.get('dir', ''), o.get('item_size', ''))
for i in range(20):  # 20 seems like a reasonable number - don't go nuts!
    gevent.spawn(worker)

queue_ids(open('ids.txt'), 0, 1000)  # just do 1000 lines, starting with the first
input_queue.join()
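
The header comment suggests modifying do_work() to make the script do something useful. Below is a minimal sketch of one such variant that writes each item's raw metadata JSON to a local file instead of printing a summary. The metadata/ output directory and the skip-on-empty behaviour are assumptions, not part of the original gist; it reuses get_url() and the json module already imported above.

import os

def do_work(id, i):
    # Sketch only: fetch the metadata and save the raw JSON to metadata/<id>.json.
    j = get_url("http://archive.org/metadata/" + id)
    o = json.loads(j)
    if not o:
        # As noted above, o can be empty if the ID couldn't be fetched.
        print "%s %s: no metadata returned, skipping" % (i, id)
        return
    if not os.path.isdir("metadata"):
        os.makedirs("metadata")
    with open(os.path.join("metadata", id + ".json"), "w") as f:
        f.write(j)
    print "%s %s saved (%d bytes)" % (i, id, len(j))

Input is expected to be one archive.org identifier per line in ids.txt. The script is written for Python 2 (print statements, urllib.urlopen), so run it with a gevent-enabled Python 2 interpreter, e.g. python parallel_md_get.py.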