-
-
Save jjjake/3784862 to your computer and use it in GitHub Desktop.
Parallel archive.org metadata fetching using python and gevent
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This demonstrates doing multiple metadata fetches in parallel.
# It seems to be fast enough that the json decoding cost becomes
# a significant proportion of the execution time.
# It requires gevent; see http://www.gevent.org/intro.html#installation
# To make this do something useful, modify do_work().
import contextlib
import httplib
import json
import traceback
import urllib

import gevent
from gevent import monkey
monkey.patch_all()
from gevent import socket
from gevent.pool import Pool
from gevent import queue as g_queue
# Shared work queue of (id, index) tuples, filled by queue_ids() and drained
# by worker(); it holds at most 1000 pending entries.
input_queue = g_queue.JoinableQueue(maxsize=1000)
def queue_ids(ids, start_index=0, count=0):
    """Feed a slice of identifiers into the shared ``input_queue``.

    Each queued entry is an ``(identifier, index)`` tuple, where ``index``
    is the zero-based position of the identifier within ``ids``.

    ids -- iterable of identifier strings (for example an open file with
        one ID per line); surrounding whitespace is stripped from each.
    start_index -- position of the first entry to queue; earlier entries
        are skipped.
    count -- maximum number of entries to queue; 0 means no limit.
    """
    limit_index = start_index + count
    for i, raw_id in enumerate(ids):
        if i < start_index:
            continue
        # Test ``count`` rather than ``limit_index``: with count == 0 and a
        # non-zero start_index the limit equals start_index, which would
        # otherwise end the loop before anything is queued.
        if count != 0 and i >= limit_index:
            break
        input_queue.put((raw_id.strip(), i))
def get_url(url):
    """Fetch ``url`` and return the raw response body.

    The response object is closed even when reading it raises, so a failed
    read does not leak the underlying connection.
    """
    # The object returned by urllib.urlopen() is not itself a context
    # manager in Python 2, hence the closing() wrapper.
    with contextlib.closing(urllib.urlopen(url)) as response:
        return response.read()
def worker():
    """Consume ``(id, index)`` entries from ``input_queue`` forever.

    Every entry is handed to do_work() and then marked done on the queue,
    whether or not do_work() succeeded.  This function never returns; it is
    meant to run as a background greenlet.
    """
    while True:
        item_id, i = input_queue.get()
        try:
            do_work(item_id, i)
        except Exception:
            # A failure on one item (network error, undecodable response,
            # ...) is reported and skipped.  Letting it propagate would
            # leave the loop and end this worker, so fewer and fewer
            # workers would remain to drain the queue.
            traceback.print_exc()
        finally:
            input_queue.task_done()
def do_work(id, i):
    """Fetch the archive.org metadata record for ``id`` and print a summary.

    One line is printed: the index ``i``, the identifier, and the record's
    'server', 'dir' and 'item_size' fields (an empty string for any field
    that is missing).
    """
    metadata = json.loads(get_url("http://archive.org/metadata/" + id))
    # n.b.: metadata might be empty if the ID couldn't be fetched for
    # whatever reason, hence the '' defaults.
    fields = tuple(metadata.get(key, '')
                   for key in ('server', 'dir', 'item_size'))
    print("%s %s %s %s %s" % ((i, id) + fields))
# Start the worker greenlets.
# 20 seems like a reasonable number - don't go nuts!
for i in range(20):
    gevent.spawn(worker)

# Just do 1000 lines, starting with the first.  The ``with`` block closes
# ids.txt once queue_ids() has finished reading from it.
with open('ids.txt') as ids_file:
    queue_ids(ids_file, 0, 1000)

# Wait until every queued entry has been marked done by a worker.
input_queue.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment