Skip to content

Instantly share code, notes, and snippets.

Last active August 29, 2015 14:23
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
What would you like to do?
get hacker news corpus
# coding: utf-8
import requests
import os.path
import time
def multiple_tries(func, times, timeout):
    """Call ``func`` until it succeeds, retrying up to ``times`` attempts.

    Args:
        func: Zero-argument callable to run.
        times: Maximum number of attempts (must be at least 1).
        timeout: Seconds to sleep between a failed attempt and the next one.

    Returns:
        Whatever ``func()`` returns on the first successful attempt.

    Raises:
        ValueError: If ``times`` is less than 1 (nothing would be attempted).
        Exception: The exception raised by the final attempt, re-raised
            unchanged once every attempt has failed.
    """
    if times < 1:
        raise ValueError("times must be at least 1, got {}".format(times))

    for cnt in range(1, times + 1):
        try:
            return func()
        except Exception as e:
            if cnt == times:
                # All the attempts to run the function were unsuccessful.
                # Re-raise from inside the handler: on Python 3 the name
                # bound by ``except ... as`` is gone once the clause ends.
                raise
            print(e)
            print("Run {} failed! Sleeping for {} seconds.".format(cnt, timeout))
            time.sleep(timeout)
def get(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the response object.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block forever and
            the caller's retry logic would never get a chance to run.

    Returns:
        The ``requests`` response when the status code is 200 or 401.

    Raises:
        Exception: For any other status code, after printing the response
            body to help diagnose the failure.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code == 200:
        return r
    elif r.status_code == 401:
        return r  # maybe something else here?
    else:
        print(r.text)
        raise Exception("WTF, didn't get a 200, got a " + str(r.status_code))
# Download every item from 1 up to the current maximum id into data/<id>,
# skipping ids whose file already exists so an interrupted run can resume.
#
# NOTE(review): the URL literals were empty in the reviewed copy of this
# script (``get("")`` and ``"{}.json"``). The official Hacker News Firebase
# API endpoints are assumed from the script's stated purpose -- confirm.
API_ROOT = "https://hacker-news.firebaseio.com/v0/"
DATA_DIR = 'data'

start = 1
maxitem = int(get(API_ROOT + "maxitem.json").text)
print('starting...')

# open() below fails if the target directory is missing, so create it first.
if not os.path.isdir(DATA_DIR):
    os.makedirs(DATA_DIR)

# A while loop keeps the iteration lazy on both Python 2 and Python 3
# without building a multi-million-element list of ids.
item = start
while item <= maxitem:
    fname = 'data/{}'.format(str(item))
    if not os.path.isfile(fname):
        print('trying to get item {}'.format(item))
        # Bind the id as a default argument so the callable does not depend
        # on the loop variable keeping its current value.
        func = lambda item=item: get("{}item/{}.json".format(API_ROOT, item))
        r = multiple_tries(func, 10, 60)
        # Write to a temporary name and rename afterwards: a crash mid-write
        # must not leave a truncated file that later runs treat as complete.
        tmp_name = fname + '.tmp'
        with open(tmp_name, 'wb') as f:
            # Raw bytes avoid any implicit text encoding of the payload.
            f.write(r.content)
        os.rename(tmp_name, fname)
        print('got item {}... {} more to go...'.format(item, maxitem - item))
        # NOTE(review): the body of this block was missing in the reviewed
        # copy; recording the last fetched id as a checkpoint is assumed.
        with open('maxitem', 'w') as f:
            f.write(str(item))
    item += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment