Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Get the Hacker News corpus (one file per item, via the Firebase API)
# coding: utf-8
import requests
import os.path
import time
def multiple_tries(func, times, timeout):
    """Call ``func`` until it succeeds, making at most ``times`` attempts.

    Args:
        func: Zero-argument callable to invoke.
        times: Maximum number of attempts; must be at least 1.
        timeout: Seconds to sleep after a failed attempt before retrying.

    Returns:
        Whatever ``func`` returns on its first successful call.

    Raises:
        ValueError: If ``times`` is less than 1, so no attempt can be made.
        Exception: The exception raised by the last attempt when every
            attempt fails.
    """
    if times < 1:
        raise ValueError("times must be at least 1, got {!r}".format(times))

    attempt = 1
    while True:
        try:
            return func()
        except Exception as exc:
            if attempt == times:
                # All the attempts to run the function were unsuccessful.
                # Re-raise from inside the handler: the bound name does not
                # outlive the except block on Python 3, and a bare raise
                # keeps the original traceback.
                raise
            print(exc)
            print("Run {} failed! Sleeping for {} seconds.".format(attempt, timeout))
            time.sleep(timeout)
        attempt += 1
def get(url):
    """Fetch ``url`` with ``requests`` and return the response object.

    A response with status 200 or 401 is handed back to the caller as-is.
    For any other status the response body is printed and an ``Exception``
    naming the status code is raised. The request uses a 60-second timeout
    and sends no custom User-Agent header.
    """
    response = requests.get(url, timeout=60)
    status = response.status_code
    # A 401 is passed through just like a 200 -- maybe something else here?
    if status in (200, 401):
        return response
    print(response.text)
    raise Exception("WTF, didn't get a 200, got a " + str(status))
# Download every Hacker News item with an id from ``start`` up to the current
# maximum into ``data/<id>``. Ids that already have a file are skipped, which
# lets a rerun pick up where an earlier one stopped.
start = 1
maxitem = int(get("https://hacker-news.firebaseio.com/v0/maxitem.json").text)
print('starting...')

# Item files are written into data/, so make sure the directory exists first.
if not os.path.isdir('data'):
    os.makedirs('data')

# A plain counter loop behaves the same on Python 2 and 3 and never builds the
# whole id range in memory.
item = start
while item <= maxitem:
    fname = 'data/{}'.format(item)
    if not os.path.isfile(fname):
        print('trying to get item {}'.format(item))
        url = "https://hacker-news.firebaseio.com/v0/item/{}.json".format(item)
        func = lambda: get(url)
        r = multiple_tries(func, 10, 60)
        # Binary mode, because the payload is explicitly encoded to UTF-8 bytes.
        with open(fname, 'wb') as f:
            f.write(r.text.encode('utf-8'))
        print('got item {}... {} more to go...'.format(item, maxitem - item))
    item += 1

# Record the maximum item id this run was working towards.
with open('maxitem', 'w') as f:
    f.write(str(maxitem))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment