Skip to content

Instantly share code, notes, and snippets.

@antoine-lizee
Last active November 16, 2021 15:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save antoine-lizee/20ce98b912a2503d23aa4ad008278d6f to your computer and use it in GitHub Desktop.
Save antoine-lizee/20ce98b912a2503d23aa4ad008278d6f to your computer and use it in GitHub Desktop.
Fetch issues & comments from Topics
import json
import time
import requests
# --- GitHub REST API configuration -------------------------------------------
# Endpoints of the alan-eu/Topics repository queried by this script.
# `rate_limit_url` is declared here but not referenced by the code below.
rate_limit_url = "https://api.github.com/rate_limit"
issues_url = "https://api.github.com/repos/alan-eu/Topics/issues"
comments_url = "https://api.github.com/repos/alan-eu/Topics/issues/comments"

# Headers sent with every request; replace <your_token> with a personal token.
headers = dict([
    ("Authorization", "token <your_token>"),
    ("Accept", "application/vnd.github.v3+json"),
])

# Default query-string parameters shared by the listing requests:
# every state, 100 objects per page, ordered by update time, newest first.
params = dict(
    direction="desc",
    per_page=100,
    state="all",
    sort="updated",
)
def get_resource(url, params, max_page=200, recover=False, n_max_retries=3, since=None):
    """Fetch the pages of a paginated GitHub listing and return all objects.

    Requests are sent with the module-level ``headers``. Progress is printed
    to stdout after every request.

    Args:
        url: Listing endpoint to query.
        params: Base query parameters. The dict is copied, so the caller's
            object is never modified.
        max_page: Upper bound of the internal page counter. The loop runs
            while the counter is strictly below this value, so at most
            ``max_page - 1`` pages are fetched or skipped.
        recover: When true, results are requested in ascending order and a
            422 response (pagination exhausted) restarts the pagination from
            the last seen ``updated_at`` timestamp via the ``since`` parameter.
        n_max_retries: Number of retries of a page after a 5xx response
            before that page is skipped.
        since: Optional lower-bound timestamp passed as the ``since``
            query parameter.

    Returns:
        The list of decoded JSON objects collected so far. It may be partial
        when the function bails out (over-paging, unexpected HTTP status,
        skipped pages).

    Raises:
        KeyError: If the last object of a page has no ``updated_at`` field.
    """
    # Private copy: 'since', 'direction' and 'page' must not leak into the
    # caller's dict, which is reused for subsequent calls.
    params = dict(params)
    page = 1
    delta_page = 0
    n_retries = 0
    all_objects = []
    last_timestamp = None  # 'updated_at' of the last object received so far
    if since:
        params['since'] = since
    if recover:
        params['direction'] = 'asc'
    while page < max_page:
        t0 = time.time()
        # After a recovery the API page restarts at 1 while `page` keeps counting.
        params['page'] = page - delta_page
        resp = requests.get(url, headers=headers, params=params)
        print("Request %i, fetching %d objects in %.3f sec: " % (page, params['per_page'], time.time() - t0), end='')
        if 'since' in params:
            print('[since: %s page: %i] ' % (params['since'], params['page']), end='')
        if 500 <= resp.status_code < 600:  # Server error, retry
            print('ERROR: %s' % resp.text)
            if n_retries >= n_max_retries:
                # Retry budget exhausted for this page: skip it (best effort).
                n_retries = 0
                page += 1
            else:
                n_retries += 1
            continue
        # The server answered: the retry budget applies per page, so reset it.
        n_retries = 0
        if resp.status_code == 422:  # Maxing out the pagination
            # Recovery only makes sense when a timestamp is known and it
            # differs from the current window start; otherwise the same
            # failing request would be repeated forever.
            can_recover = (
                recover
                and last_timestamp is not None
                and params.get('since') != last_timestamp
            )
            if can_recover:
                print('Attempting to recover')
                # Next request must be page 1 of the new window (pages are 1-based).
                delta_page = page - 1
                params['since'] = last_timestamp
                # NOTE(review): objects sharing the boundary timestamp may be
                # returned again after a recovery -- confirm `since` semantics
                # and de-duplicate downstream if needed.
                continue
            print('Over-paged, bailing-out with PARTIAL data')
            break
        if resp.status_code != 200:
            # E.g. bad credentials or rate limiting: the body is an error
            # document, not a list of objects, so it must not be accumulated.
            print('ERROR (HTTP %i): %s' % (resp.status_code, resp.text))
            print('Unexpected response, bailing-out with PARTIAL data')
            break
        objects = resp.json()
        if not objects:  # End of the loop
            print("No more results, stopping")
            break
        page += 1
        all_objects.extend(objects)
        last_timestamp = all_objects[-1]['updated_at']
        print("last timestamp %s" % last_timestamp)
    return all_objects
def write_json(objects, object_name):
    """Dump ``objects`` to a JSON file in the current working directory.

    The file is named
    ``{object_name}_from_rest_{tmin}_{tmax}_{n_objects}.json`` where ``tmin``
    and ``tmax`` are the smallest and largest ``updated_at`` values found in
    ``objects`` and ``n_objects`` is their count.

    Args:
        objects: List of dicts, each carrying an ``updated_at`` key.
        object_name: Label used as the file-name prefix and in the messages.

    Returns:
        None. When ``objects`` is empty nothing is written, because no
        timestamp range exists to build the file name from.

    Raises:
        KeyError: If an object has no ``updated_at`` key.
    """
    n_objects = len(objects)
    if n_objects == 0:
        # min()/max() would raise ValueError on an empty sequence.
        print(f"No elements to write for {object_name}, skipping json file.")
        return
    timestamps = [obj['updated_at'] for obj in objects]
    tmin = min(timestamps)
    tmax = max(timestamps)
    filename = f"{object_name}_from_rest_{tmin}_{tmax}_{n_objects}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(objects, f)
    print(f"{n_objects} elements written in json file for {object_name}.")
## Run the download ----
# Lower bound used only by the incremental ("since") variants shown in the
# commented alternatives below.
since_time = '2021-09-15T00:00:00Z'

# Issues: plain pagination with the page counter bounded by 200.
all_issues = get_resource(issues_url, params, max_page=200)
# Since specific time: all_issues = get_resource(issues_url, params, 20, since=since_time)
write_json(objects=all_issues, object_name='issues')

# Comments: page counter bounded by 2000, with recovery enabled so that
# hitting the pagination limit restarts from the last seen timestamp.
all_comments = get_resource(comments_url, params, max_page=2000, recover=True)
# Since specific time: all_comments = get_resource(comments_url, params, 100, since=since_time)
write_json(objects=all_comments, object_name='comments')
@LeonardoNatale
Copy link

LeonardoNatale commented Nov 16, 2021

I don't know if it's useful at this point, but I think you forgot to reset n_retries = 0 around line 58, if I understand the intended behaviour correctly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment