Skip to content

Instantly share code, notes, and snippets.

@jennynz
Created July 6, 2022 04:51
Show Gist options
  • Save jennynz/6b7e31c7a140ed4c33b1b5f395b8e635 to your computer and use it in GitHub Desktop.
Save jennynz/6b7e31c7a140ed4c33b1b5f395b8e635 to your computer and use it in GitHub Desktop.
Python script for getting GitHub data (e.g. PRs, comments, reviews) about a public repository via their GraphQL API, with pagination & rate limit handling
import json
import os
import time
from datetime import datetime, timedelta

import boto3
import requests
# Example: getting data for vuejs/vue from the last 30 days
ORG_NAME = "vuejs"
REPO_NAME = "vue"
# Date window for PRs to collect (naive local timestamps; the GitHub
# "created_at" values they are compared against are UTC — TODO confirm the
# offset is acceptable for your use case).
CREATED_DATE_START = datetime.now() - timedelta(days=30)
CREATED_DATE_END = datetime.now()
BUCKET_NAME = "my-cool-s3-bucket-query-results" # Add your output S3 bucket here
PERSONAL_ACCESS_TOKEN = "" # Add your token from GitHub here
# Headers shared by both the REST and GraphQL requests: bearer-token auth
# plus the "machine-man" preview media type.
HEADERS = {
"Authorization": f"bearer {PERSONAL_ACCESS_TOKEN}",
"Accept": "application/vnd.github.machine-man-preview+json",
}
# The query types to run for each PR. Each entry names a GraphQL template file
# on disk ("query_file") and the response field under
# data.repository.pullRequest whose pageInfo drives pagination
# ("pagination_key").
# NOTE(review): the review-comments entry also paginates on "reviews" —
# presumably comments are nested under their parent reviews; confirm against
# the .graphql template.
QUERIES = [
    {"name": "pull-requests", "query_file": "pull-requests.graphql", "pagination_key": "comments"},
    {"name": "pull-requests-reviews", "query_file": "pull-requests-reviews.graphql", "pagination_key": "reviews"},
    {"name": "pull-requests-review-comments", "query_file": "pull-requests-review-comments.graphql", "pagination_key": "reviews"},
]
# ---------------------------------------------------------------------------
# Step 1: collect the numbers of the PRs created inside the desired date
# range, via the paginated REST "list pull requests" endpoint. Results are
# requested newest-first so we can stop as soon as a page reaches back past
# CREATED_DATE_START.
# https://docs.github.com/en/rest/reference/pulls#list-pull-requests
# ---------------------------------------------------------------------------
all_prs_retrieved = False
page_num = 1  # GitHub REST pagination is 1-indexed; page=0 duplicates page=1
n_items_per_page = 100
pr_numbers = []
while not all_prs_retrieved:
    response = requests.get(
        f"https://api.github.com/repos/{ORG_NAME}/{REPO_NAME}/pulls",
        params={
            "sort": "created",
            "direction": "desc",
            "per_page": n_items_per_page,
            "page": page_num,
            "state": "all",
        },
        headers=HEADERS,
    )
    data = response.json()
    # An error response is a JSON object (dict), not a list; an empty list
    # means there are no more pages. Either way, stop instead of crashing on
    # data[-1].
    if not isinstance(data, list) or not data:
        break
    # Pages are ordered newest-first, so once the *last* PR on this page was
    # created before the window start, every subsequent page is out of range.
    created_at = datetime.strptime(data[-1]["created_at"], "%Y-%m-%dT%H:%M:%SZ")
    all_prs_retrieved = (
        created_at < CREATED_DATE_START or len(data) < n_items_per_page
    )
    for pr in data:
        # Keep only PRs actually inside the requested window — the boundary
        # page can straddle CREATED_DATE_START.
        pr_created = datetime.strptime(pr["created_at"], "%Y-%m-%dT%H:%M:%SZ")
        if CREATED_DATE_START <= pr_created <= CREATED_DATE_END:
            pr_numbers.append(pr["number"])
    page_num += 1
n_pr_numbers = len(pr_numbers)
# ---------------------------------------------------------------------------
# Step 2: for each query type and each PR, run the GraphQL query (following
# cursor-based pagination) and write every page of results to S3 as JSON.
# ---------------------------------------------------------------------------
s3_client = boto3.client("s3")
for query in QUERIES:
    print(f"Running {query['name']} query")
    output_dir = os.path.join(ORG_NAME, REPO_NAME, query["name"])
    # The query template is invariant per query type — read it once instead of
    # once per PR per page.
    with open(query["query_file"], "r") as q_file:
        query_string = q_file.read()
    for i, pr_number in enumerate(pr_numbers):
        print(f"PR number {pr_number} ({i + 1} of {n_pr_numbers} PRs)")
        next_cursor = "null"  # literal GraphQL null for the first page
        items_left_to_paginate = True
        page_num = 1
        while items_left_to_paginate:
            # Populate the template with variables for this PR / cursor.
            formatted_query_string = query_string % (
                ORG_NAME,
                REPO_NAME,
                pr_number,
                next_cursor,
            )
            response = requests.post(
                "https://api.github.com/graphql",
                json={"query": formatted_query_string},
                headers=HEADERS,
            )
            results = response.json()
            data = results.get("data")  # may be None if the query errored
            # --- Rate-limit handling ---------------------------------------
            # GraphQL "errors" is a list of error objects, each of which may
            # carry a "type" field.
            rate_limited = any(
                "api limit" in str(err.get("type", "")).lower()
                for err in results.get("errors", [])
            )
            running_low = (
                data is not None and data["rateLimit"]["remaining"] <= 100
            )
            if rate_limited or running_low:
                if data is not None and "rateLimit" in data:
                    # resetAt is a UTC timestamp; add a safety margin.
                    rate_limit_reset = datetime.strptime(
                        data["rateLimit"]["resetAt"], "%Y-%m-%dT%H:%M:%SZ"
                    ) + timedelta(minutes=5)
                    sleep_seconds = max(
                        (rate_limit_reset - datetime.utcnow()).total_seconds(), 0
                    )
                else:
                    sleep_seconds = 60  # no reset info in the response; back off
                print(f"Rate limit reached, sleeping {sleep_seconds:.0f}s")
                time.sleep(sleep_seconds)
                if rate_limited:
                    # The errored response has no usable data — retry this page.
                    continue
            # Write this page of query results to S3.
            bucket_key = f"{output_dir}/PR{pr_number}_page{page_num}.json"
            s3_client.put_object(
                Body=json.dumps(data), Bucket=BUCKET_NAME, Key=bucket_key
            )
            # Advance the pagination cursor, or stop when no pages remain.
            page_info = data["repository"]["pullRequest"][
                query["pagination_key"]
            ]["pageInfo"]
            if page_info["hasNextPage"]:
                # NOTE(review): the raw endCursor is substituted into the
                # template unquoted, same as the original — confirm the
                # .graphql template quotes the %s placeholder itself.
                next_cursor = page_info["endCursor"]
                page_num += 1
            else:
                items_left_to_paginate = False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment