@indradhanush
Last active December 30, 2022 03:55
A Python script to fetch all public and user-affiliated repos with visibility: all

README

Requirements

  • python3.6
  • python-requests
  • GitHub Public Access Token

What this does

  • This will fetch all the affiliated repos and the public repos, 100 items per request, until there are no more.
  • It sleeps for 1 second between every 10 requests.
  • It reports progress on the terminal every 100 requests.
  • It writes the repo item received in the response to a file named faulty-repos under the current working directory if the script encounters anything unusual about the id or the full_name of the repository (a condensed sketch of these checks follows this list).
  • It may also write some debug logs to a file named debug-github-repos.json under the current working directory if there is an unexpected response from the API.
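
For reference, the faulty-repos detection boils down to three checks per item, plus a simple "full page" heuristic to decide whether to keep paginating. The helper names below (looks_faulty, has_next_page) are illustrative only and do not appear in the script:

def looks_faulty(item: dict) -> bool:
    # Check 1: id and full_name must both be present.
    try:
        id = item["id"]
        full_name = item["full_name"]
    except KeyError:
        return True
    # Check 2: neither may be an empty string (after stripping whitespace).
    if str(id).strip() == "" or full_name.strip() == "":
        return True
    # Check 3: each must contain at least one alpha-numeric character.
    return not any(c.isalnum() for c in str(id)) or not any(c.isalnum() for c in full_name)

def has_next_page(data: list) -> bool:
    # A full page of 100 items means there is probably another page to fetch.
    return len(data) == 100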

Running the script

usage: github_fetch_repos.py [-h] [--affiliated [AFFILIATED]]
                             [--public [PUBLIC]]
                             [--affiliated-page AFFILIATED_PAGE]
                             [--public-since PUBLIC_SINCE]
                             url token

positional arguments:
  url                   URL of the API, example: https://api.github.com/v3/
  token                 Secret token

options:
  -h, --help            show this help message and exit
  --affiliated [AFFILIATED]
                        Optionally fetch affiliated repos
  --public [PUBLIC]     Optionally fetch public repos
  --affiliated-page AFFILIATED_PAGE
                        From which page (100 per page) should we start
                        fetching affiliated repos?
  --public-since PUBLIC_SINCE
                        Since which repo should we start fetching?

For example, to fetch both public and affiliated repos:

python3 github_fetch_repos.py https://api.github.com/v3/ super-secret-token

But to fetch only public repos:

python3 github_fetch_repos.py https://api.github.com/v3/ super-secret-token --public=true --public-since=100

This fetches only public repos and starts from repo ID 100 instead of the beginning.

And to fetch only affiliated repos:

python3 github_fetch_repos.py https://api.github.com/v3/ super-secret-token --affiliated=true --affiliated-page=2

This fetches only affiliated repos and starts from page 2 instead of the beginning.
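
Before kicking off a long run, it can help to confirm that the URL and token work with a single request. The sketch below uses the same Authorization header the script builds; the URL and token are placeholders:

import requests

url = "https://api.github.com/v3/"   # same positional url argument as the script
token = "super-secret-token"         # placeholder

res = requests.get(
    url.rstrip("/") + "/user/repos?per_page=1",
    headers={"Authorization": "Token " + token},
)
print(res.status_code)  # expect 200 if the URL and token are valid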

#!/usr/bin/env python3

import argparse
import json
import time
from os import path

import requests


def get_repos(path: str, headers: dict) -> (bool, int):
    res = requests.get(path, headers=headers)
    if res.status_code != 200:
        raise Exception(f"""Request failed {res.status_code}, {res.content}""")

    data = res.json()
    for item in data:
        if item is None:
            write_debug_logs(res)
            break

        try:
            id = item["id"]
            full_name = item["full_name"]
        except KeyError:
            write_item_to_file(f"Either id or full_name does not exist in response: {item}")
            continue

        # Strip any potential whitespaces before checking for empty string.
        if str(id).strip() == "" or full_name.strip() == "":
            write_item_to_file(f"Either id or full_name is an empty string in response: {item}")
            continue

        # Make sure at least one character in either id or full_name is alpha
        # numeric. It probably is, but we want to catch any weirdness here.
        if not at_least_one_char_isalnum(str(id)) or not at_least_one_char_isalnum(full_name):
            write_item_to_file(f"Either id or full_name does not have any alpha-numeric chars {item}")

    nextSince = 0
    if len(data) > 0:
        nextSince = data[-1]["id"]

    # returning hasNextPage, nextSince
    return len(data) == 100, nextSince


def get_public_repos(url: str, headers: dict, since: int):
    print("Getting public repos")
    counter = 1
    while True:
        # Sleep for 1 second between every 10 requests to not abuse the API too fast.
        if counter % 10 == 0:
            time.sleep(1)

        # Print progress every 10000 repos (100 requests)
        if counter % 100 == 0:
            print(f"Total repos processed {counter*100}, now getting repos since ID {since}...")

        endpoint = path.join(url, f"repositories?per_page=100&since={since}")
        hasNextPage, nextSince = get_repos(endpoint, headers)
        if not hasNextPage:
            break

        since = nextSince
        counter += 1


def get_affiliated_repos(url: str, headers: dict, page: int):
    print("Getting affiliated repos for user")
    counter = 1
    while True:
        # Sleep for 1 second between every 10 requests to not abuse the API too fast.
        if counter % 10 == 0:
            time.sleep(1)

        # Print progress every 10000 repos (100 requests)
        if counter % 100 == 0:
            print(f"Total repos processed {counter*100}, now getting repos from page {page}...")

        endpoint = path.join(
            url,
            f"user/repos?sort=created&visibility=all&page={page}&per_page=100",
        )
        hasNextPage, nextSince = get_repos(endpoint, headers)
        if not hasNextPage:
            break

        page += 1
        counter += 1


def write_debug_logs(response: requests.Response):
    # Dump the interesting parts of an unexpected response for later inspection.
    data = {
        "status_code": response.status_code,
        "headers": dict(response.headers),
        "url": response.url,
        "content": str(response.content),
        "response_json": response.json(),
    }
    with open("debug-github-repos.json", "a") as f:
        json.dump(data, f)


def write_item_to_file(item: str):
    # Append one line per faulty repo item.
    with open("faulty-repos", "a") as f:
        f.write(item)
        f.write("\n")


def at_least_one_char_isalnum(s: str):
    for c in s:
        if c.isalnum():
            return True
    return False


def main(args: argparse.Namespace):
    headers = {
        "Accept": "application/vnd.github.jean-grey-preview+json,application/vnd.github.mercy-preview+json,application/vnd.github.machine-man-preview+json",
        "Authorization": "Token " + args.token,
    }

    # If neither of --affiliated and --public is set, fetch both.
    if args.affiliated is None and args.public is None:
        args.affiliated = True
        args.public = True

    page = 0
    if args.affiliated_page is not None and len(args.affiliated_page) == 1:
        page = args.affiliated_page[0]
    if args.affiliated:
        get_affiliated_repos(args.url, headers, page)

    since = 0
    if args.public_since is not None and len(args.public_since) == 1:
        since = args.public_since[0]
    if args.public:
        get_public_repos(args.url, headers, since)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("url", type=str, help="URL of the API, example: https://api.github.com/v3/")
    parser.add_argument("token", type=str, help="Secret token")
    parser.add_argument("--affiliated", type=bool, nargs="?", help="Optionally fetch affiliated repos")
    parser.add_argument("--public", type=bool, nargs="?", help="Optionally fetch public repos")
    parser.add_argument("--affiliated-page", type=int, nargs=1, help="From which page (100 per page) should we start fetching affiliated repos?")
    parser.add_argument("--public-since", type=int, nargs=1, help="Since which repo should we start fetching?")

    args = parser.parse_args()
    # print(args)
    main(args)
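
The functions can also be used programmatically, assuming the file is saved as github_fetch_repos.py as in the usage examples above; argument parsing only runs when the file is executed directly. A minimal sketch with a placeholder token:

from github_fetch_repos import get_affiliated_repos, get_public_repos

headers = {"Authorization": "Token " + "super-secret-token"}  # placeholder token
get_affiliated_repos("https://api.github.com/v3/", headers, page=0)
get_public_repos("https://api.github.com/v3/", headers, since=0)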