Created
October 10, 2017 08:57
-
-
Save dufferzafar/a6e7c84dab16bc6ecf1498dde6dfcd66 to your computer and use it in GitHub Desktop.
Calculate size of GitHub Archive data for a date range
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Find out the total download size of the hourly GitHub Archive dumps
(gzip-compressed JSON files) in a particular date range.
""" | |
import sys
from datetime import timedelta, datetime

import requests
# URL template for one hourly dump; the "%s" placeholder is filled with a
# "YYYY-MM-DD-H" timestamp by the driver loop at the bottom of the file.
# NOTE(review): this is plain HTTP and the project's current documentation
# appears to list a different host (data.gharchive.org) -- confirm that this
# host still serves or redirects before relying on the reported sizes.
URL = "http://data.githubarchive.org/%s.json.gz"
def hour_range(start, end):
    """Yield one datetime per hour in the half-open range [start, end).

    The first value is ``start`` itself and each following value is one
    hour later; iteration stops before reaching ``end``.  If ``end`` is not
    a whole number of hours after ``start`` the trailing partial hour is
    still included, and nothing is yielded when ``end <= start``.

    Args:
        start: First timestamp to yield (a ``datetime``).
        end: Exclusive upper bound (a ``datetime``).

    Yields:
        ``start + k hours`` for k = 0, 1, 2, ... while the value is < end.
    """
    one_hour = timedelta(hours=1)
    # Ceiling division on the span: floor-dividing the negated span and
    # negating the result rounds up, so a partial last hour is not dropped.
    # A non-positive span yields a count <= 0 and therefore an empty range.
    hour_count = -((start - end) // one_hour)
    for offset in range(hour_count):
        yield start + offset * one_hour
def human_bytes(num, suffix='B'):
    """Format a byte count as a short human-readable string.

    The value is divided by 1024 until its magnitude drops below 1024 and
    is then rendered with one decimal place, the matching binary prefix
    and ``suffix`` (for example ``1536`` -> ``"1.5KB"``).

    Args:
        num: The quantity to format; negative values keep their sign.
        suffix: Unit text appended after the prefix letter.

    Returns:
        The formatted string; values too large for the 'Z' prefix are
        reported with 'Y'.
    """
    scaled = num
    for prefix in ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'):
        if abs(scaled) < 1024.0:
            return "%3.1f%s%s" % (scaled, prefix, suffix)
        scaled = scaled / 1024.0
    # Ran out of prefixes: whatever is left is expressed with 'Y'.
    return "%.1f%s%s" % (scaled, 'Y', suffix)
# Date range to measure: `start` is inclusive, `end` is exclusive.
start = datetime(2017, 1, 1)
end = datetime(2017, 10, 1)

# Running sum of the Content-Length values, in bytes.
total = 0

# One session for the whole run so the connection can be reused between
# the many HEAD requests.
with requests.Session() as session:
    for hour in hour_range(start, end):
        # `hour.hour` gives the un-padded hour without relying on the
        # platform-specific "%-H" strftime flag.
        url = URL % ("%s-%d" % (hour.strftime("%Y-%m-%d"), hour.hour))
        try:
            # HEAD requests do not follow redirects unless asked to; without
            # this the length of a redirect response would be counted.
            response = session.head(url, allow_redirects=True, timeout=30)
            response.raise_for_status()
            size = int(response.headers["Content-Length"])
        except (requests.RequestException, KeyError, ValueError) as err:
            # Report on stderr so the CSV written to stdout stays clean,
            # then carry on with the remaining hours.
            sys.stderr.write("skipping %s: %r\n" % (url, err))
            continue
        total += size
        print("%s,%s" % (human_bytes(size), url))

print(human_bytes(total))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
python ghsize.py > GHA_2017.txt