Gist: download GitHub data — fetch GH Archive hourly event dumps and store them locally as parquet files.
import datetime
import gzip
import json
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import requests
from joblib import Parallel, delayed
def no_unicode(df):
    """Coerce problem columns to ``str`` so the frame can be written to parquet.

    Parquet cannot store arbitrary Python objects; any column whose inferred
    dtype is 'unicode' (a Python-2 artifact) or 'mixed' is cast to string.
    Mutates and returns *df*.
    """
    inferred = df.apply(lambda col: pd.api.types.infer_dtype(col.values))
    if len(inferred) > 0:
        # 'unicode' only ever appears under Python 2; 'mixed' on either version.
        needs_cast = inferred.isin(['unicode', 'mixed'])
        for name in inferred.index[needs_cast]:
            df[name] = df[name].astype(str)
    return df
def get_hours(last_date):
    """Return the number of whole hours between *last_date* and now.

    This is the number of hourly archive files that still need downloading.

    Parameters
    ----------
    last_date : datetime.datetime
        Timestamp of the most recent file already on disk (naive local time).
    """
    # BUG FIX: the original read ``diff = - last_date`` — the minuend was
    # lost; the docstring makes clear it must be the elapsed time until now.
    diff = datetime.datetime.now() - last_date
    days, seconds = diff.days, diff.seconds
    hours = days * 24 + seconds // 3600
    return hours
def get_data(i, last_date=datetime.datetime(2017, 1, 1, 1)):
    """Download one hourly GH Archive dump and store it as a parquet file.

    Parameters
    ----------
    i : int
        Hour offset from *last_date*; the hour fetched is ``last_date + i`` hours.
    last_date : datetime.datetime
        Timestamp of the most recent file already on disk.

    Side effects: writes ``parquet/<datestring>.parquet`` and removes the
    temporary ``.json.gz`` download afterwards.
    """
    date = last_date + datetime.timedelta(hours=i)
    # GH Archive pads month/day but NOT the hour (e.g. 2017-01-02-1.json.gz);
    # the original line had an empty ``{}`` where the day belongs.
    datestring = f'{date.year}-{date.month:02}-{date.day:02}-{date.hour}'
    # NOTE(review): host reconstructed — the archive lives at data.gharchive.org
    # (formerly data.githubarchive.org); confirm against the original gist.
    url = f'https://data.gharchive.org/{datestring}.json.gz'
    r = requests.get(url)
    r.raise_for_status()  # fail loudly on a missing hour rather than writing HTML
    # write request to disk
    filename = f'{datestring}.json.gz'
    with open(filename, 'wb') as f:
        f.write(r.content)
    # parse compressed file into json (one JSON event per line)
    lines = []
    with gzip.open(filename, 'rb') as gz:
        for line in gz:
            lines.append(json.loads(line))
    # store as parquet dataframe
    df = pd.DataFrame(lines)[['id', 'actor', 'created_at', 'repo', 'type']]
    df = no_unicode(df)
    df = df.set_index('id')
    df.to_parquet('parquet/%s.parquet' % filename.split('.json')[0])
    # cleanup: the parquet copy supersedes the raw download
    os.remove(filename)
def update_data():
    """Download every hourly archive newer than the latest local parquet file."""
    stamps = (name.split('.')[0] for name in os.listdir('parquet'))
    seen = [datetime.datetime.strptime(stamp, '%Y-%m-%d-%H') for stamp in stamps]
    newest = max(seen)
    # waiting sucks, let's try and speed some stuff up: fan the per-hour
    # downloads out across ten workers.
    jobs = (delayed(get_data)(hour, newest) for hour in range(get_hours(newest)))
    Parallel(n_jobs=10)(jobs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.