Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
import ast

import dask.dataframe as dd
import numpy as np
from distributed import Client, LocalCluster
# Start a local cluster of 32 workers with one thread each, a memory_limit of
# 2e9 (bytes, per the distributed docs) and the diagnostics page on port 8787,
# then connect a client so subsequent dask work is scheduled on it.
cluster = LocalCluster(
    ip='',
    n_workers=32,
    threads_per_worker=1,
    diagnostics_port=8787,
    memory_limit=2e9,
)
client = Client(cluster)

# read_parquet is lazy; len(df) forces a pass over the dataset to count rows.
df = dd.read_parquet('parquet/')
print(f'found {len(df)} interactions')

# 'actor' and 'repo' are parsed as Python-literal dict strings (assumed from
# the literal_eval(...).get(...) usage — confirm against the parquet schema).
# Both columns fall back to the same 'unknown' sentinel when the key is
# missing, so downstream filtering can treat them uniformly.
df['user_id'] = df['actor'].apply(
    lambda raw: ast.literal_eval(raw).get('login', 'unknown'),
    meta=('x', 'U'),
)
df['repo_id'] = df['repo'].apply(
    lambda raw: ast.literal_eval(raw).get('name', 'unknown'),
    meta=('x', 'U'),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment