Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save JasperVanDenBosch/ee8acd25579e7d867e1f5e42cbcc2632 to your computer and use it in GitHub Desktop.
Save JasperVanDenBosch/ee8acd25579e7d867e1f5e42cbcc2632 to your computer and use it in GitHub Desktop.
Find openneuro datasets that have fmriprep results
"""Find openneuro datasets that have an fmriprep derivative
This is a fairly brute-force method: it takes about one hour and 8 GB of disk space.

Requirements:
    datalad
    pandas
    requests
"""
from time import sleep
from tempfile import TemporaryDirectory
from os.path import join, isdir
import json
import datalad.api as datalad
import requests, pandas
# --- Step 1: list every repository of the OpenNeuroDatasets GitHub org ---
# Credentials sent as HTTP basic auth with each request; fill in before running.
github_username = '<YOUR USERNAME>'
github_token = '<YOUR TOKEN>'
# Upper bound on the number of result pages requested before giving up.
max_pages = 50
# Ask for the largest page size the GitHub API allows, so fewer requests are
# needed and the page limit is reached much later than with the default size.
per_page = 100
base_url = 'https://api.github.com/orgs/OpenNeuroDatasets/repos'
ids = []
print('Querying github', end='')
for p in range(1, max_pages+1):
    print('.', end='', flush=True)
    response = requests.get(
        base_url,
        params={'page': p, 'per_page': per_page},
        auth=(github_username, github_token),
        timeout=60,  # do not hang forever on a stalled connection
    )
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if response.status_code != 200:
        raise RuntimeError(
            f'GitHub API returned HTTP {response.status_code} for page {p}'
        )
    repos = response.json()
    if not repos:
        # An empty page means every repository has already been listed.
        break
    ids.extend(repo['name'] for repo in repos)
    sleep(0.2) ## be kind to GitHub
else:
    # The loop ran out of pages without ever seeing an empty one.
    raise ValueError(f'More than {max_pages} pages.')
ids.sort()
# `p` is the first empty page, so `p-1` pages actually held repositories.
print(f'\nFound {len(ids)} repos in {p-1} pages')
# --- Step 2: clone each dataset and look for a derivatives/fmriprep folder ---
fmriprep_datasets = []  # one dict (did, name, desc) per dataset with fmriprep
cloning_errors = []     # ids of the datasets that could not be cloned
# All clones live in a throwaway directory; failures while deleting it at the
# end are ignored rather than raised.
with TemporaryDirectory(ignore_cleanup_errors=True) as tmp_dir:
    for r, repo_id in enumerate(ids, start=1):
        repo_dir = join(tmp_dir, repo_id)
        try:
            datalad.clone(f'///openneuro/{repo_id}', repo_dir)
        except Exception:
            # `Exception` rather than a bare `except`, so that Ctrl-C
            # (KeyboardInterrupt) still aborts this long-running loop.
            cloning_errors.append(repo_id)
            continue
        if isdir(join(repo_dir, 'derivatives', 'fmriprep')):
            # A missing or unreadable README must not abort the whole run;
            # fall back to an empty description.
            readme = ''
            for readme_name in ('README', 'README.md', 'README.txt', 'README.rst'):
                try:
                    with open(join(repo_dir, readme_name),
                              encoding='utf-8', errors='replace') as fhandle:
                        readme = fhandle.read()
                except OSError:
                    continue
                break
            # Same for the metadata file: missing or malformed JSON yields
            # an empty mapping instead of a crash.
            try:
                with open(join(repo_dir, 'dataset_description.json'),
                          encoding='utf-8') as fhandle:
                    metadata = json.load(fhandle)
            except (OSError, ValueError):
                metadata = {}
            if not isinstance(metadata, dict):
                metadata = {}
            fmriprep_datasets.append(dict(
                did=repo_id,
                # BIDS spells the field "Name"; the lower-case key is kept
                # as a fallback for files that use it.
                name=metadata.get('Name', metadata.get('name', 'NO_NAME')),
                desc=readme
            ))
        print(f'{r}/{len(ids)}')
        sleep(0.2) ## be kind to openneuro.org
print(f'Found {len(fmriprep_datasets)} datasets with fmriprep')
print(f'While encountering {len(cloning_errors)} errors while cloning')
# Fixed columns so the CSV has a header even when nothing was found.
df = pandas.DataFrame(fmriprep_datasets, columns=['did', 'name', 'desc'])
df.to_csv('fmriprep_datasets.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment