Skip to content

Instantly share code, notes, and snippets.

Last active February 28, 2022 15:10
What would you like to do?
extract repositories and code from CodeSearchNet
# CodeSearchNet
#!/usr/bin/env python
# extract repositories and code from CodeSearchNet
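# cf. <reference link missing from this copy>
# Usage (the download URL, archive name and script name were lost from this
# copy; substitute the real values for the <...> placeholders):
# wget "<URL of the CodeSearchNet Python archive>"
# unzip <downloaded archive>
# gzip -d python_*.jsonl.gz
# python <this_script> --repos > repos.txt
# python <this_script> '--extract=0k/shyaml;zyga/padme' > code.txt
#   (quote the argument: an unquoted ';' would end the shell command)
# python <this_script> --extract | head -n 100000 > code.txt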
import json
import glob
import sys
# Flat script: scans sys.argv for flags, then walks every 'python_*.jsonl'
# file in the current directory, parsing each line as a JSON document.
#
# NOTE(review): this copy has lost its indentation and appears to be missing
# several statements (each gap is flagged below). It will not run as-is;
# restore the missing lines from the original before use.

# True when the literal flag '--repos' appears anywhere on the command line.
list_repos_mode = '--repos' in sys.argv
# List of repository names from '--extract=a;b;...'; None when not given.
extract_code = None
# True when a bare '--extract' (no '=' and no names) is given.
extract_all = False
for arg in sys.argv:
if arg.startswith('--extract='):
# Skip the 10-character '--extract=' prefix; the remainder is split on ';'.
extract_code = arg[10:].split(';')
elif arg == '--extract':
extract_all = True
# NOTE(review): 'interesting' is assigned but never read in the visible code.
interesting = ['0k/shyaml']
# Iterated in sorted order at the end when '--repos' is given.
# NOTE(review): no visible statement ever adds to this set, so as shown it
# stays empty — presumably a line adding js['repo'] was lost; confirm.
repos = set()
for fname in glob.glob('python_*.jsonl'):
# The whole file is read into memory and split into lines.
# NOTE(review): the file object returned by open() is never explicitly closed.
for line in open(fname, 'r').read().splitlines():
js = json.loads(line)
# A record is selected when extracting everything, or when its 'repo' value
# is one of the names passed via '--extract='.
if extract_all or (extract_code and js['repo'] in extract_code):
# NOTE(review): the body of the 'if' above is missing from this copy —
# presumably it emitted the selected record; confirm against the original.
if list_repos_mode:
for repo in sorted(repos):
# NOTE(review): the body of this loop is missing from this copy — presumably
# it printed each repository name; confirm against the original.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment