Skip to content

Instantly share code, notes, and snippets.

@ArtemGr
Last active February 28, 2022 15:10
Show Gist options
  • Save ArtemGr/429e00605d0487615ae899d7bf5e9420 to your computer and use it in GitHub Desktop.
Save ArtemGr/429e00605d0487615ae899d7bf5e9420 to your computer and use it in GitHub Desktop.
extract repositories and code from CodeSearchNet
# CodeSearchNet
/python*.jsonl
/python*.jsonl.gz
/python.zip
/code.txt
/repos.txt
#!/usr/bin/env python
# extract repositories and code from CodeSearchNet
# cf. https://github.com/github/CodeSearchNet
# wget "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip"
# unzip
# gzip -d python_*.jsonl.gz
# python unpack.py --repos > repos.txt
# python unpack.py "--extract=0k/shyaml;zyga/padme" > code.txt
# python unpack.py --extract | head -n 100000 > code.txt
import json
import glob
import sys
EXTRACT_PREFIX = '--extract='


def parse_args(argv):
    """Parse the command line flags understood by this script.

    Recognised flags:
      --repos             list every repository name found in the data
      --extract           print the code of every record
      --extract=a/b;c/d   print the code of the listed repositories only

    Returns a ``(list_repos_mode, extract_all, extract_code)`` tuple where
    ``extract_code`` is a frozenset of repository names (empty when no
    ``--extract=...`` flag was given). If the flag is repeated, the last
    occurrence wins.
    """
    list_repos_mode = '--repos' in argv
    extract_all = False
    extract_code = frozenset()
    for arg in argv:
        if arg.startswith(EXTRACT_PREFIX):
            # A set makes the per-record membership test O(1).
            extract_code = frozenset(arg[len(EXTRACT_PREFIX):].split(';'))
        elif arg == '--extract':
            extract_all = True
    return list_repos_mode, extract_all, extract_code


def iter_records(pattern='python_*.jsonl'):
    """Yield one decoded JSON object per non-blank line of each matching file.

    Files are visited in sorted name order so the output is reproducible
    (glob order depends on the file system). Each file is streamed line by
    line instead of being read whole, and is closed as soon as it is done.
    """
    for fname in sorted(glob.glob(pattern)):
        # Explicit UTF-8: the locale default (e.g. cp1252 on Windows) would
        # fail on, or mangle, non-ASCII source code.
        with open(fname, 'r', encoding='utf-8') as f:
            # Iterating the file splits on real newlines only, whereas
            # str.splitlines() also splits on U+2028/U+2029/U+0085, which may
            # legally appear unescaped inside a JSON string.
            for line in f:
                if line.strip():
                    yield json.loads(line)


def main(argv=None, out=None, pattern='python_*.jsonl'):
    """Run the extraction.

    argv    -- argument list, defaults to ``sys.argv``
    out     -- text stream to print to, defaults to ``sys.stdout``
               (which is switched to UTF-8 first)
    pattern -- glob pattern of the JSONL files to read

    Raises ``KeyError`` if a record lacks the ``repo`` or ``code`` field and
    ``json.JSONDecodeError`` on a malformed line.
    """
    if argv is None:
        argv = sys.argv
    if out is None:
        # https://stackoverflow.com/questions/3597480/how-to-make-python-3-print-utf8
        # stdout may have been replaced by an object without reconfigure().
        reconfigure = getattr(sys.stdout, 'reconfigure', None)
        if reconfigure is not None:
            reconfigure(encoding='utf-8')
        out = sys.stdout

    list_repos_mode, extract_all, extract_code = parse_args(argv)

    repos = set()
    for js in iter_records(pattern):
        repo = js['repo']
        repos.add(repo)
        if extract_all or repo in extract_code:
            print(js['code'], file=out)
            print(file=out)  # blank line between snippets

    if list_repos_mode:
        for repo in sorted(repos):
            print(repo, file=out)


if __name__ == '__main__':
    try:
        main()
        sys.stdout.flush()
    except BrokenPipeError:
        # The reader went away (e.g. `| head -n 100000`). Point stdout at
        # devnull so the interpreter's final flush does not raise again.
        import os
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment