extract repositories and code from CodeSearchNet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# CodeSearchNet | |
/python*.jsonl | |
/python*.jsonl.gz | |
/python.zip | |
/code.txt | |
/repos.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# extract repositories and code from CodeSearchNet | |
# cf. https://github.com/github/CodeSearchNet | |
# wget "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip" | |
# unzip python.zip
# gzip -d python_*.jsonl.gz | |
# python unpack.py --repos > repos.txt | |
# python unpack.py "--extract=0k/shyaml;zyga/padme" > code.txt   (quote it: an unquoted ';' ends the shell command)
# python unpack.py --extract | head -n 100000 > code.txt | |
import json | |
import glob | |
import sys | |
def parse_args(argv):
    """Parse the command-line flags understood by this script.

    Recognised flags:
      --repos            list every repository name found in the dataset
      --extract          print the code of every record
      --extract=a;b;...  print the code of the records whose repository is
                         one of the ';'-separated names

    Returns a tuple ``(list_repos_mode, extract_code, extract_all)`` where
    ``extract_code`` is a set of repository names, or None when no
    ``--extract=...`` flag was given.
    """
    prefix = '--extract='
    list_repos_mode = '--repos' in argv
    extract_code = None
    extract_all = False
    for arg in argv:
        if arg.startswith(prefix):
            # A set gives O(1) membership tests in the per-record loop.
            extract_code = set(arg[len(prefix):].split(';'))
        elif arg == '--extract':
            extract_all = True
    return list_repos_mode, extract_code, extract_all


def iter_records(pattern='python_*.jsonl'):
    """Yield one decoded JSON object per non-blank line of each matching file.

    Files are visited in sorted name order so the output is reproducible
    (glob order is otherwise OS-dependent).
    """
    for fname in sorted(glob.glob(pattern)):
        # Explicit UTF-8: the platform default encoding is not reliable.
        # The context manager closes the file even if decoding fails.
        with open(fname, 'r', encoding='utf-8') as fh:
            # Iterating the file streams it and only splits on real newlines.
            # str.splitlines() would also split on U+2028/U+2029/U+0085,
            # which can appear unescaped inside a JSON string.
            for line in fh:
                if line.strip():
                    yield json.loads(line)


def main(argv=None, out=None, pattern='python_*.jsonl'):
    """Run the extraction.

    argv:    argument list; defaults to ``sys.argv``.
    out:     text stream to print to; defaults to ``sys.stdout``, which is
             then switched to UTF-8 first.
    pattern: glob pattern selecting the JSONL files to read.

    Raises KeyError if a record lacks the 'repo' key (or the 'code' key when
    it is selected for extraction) and json.JSONDecodeError on a malformed
    line.
    """
    if argv is None:
        argv = sys.argv
    if out is None:
        # https://stackoverflow.com/questions/3597480/how-to-make-python-3-print-utf8
        sys.stdout.reconfigure(encoding='utf-8')
        out = sys.stdout

    list_repos_mode, extract_code, extract_all = parse_args(argv)

    repos = set()
    for record in iter_records(pattern):
        repo = record['repo']
        repos.add(repo)
        if extract_all or (extract_code and repo in extract_code):
            print(record['code'], file=out)
            print(file=out)  # blank line separates consecutive snippets

    if list_repos_mode:
        for repo in sorted(repos):
            print(repo, file=out)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment