ArtemGr/.gitignore

## .gitignore
# CodeSearchNet
/python*.jsonl
/python*.jsonl.gz
/python.zip
/code.txt
/repos.txt

## unpack.py
#!/usr/bin/env python

# extract repositories and code from CodeSearchNet
# cf. https://github.com/github/CodeSearchNet

# wget "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip"
# unzip
# gzip -d python_*.jsonl.gz
# python unpack.py --repos > repos.txt
# python unpack.py "--extract=0k/shyaml;zyga/padme" > code.txt
# python unpack.py --extract | head -n 100000 > code.txt

import json
import glob
import sys

def _parse_args(argv):
  """Decode the command line into ``(list_repos, wanted, extract_all)``.

  list_repos:  True when ``--repos`` is present.
  wanted:      set of repository names from ``--extract=a;b;c`` (the last
               such argument wins), or None when the option is absent.
  extract_all: True when a bare ``--extract`` is present.
  """
  prefix = '--extract='
  wanted = None
  extract_all = False
  for arg in argv:
    if arg.startswith(prefix):
      # a set makes the per-record membership test O(1)
      wanted = set(arg[len(prefix):].split(';'))
    elif arg == '--extract':
      extract_all = True
  return '--repos' in argv, wanted, extract_all


def _iter_records(pattern):
  """Yield one decoded JSON object per non-blank line of every matching file.

  Files are visited in sorted name order so the output is reproducible.
  Each file is read as UTF-8, streamed line by line, and closed as soon as
  it is exhausted.
  """
  for fname in sorted(glob.glob(pattern)):
    with open(fname, 'r', encoding='utf-8') as dump:
      for line in dump:
        # tolerate empty lines (e.g. a trailing newline at the end of a dump)
        if line.strip():
          yield json.loads(line)


def main(argv, pattern='python_*.jsonl'):
  """Print the code and/or repository names requested on the command line.

  argv:    full argument vector, program name included (as in ``sys.argv``).
  pattern: glob selecting the ``.jsonl`` dumps to scan.

  For every record whose ``repo`` is selected (all of them with ``--extract``,
  the listed ones with ``--extract=a;b``) the ``code`` field is printed,
  followed by an empty line. With ``--repos`` the sorted set of all repository
  names seen is printed afterwards, one per line.
  """
  list_repos, wanted, extract_all = _parse_args(argv)
  repos = set()

  for record in _iter_records(pattern):
    repo = record['repo']
    repos.add(repo)
    if extract_all or (wanted and repo in wanted):
      print(record['code'])
      print()

  if list_repos:
    for repo in sorted(repos):
      print(repo)


if __name__ == '__main__':
  # Force UTF-8 output regardless of the console's default encoding.
  # https://stackoverflow.com/questions/3597480/how-to-make-python-3-print-utf8
  sys.stdout.reconfigure(encoding='utf-8')
  main(sys.argv)
	# CodeSearchNet
	/python*.jsonl
	/python*.jsonl.gz
	/python.zip
	/code.txt
	/repos.txt
	#!/usr/bin/env python

	# extract repositories and code from CodeSearchNet
	# cf. https://github.com/github/CodeSearchNet

	# wget "https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip"
	# unzip
	# gzip -d python_*.jsonl.gz
	# python unpack.py --repos > repos.txt
	# python unpack.py "--extract=0k/shyaml;zyga/padme" > code.txt
	# python unpack.py --extract | head -n 100000 > code.txt

	import json
	import glob
	import sys

	def _parse_args(argv):
	  """Decode the command line into ``(list_repos, wanted, extract_all)``.

	  list_repos:  True when ``--repos`` is present.
	  wanted:      set of repository names from ``--extract=a;b;c`` (the last
	               such argument wins), or None when the option is absent.
	  extract_all: True when a bare ``--extract`` is present.
	  """
	  prefix = '--extract='
	  wanted = None
	  extract_all = False
	  for arg in argv:
	    if arg.startswith(prefix):
	      # a set makes the per-record membership test O(1)
	      wanted = set(arg[len(prefix):].split(';'))
	    elif arg == '--extract':
	      extract_all = True
	  return '--repos' in argv, wanted, extract_all


	def _iter_records(pattern):
	  """Yield one decoded JSON object per non-blank line of every matching file.

	  Files are visited in sorted name order so the output is reproducible.
	  Each file is read as UTF-8, streamed line by line, and closed as soon as
	  it is exhausted.
	  """
	  for fname in sorted(glob.glob(pattern)):
	    with open(fname, 'r', encoding='utf-8') as dump:
	      for line in dump:
	        # tolerate empty lines (e.g. a trailing newline at the end of a dump)
	        if line.strip():
	          yield json.loads(line)


	def main(argv, pattern='python_*.jsonl'):
	  """Print the code and/or repository names requested on the command line.

	  argv:    full argument vector, program name included (as in ``sys.argv``).
	  pattern: glob selecting the ``.jsonl`` dumps to scan.

	  For every record whose ``repo`` is selected (all of them with ``--extract``,
	  the listed ones with ``--extract=a;b``) the ``code`` field is printed,
	  followed by an empty line. With ``--repos`` the sorted set of all repository
	  names seen is printed afterwards, one per line.
	  """
	  list_repos, wanted, extract_all = _parse_args(argv)
	  repos = set()

	  for record in _iter_records(pattern):
	    repo = record['repo']
	    repos.add(repo)
	    if extract_all or (wanted and repo in wanted):
	      print(record['code'])
	      print()

	  if list_repos:
	    for repo in sorted(repos):
	      print(repo)


	if __name__ == '__main__':
	  # Force UTF-8 output regardless of the console's default encoding.
	  # https://stackoverflow.com/questions/3597480/how-to-make-python-3-print-utf8
	  sys.stdout.reconfigure(encoding='utf-8')
	  main(sys.argv)