Skip to content

Instantly share code, notes, and snippets.

@sabetAI
Created August 16, 2021 15:46
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save sabetAI/eded59f01ef6618d8eb9885c48f501ee to your computer and use it in GitHub Desktop.
Save sabetAI/eded59f01ef6618d8eb9885c48f501ee to your computer and use it in GitHub Desktop.
Scraping Files for Upload to Codex
import os
from argparse import ArgumentParser
import openai
import jsonlines
# Configure the OpenAI client from the environment. This runs at import time
# and raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']
# collect every file with the given extension in all subdirectories
def get_all_files(dir, ext):
    """Return the paths of all files under *dir* whose names end with *ext*.

    The directory tree is walked recursively. Directories and file names are
    visited in sorted order so the result is reproducible across runs and
    filesystems (``os.walk`` itself yields entries in arbitrary order).

    Args:
        dir: Root directory to search.
        ext: Filename suffix to match, e.g. ``".ts"``. Anything accepted by
            ``str.endswith`` works, including a tuple of suffixes.

    Returns:
        A list of paths, each built by joining the walked directory with the
        matching file name. Empty if nothing matches or *dir* does not exist.
    """
    all_files = []
    for root, dirs, files in os.walk(dir):
        # Sorting in place makes os.walk descend into subdirectories in a
        # deterministic order.
        dirs.sort()
        all_files.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith(ext)
        )
    return all_files
if __name__ == "__main__":
    # Script entry point: gather every file under `dir` whose name ends with
    # `ext`, write them to `<name>.jsonl` (one JSON object per file), and
    # upload that file to OpenAI.
    parser = ArgumentParser(description="Scrape all typescript files in all subdirectories")
    parser.add_argument("dir", help="directory to search for typescript files")
    parser.add_argument("name", help="reference name for codex upload file")
    parser.add_argument("ext", help="filename extension to scrape")
    # get all files for a given directory passed in as command line argument
    args = parser.parse_args()
    all_files = get_all_files(args.dir, args.ext)

    # read content of files into a list of records
    jsons = []
    for file in all_files:
        # Source files are read as UTF-8 explicitly rather than with the
        # platform's default encoding.
        with open(file, encoding="utf-8") as f:
            content = f.read()
        jsons.append({'text' : content, 'metadata' : {'path' : file, 'filename' : os.path.basename(file)}})

    # write list of dicts to jsonlines file
    output_path = f'{args.name}.jsonl'
    with jsonlines.open(output_path, mode='w') as writer:
        writer.write_all(jsons)

    # upload the jsonlines file to openai; the context manager guarantees the
    # handle is closed even if the upload raises
    with open(output_path) as upload:
        openai.File.create(file=upload, purpose='answers')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment