Skip to content

Instantly share code, notes, and snippets.

@billju
Last active September 23, 2023 08:53
Show Gist options
  • Save billju/151fca95002f17e96a023cce5877d4f3 to your computer and use it in GitHub Desktop.
Save billju/151fca95002f17e96a023cce5877d4f3 to your computer and use it in GitHub Desktop.
import os, json
from glob import glob
# Prompt template for records that come with extra context. The
# {instruction} and {input} placeholders are str.format fields; the
# sections are separated by one blank line each.
prompt_input = "\n\n".join([
    "Below is an instruction that describes a task, paired with an input "
    "that provides further context. "
    "Write a response that appropriately completes the request.",
    "### Instruction:\n{instruction}",
    "### Input:\n{input}",
    "### Response:",
])

# Prompt template for records without extra context; only the
# {instruction} placeholder is present.
prompt_no_input = "\n\n".join([
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.",
    "### Instruction:\n{instruction}",
    "### Response:",
])
def read_json_rows(path):
    """Return the records of a JSON file whose top level is iterable.

    An element that is itself a list contributes only its first item;
    every other element is kept as-is.
    """
    with open(path, 'r', encoding='utf8') as fh:
        return [item[0] if isinstance(item, list) else item
                for item in json.load(fh)]


def read_jsonl_rows(path):
    """Return one parsed record per non-blank line of a JSON Lines file."""
    with open(path, 'r', encoding='utf8') as fh:
        # Iterate the file instead of calling str.splitlines(): splitlines()
        # also breaks on U+2028/U+2029/U+0085, which may legally appear
        # unescaped inside a JSON string and would cut a record in half.
        # Blank lines (e.g. a trailing one) are skipped rather than parsed.
        return [json.loads(line) for line in fh if line.strip()]


def read_markdown_rows(path):
    """Return one record per blank-line-separated block of a text file.

    The first line of a block becomes ``instruction``, the remaining lines
    (joined with newlines) become ``output``, and ``input`` is always ''.
    """
    with open(path, 'r', encoding='utf8') as fh:
        blocks = fh.read().split('\n\n')
    records = []
    for block in blocks:
        # Runs of blank lines or a trailing blank line yield empty blocks,
        # which have no first line to use as the instruction.
        if not block.strip():
            continue
        first, *rest = block.splitlines()
        records.append({'instruction': first, 'input': '',
                        'output': '\n'.join(rest)})
    return records


def load_rows(directory='.'):
    """Collect records from every *.json, *.jsonl and *.md file in *directory*.

    Files are visited by type in that order; within a type the order is
    whatever ``glob`` returns.
    """
    readers = (
        ('*.json', read_json_rows),
        ('*.jsonl', read_jsonl_rows),
        ('*.md', read_markdown_rows),
    )
    collected = []
    for pattern, reader in readers:
        for path in glob(os.path.join(directory, pattern)):
            collected.extend(reader(path))
    return collected


def render_example(row, with_input, without_input):
    """Format *row* into a ``{'text': ...}`` training example.

    *with_input* is used when the record has a truthy ``input`` value,
    *without_input* otherwise (including when the key is missing, which
    previously raised ``KeyError``). Both are ``str.format`` templates
    filled from the record's keys.
    """
    template = with_input if row.get('input') else without_input
    return {'text': template.format(**row)}


# Gather every record found in the current working directory.
rows = load_rows()

# NOTE(review): only the prompt is rendered; row['output'] never reaches the
# generated text. Confirm that this is what the training pipeline expects.
train_json = [render_example(row, prompt_input, prompt_no_input) for row in rows]

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs('data', exist_ok=True)
with open('data/train.json', 'w', encoding='utf8') as out_file:
    json.dump(train_json, out_file, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment