Skip to content

Instantly share code, notes, and snippets.

@danyaljj
Created July 21, 2021 16:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danyaljj/ca050aa100722ab8d8539061e5356699 to your computer and use it in GitHub Desktop.
Save danyaljj/ca050aa100722ab8d8539061e5356699 to your computer and use it in GitHub Desktop.
from datasets import load_dataset
# Load the ELI5 dataset via the Hugging Face `datasets` library and show its
# splits/columns as a quick sanity check.
dataset = load_dataset("eli5")
print(dataset)

# Hard-coded output root; the "eli5" sub-directory must already exist because
# open(..., "w") does not create missing directories.
path = "/Users/danielk/ideaProjects/qoogle-experiments/data"

# One TSV per split. The encoding is pinned to UTF-8 so that non-ASCII text is
# written identically regardless of the platform's locale default.
trainfile = open(path + "/eli5/train.tsv", "w", encoding="utf-8")
testfile = open(path + "/eli5/test.tsv", "w", encoding="utf-8")
devfile = open(path + "/eli5/dev.tsv", "w", encoding="utf-8")
# Characters that would break a one-record-per-line, tab-separated row.
_TSV_UNSAFE = str.maketrans({"\t": " ", "\n": " ", "\r": " "})


def _flatten(text):
    """Return *text* with tabs, newlines and carriage returns turned into spaces."""
    return text.translate(_TSV_UNSAFE)


def _truncate(text, max_words):
    """Keep at most *max_words* single-space-separated tokens of *text*."""
    words = text.split(" ")
    if len(words) > max_words:
        return " ".join(words[:max_words])
    return text


def safe(split, outfile, is_eval, *, max_words=510, data=None):
    """Write one ``question \\t answer`` line per example of *split* to *outfile*.

    Args:
        split: Key of the split to export (e.g. ``'train_eli5'``).
        outfile: Writable text file-like object; only ``write`` is used.
        is_eval: If true, all answers of an example are joined with ``"///"``.
            Otherwise only the answer with the highest score is written
            (the first one wins on ties).
        max_words: Maximum number of space-separated tokens kept for the
            question and for the answer; longer texts are truncated.
        data: Mapping from split name to an iterable of examples. Defaults to
            the module-level ``dataset``.

    Each example is expected to provide ``example['title']`` and
    ``example['answers']`` with parallel ``'text'`` and ``'score'`` lists.
    Training examples without any answer are skipped, since there is no best
    answer to write.
    """
    if data is None:
        data = dataset

    for example in data[split]:
        answers = example['answers']
        if is_eval:
            ans = _flatten("///".join(answers['text']))
        else:
            scores = answers['score']
            if not scores:
                # Nothing to pick a best answer from; avoid an IndexError.
                continue
            # max() returns the first maximal index, matching a stable
            # descending sort by score.
            best_idx = max(range(len(scores)), key=lambda i: scores[i])
            ans = _flatten(answers['text'][best_idx])

        question = _truncate(_flatten(example['title']), max_words)
        ans = _truncate(ans, max_words)
        outfile.write(f"{question} \t {ans} \n")
# Export every split. Dev and test keep all answers (is_eval=True); train keeps
# only the top-scored answer. The handles are closed afterwards, even on
# failure, so buffered rows are flushed to disk and no file is leaked.
try:
    safe('validation_eli5', devfile, True)
    safe('test_eli5', testfile, True)
    safe('train_eli5', trainfile, False)
finally:
    for handle in (devfile, testfile, trainfile):
        handle.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment