Skip to content

Instantly share code, notes, and snippets.

@mmsamiei
Created December 30, 2019 18:31
Show Gist options
  • Save mmsamiei/0fdf1f92a0ff5457269f1464e326eb57 to your computer and use it in GitHub Desktop.
Save mmsamiei/0fdf1f92a0ff5457269f1464e326eb57 to your computer and use it in GitHub Desktop.
import json
from pprint import pprint
from tqdm import tqdm
INPUT_PATH = 'test_random_split.json'
OUTPUT_PATH = 'phase_2_dataset_idea2.json'


def iter_retrieved_sentences(turn):
    """Yield every sentence found under ``turn['retrieved_passages']``.

    ``retrieved_passages`` is iterated as a sequence of mappings whose values
    are iterables of sentences; the mapping keys (passage titles) are not
    needed and are skipped.
    """
    for retrieved_passage in turn['retrieved_passages']:
        for sentences in retrieved_passage.values():
            yield from sentences


def build_negative_pairs(stories):
    """Build (history, true sentence, false sentence) rows from dialog stories.

    For each story, ``story['dialog']`` is walked turn by turn while the turn
    texts are accumulated into a running ``history`` string.  For every turn
    that has a ``"checked_sentence"`` entry and a non-empty history, one row
    is produced per (checked sentence, retrieved sentence) combination where
    the retrieved sentence differs from the checked one.

    Args:
        stories: Iterable of dicts, each with a ``'dialog'`` list of turn
            dicts.  Every turn must have ``'text'``; turns that carry
            ``'checked_sentence'`` must also have ``'retrieved_passages'``.

    Returns:
        A list of dicts with keys ``'history'``, ``'true_sentence'`` and
        ``'false_sentenc'``.

    Raises:
        KeyError: If a turn lacks one of the keys described above.
    """
    rows = []
    for story in stories:
        history = ""
        for turn in story['dialog']:
            # The first turn of a story never yields rows: there is no
            # preceding context to pair the sentences with yet.
            if history and "checked_sentence" in turn:
                for checked_sentence in turn['checked_sentence'].values():
                    for sentence in iter_retrieved_sentences(turn):
                        # NOTE(review): the original indentation was lost; the
                        # append is assumed to belong inside this test, since
                        # otherwise empty rows would be emitted -- confirm.
                        if sentence != checked_sentence:
                            rows.append({
                                'history': history,
                                'true_sentence': checked_sentence,
                                # NOTE(review): 'false_sentenc' looks like a
                                # typo of 'false_sentence'; kept as-is because
                                # consumers of the output file may rely on it.
                                'false_sentenc': sentence,
                            })
            # Every turn extends the history, whether or not it produced rows.
            # The separator is prepended, so history starts with a space.
            history = history + " " + turn['text']
    return rows


def write_json_lines(rows, path):
    """Write ``rows`` to ``path`` as one JSON document per line.

    Lines are separated by ``'\\n'`` and no newline follows the last entry.
    An empty ``rows`` produces an empty file.
    """
    with open(path, 'w', encoding='utf-8') as outfile:
        for i, entry in enumerate(rows):
            if i:
                outfile.write('\n')
            json.dump(entry, outfile)


def main():
    """Load the input stories, build the pair dataset and write it out."""
    # The context manager closes the input file even if parsing fails.
    with open(INPUT_PATH, encoding='utf-8') as infile:
        data = json.load(infile)
    print(len(data))
    # A single progress bar over the stories instead of one bar per dialog.
    new_dataset = build_negative_pairs(tqdm(data))
    write_json_lines(new_dataset, OUTPUT_PATH)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment