Created
December 30, 2019 18:31
-
-
Save mmsamiei/0fdf1f92a0ff5457269f1464e326eb57 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
from pprint import pprint

from tqdm import tqdm
def build_sentence_pairs(stories, progress=None):
    """Build (history, true sentence, false sentence) rows from dialog stories.

    For every turn that carries a ``checked_sentence`` mapping and has a
    non-empty preceding history, each checked sentence is paired with every
    sentence of every retrieved passage that differs from it.

    Args:
        stories: Iterable of dicts, each with a ``'dialog'`` list of turn
            dicts. Every turn must have a ``'text'`` entry; turns that have
            ``'checked_sentence'`` (a mapping whose values are sentences)
            are also read for ``'retrieved_passages'`` (a list of mappings
            from a title to a list of sentences).
        progress: Optional callable wrapping the per-story turn iterable
            (e.g. a progress bar). Defaults to ``tqdm``, which the file
            imports at the top.

    Returns:
        A list of dicts with the keys ``'history'``, ``'true_sentence'`` and
        ``'false_sentenc'``.
    """
    wrap = tqdm if progress is None else progress
    rows = []
    for story in stories:
        # The history restarts for each story and accumulates turn texts.
        history = ""
        for turn in wrap(story['dialog']):
            # The first turn of a story has no history, so it yields no rows.
            if "checked_sentence" in turn and history != "":
                for checked_sentence in turn['checked_sentence'].values():
                    for retrieved_passage in turn['retrieved_passages']:
                        for sentences in retrieved_passage.values():
                            for sentence in sentences:
                                if sentence != checked_sentence:
                                    rows.append({
                                        'history': history,
                                        'true_sentence': checked_sentence,
                                        # NOTE(review): key is misspelled
                                        # ("false_sentenc"); kept unchanged
                                        # because readers of the output file
                                        # may rely on it.
                                        'false_sentenc': sentence,
                                    })
            # Every turn's text (with or without a checked sentence) extends
            # the history; this leaves a leading space on the first entry.
            history = history + " " + turn['text']
    return rows


if __name__ == "__main__":
    # Context managers close both files even if parsing or writing fails.
    with open('test_random_split.json', encoding='utf-8') as freader:
        data = json.load(freader)
    print(len(data))

    new_dataset = build_sentence_pairs(data)

    # One JSON object per line, with no trailing newline after the last one.
    with open('phase_2_dataset_idea2.json', 'w', encoding='utf-8') as outfile:
        for i, entry in enumerate(new_dataset):
            if i:
                outfile.write('\n')
            json.dump(entry, outfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment