# mmsamiei/phase_2_dataset_idea2.py

## phase_2_dataset_idea2.py
import json
from pprint import pprint
from tqdm import tqdm

# Builds a dataset of (history, true sentence, false sentence) rows from the
# dialogs in INPUT_PATH and writes them to OUTPUT_PATH, one JSON object per line.
INPUT_PATH = 'test_random_split.json'
OUTPUT_PATH = 'phase_2_dataset_idea2.json'


def _iter_candidate_sentences(retrieved_passages):
    """Yield every sentence of every retrieved passage, in file order.

    Each passage is expected to be a mapping of passage title -> list of
    sentences (that is how the loaded JSON is indexed by this script).
    """
    for passage in retrieved_passages:
        for sentences in passage.values():
            yield from sentences


def build_rows(turns):
    """Return the dataset rows produced by a single dialog.

    Args:
        turns: iterable of turn dicts. Every turn must have a ``'text'`` key;
            a turn that has ``'checked_sentence'`` (a dict) must also have
            ``'retrieved_passages'`` whenever that dict is non-empty.

    Returns:
        A list of dicts with the keys ``'history'``, ``'true_sentence'`` and
        ``'false_sentenc'``. One row is emitted for every pair of
        (checked sentence, retrieved sentence) whose two members differ.

    Raises:
        KeyError: if a turn lacks one of the keys described above.
    """
    rows = []
    history = ""
    for turn in turns:
        # The history is still empty on the first turn, so the first turn
        # never contributes rows even when it carries a checked sentence.
        if "checked_sentence" in turn and history != "":
            for checked_sentence in turn['checked_sentence'].values():
                candidates = _iter_candidate_sentences(
                    turn['retrieved_passages'])
                for sentence in candidates:
                    if sentence != checked_sentence:
                        rows.append({
                            'history': history,
                            'true_sentence': checked_sentence,
                            # NOTE: the key is misspelt ('false_sentenc'), but
                            # it is part of the output format, so it is kept
                            # as-is for anything that reads the generated file.
                            'false_sentenc': sentence,
                        })
        # The history seen by a turn excludes that turn's own text, hence
        # the update happens only after the rows were generated.
        history = history + " " + turn['text']
    return rows


def write_json_lines(rows, path):
    """Write ``rows`` to ``path`` as newline-separated JSON objects.

    Newlines are written between entries only, so the file has no trailing
    newline; an empty ``rows`` produces an empty file.
    """
    with open(path, 'w', encoding='utf-8') as outfile:
        for index, row in enumerate(rows):
            if index:
                outfile.write('\n')
            outfile.write(json.dumps(row))


def main():
    """Load the dialogs, build the rows of every dialog and save them."""
    # A context manager guarantees the input handle is closed after parsing.
    with open(INPUT_PATH, encoding='utf-8') as freader:
        data = json.load(freader)

    print(len(data))

    new_dataset = []
    for story in data:
        # One progress bar per dialog, advancing once per turn.
        new_dataset.extend(build_rows(tqdm(story['dialog'])))

    write_json_lines(new_dataset, OUTPUT_PATH)


if __name__ == "__main__":
    main()