Created
April 23, 2024 19:20
-
-
Save vovw/ed472bb636b215a1ddb9e4487a63c51b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ijson | |
import json | |
# First input: the dataset whose conversation entries get updated.
original_dataset_path_1 = "llava_hindi.json"
# Second input: the dataset that supplies the replacement descriptions.
original_dataset_path_2 = "aadarsh.json"
# Output: where the merged JSON array is written.
new_dataset_path = "./new.json"
def process_json_file(filename):
    """Return the first element of the top-level JSON array in *filename*.

    The file is opened in binary mode and parsed incrementally with ijson
    using the ``'item'`` prefix; parsing stops as soon as the first element
    has been produced.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The first element yielded by ijson, or ``None`` when nothing is
        yielded.
    """
    with open(filename, 'rb') as stream:
        elements = iter(ijson.items(stream, 'item'))
        return next(elements, None)
def get_obj_count(filename):
    """Count the elements of the top-level JSON array in *filename*.

    The file is streamed with ijson (``'item'`` prefix), so each element is
    discarded right after it has been counted.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The number of elements yielded by ijson, as an int.
    """
    with open(filename, 'rb') as stream:
        return sum(1 for _ in ijson.items(stream, 'item'))
def modify_json_values(input_filename_1, input_filename_2, output_filename):
    """Copy the first dataset to a new file, patching in descriptions from the second.

    Both inputs are read with ijson as top-level JSON arrays. Each element
    of the first file is iterated, and every entry it yields is treated as a
    conversation dict with ``'id'`` and ``'from'`` keys. Each element of the
    second file is a dict looked up by its ``'id'``. When a conversation's id
    is present in the second file, the match is printed and, if the
    conversation is from ``'gpt'``, its ``'gpt'`` key is set to the matching
    element's ``'description'``. Every element of the first file (modified or
    not) is written to *output_filename* as one JSON array, without ASCII
    escaping.

    Args:
        input_filename_1: Path of the dataset to update.
        input_filename_2: Path of the dataset providing the descriptions.
        output_filename: Path of the JSON file to create or overwrite.

    Raises:
        KeyError: If a conversation has no ``'id'``, a matched conversation
            has no ``'from'``, or a matched ``'gpt'`` conversation's partner
            has no ``'description'``.
    """
    # Index the second dataset by id once, up front.  ijson.items() is a
    # one-shot stream: scanning it inside the nested loop (as this function
    # used to) exhausts it on the very first conversation, so no later
    # conversation can ever match.  The index keeps the second file's elements
    # in memory; if an id occurs more than once, the last occurrence wins.
    with open(input_filename_2, 'rb') as input_file_2:
        by_id = {obj2.get('id'): obj2
                 for obj2 in ijson.items(input_file_2, 'item')}

    with open(input_filename_1, 'rb') as input_file_1, \
            open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('[')  # Start the JSON array in the output file
        first_item = True
        for obj1 in ijson.items(input_file_1, 'item'):
            if first_item:
                first_item = False
            else:
                output_file.write(', ')  # Separator between array elements

            # NOTE(review): assumes each element of the first file iterates
            # as conversation dicts - confirm against the dataset schema.
            for conversation1 in obj1:
                obj2 = by_id.get(conversation1['id'])
                if obj2 is None:
                    continue
                print("Match found for ID:", conversation1.get('id'))
                if conversation1['from'] == 'gpt':
                    # NOTE(review): this stores the description under a
                    # 'gpt' key; if the reply text presumably lives under a
                    # different key (e.g. 'value'), this adds a key rather
                    # than replacing the text - confirm intended target.
                    conversation1['gpt'] = obj2['description']

            # NOTE(review): ijson by default yields decimal.Decimal for
            # non-integer numbers, which json.dump cannot serialise -
            # confirm the inputs contain none.
            json.dump(obj1, output_file, ensure_ascii=False)
        output_file.write(']')  # Close the JSON array
# Build the merged dataset, then sanity-check the result by showing its first
# element and its element count.
modify_json_values(original_dataset_path_1, original_dataset_path_2, new_dataset_path)

first_object = process_json_file(new_dataset_path)
print("first object in new dataset", first_object)

object_count = get_obj_count(new_dataset_path)
print("new dataset count", object_count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment