# Source: GitHub gist vovw/ed472bb636b215a1ddb9e4487a63c51b
# Author: @vovw
# Created: April 23, 2024 19:20
import ijson
import json
# First input file handed to modify_json_values (its items are rewritten).
original_dataset_path_1 = "llava_hindi.json"
# Second input file handed to modify_json_values (source of 'description').
original_dataset_path_2 = "aadarsh.json"
# Destination file for the rewritten JSON array.
new_dataset_path = "./new.json"
def process_json_file(filename):
    """Return the first element of the top-level JSON array in *filename*.

    The file is parsed incrementally with ``ijson`` and parsing stops as
    soon as one element has been produced. Returns ``None`` when no
    element is produced at all.
    """
    with open(filename, 'rb') as stream:
        # Pull a single element from the stream instead of looping over it.
        return next(iter(ijson.items(stream, 'item')), None)
def get_obj_count(filename):
    """Return how many elements the top-level JSON array in *filename* has.

    Elements are streamed with ``ijson`` and discarded one at a time, so
    the whole array is never held in memory.
    """
    with open(filename, 'rb') as stream:
        # Each streamed element contributes 1 and is dropped immediately.
        return sum(1 for _ in ijson.items(stream, 'item'))
def modify_json_values(input_filename_1, input_filename_2, output_filename):
    """Copy the JSON array in file 1 to the output, patching matched entries.

    Every element of the top-level array in *input_filename_1* is streamed,
    optionally modified, and written to *output_filename* as one JSON array.
    For each entry inside an element whose ``'id'`` equals the ``'id'`` of
    an element in *input_filename_2*, a message is printed; if that entry
    also has ``'from' == 'gpt'``, its ``'gpt'`` key is set to the matching
    element's ``'description'``.

    Args:
        input_filename_1: Path of the JSON array whose elements are rewritten.
        input_filename_2: Path of the JSON array providing ``'id'`` /
            ``'description'`` pairs.
        output_filename: Path of the JSON file to create (UTF-8, overwritten).

    Notes:
        * Elements of file 2 lacking ``'id'`` or ``'description'`` cannot act
          as replacements and are skipped. If several share an ``'id'``, the
          last one wins.
        * NOTE(review): assumes each element of file 1 is an iterable of
          dicts carrying ``'id'`` and ``'from'`` keys -- TODO confirm against
          the real dataset schema.
        * NOTE(review): ijson yields ``decimal.Decimal`` for non-integer
          numbers by default, which ``json.dump`` cannot serialise; consider
          ``use_float=True`` if the data contains such numbers.
    """
    # ijson.items() is a one-shot stream: looping over it inside another loop
    # exhausts it during the first pass, so later entries would never find a
    # match. Index file 2 once and use a dictionary lookup per entry instead.
    descriptions = {}
    with open(input_filename_2, 'rb') as input_file_2:
        for entry in ijson.items(input_file_2, 'item'):
            if 'id' in entry and 'description' in entry:
                descriptions[entry['id']] = entry['description']

    with open(input_filename_1, 'rb') as input_file_1, \
            open(output_filename, 'w', encoding='utf-8') as output_file:
        output_file.write('[')  # Start the JSON array in the output file
        first_item = True
        for obj1 in ijson.items(input_file_1, 'item'):
            if first_item:
                first_item = False
            else:
                output_file.write(', ')  # Separator between array elements

            for conversation1 in obj1:
                if 'id' not in conversation1:
                    continue
                conversation_id = conversation1['id']
                if conversation_id not in descriptions:
                    continue
                print("Match found for ID:", conversation_id)
                # Only entries authored by "gpt" receive the description.
                if conversation1.get('from') == 'gpt':
                    conversation1['gpt'] = descriptions[conversation_id]

            # Write each element as soon as it is processed to stay streaming.
            json.dump(obj1, output_file, ensure_ascii=False)
        output_file.write(']')  # Close the JSON array
# Build the new dataset, then sanity-check it by reading it back.
modify_json_values(
    original_dataset_path_1,
    original_dataset_path_2,
    new_dataset_path,
)

first_object = process_json_file(new_dataset_path)
print("first object in new dataset", first_object)

object_count = get_obj_count(new_dataset_path)
print("new dataset count", object_count)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment