vovw/dataset-get.py

## dataset-get.py
import ijson
import json

# First input passed to modify_json_values: the JSON array that gets patched.
original_dataset_path_1 = "llava_hindi.json"
# Second input passed to modify_json_values: records read for 'id'/'description'.
original_dataset_path_2 = "aadarsh.json"
# Output written by modify_json_values and re-read by the checks at the bottom.
new_dataset_path = './new.json'


def process_json_file(filename):
    """Return the first element streamed from the JSON array in ``filename``.

    The file is parsed incrementally with ``ijson`` under the ``'item'``
    prefix, so reading stops as soon as one element has been produced.

    Args:
        filename: Path of the JSON file to read (opened in binary mode).

    Returns:
        The first element, or ``None`` when the parser yields no elements.
    """
    with open(filename, 'rb') as stream:
        # next() with a default mirrors "return on first iteration, else None".
        return next(iter(ijson.items(stream, 'item')), None)

def get_obj_count(filename):
    """Count the elements streamed from the JSON array in ``filename``.

    Elements are consumed one at a time through ``ijson`` and discarded
    immediately, so the whole file is never held in memory.

    Args:
        filename: Path of the JSON file to read (opened in binary mode).

    Returns:
        The number of elements produced under the ``'item'`` prefix.
    """
    with open(filename, 'rb') as stream:
        return sum(1 for _ in ijson.items(stream, 'item'))

def modify_json_values(input_filename_1, input_filename_2, output_filename):
    """Copy the JSON array of file 1 to the output, patching matched entries.

    Every entry nested in an item of ``input_filename_1`` is matched by its
    ``'id'`` against the records of ``input_filename_2``. On a match the ID is
    printed and, when the entry's ``'from'`` equals ``'gpt'``, the entry's
    ``'gpt'`` key is set to the matching record's ``'description'``. Each item
    is then written to ``output_filename`` as one element of a JSON array.

    The records of file 2 are indexed by ID up front. ``ijson.items`` returns
    a one-shot iterator, so scanning it inside the nested loop would only ever
    compare the very first entry and silently skip every later one.

    Args:
        input_filename_1: Path of the JSON array to stream and patch.
        input_filename_2: Path of the JSON array supplying the descriptions.
        output_filename: Path of the UTF-8 JSON file to create or overwrite.

    Raises:
        KeyError: If an entry has no ``'id'``, a matched entry has no
            ``'from'``, or the record matched by a ``'gpt'`` entry has no
            ``'description'``.
    """
    with open(input_filename_1, 'rb') as input_file_1, \
            open(input_filename_2, 'rb') as input_file_2, \
            open(output_filename, 'w', encoding='utf-8') as output_file:
        # Build the lookup once. Later duplicates overwrite earlier ones, which
        # matches the "last matching record wins" result of a full linear scan.
        records_by_id = {}
        for record in ijson.items(input_file_2, 'item'):
            records_by_id[record.get('id')] = record

        output_file.write('[')  # Start the JSON array in the output file
        first_item = True
        for obj1 in ijson.items(input_file_1, 'item'):
            if first_item:
                first_item = False
            else:
                output_file.write(', ')  # Separator between array elements

            # NOTE(review): assumes each item iterates as dicts carrying 'id'
            # and 'from' -- TODO confirm against the dataset; if an item is
            # itself a dict, this loop yields its string keys instead.
            for conversation in obj1:
                match = records_by_id.get(conversation['id'])
                if match is None:
                    continue
                print("Match found for ID:", conversation.get('id'))
                # Only entries authored by 'gpt' receive the description.
                if conversation['from'] == 'gpt':
                    conversation['gpt'] = match['description']

            # NOTE(review): ijson yields decimal.Decimal for non-integer
            # numbers by default, which json.dump cannot serialize -- confirm
            # the inputs contain none.
            json.dump(obj1, output_file, ensure_ascii=False)

        output_file.write(']')  # Close the JSON array

# Build the patched dataset, then sanity-check it by re-reading the output.
modify_json_values(original_dataset_path_1, original_dataset_path_2, new_dataset_path)

first_object = process_json_file(new_dataset_path)
print("first object in new dataset", first_object)

object_count = get_obj_count(new_dataset_path)
print("new dataset count", object_count)
	import ijson
	import json

	# First input passed to modify_json_values: the JSON array that gets patched.
	original_dataset_path_1 = "llava_hindi.json"
	# Second input passed to modify_json_values: records read for 'id'/'description'.
	original_dataset_path_2 = "aadarsh.json"
	# Output written by modify_json_values and re-read by the checks at the bottom.
	new_dataset_path = './new.json'


	def process_json_file(filename):
	    """Return the first element streamed from the JSON array in ``filename``.

	    The file is parsed incrementally with ``ijson`` under the ``'item'``
	    prefix, so reading stops as soon as one element has been produced.

	    Args:
	        filename: Path of the JSON file to read (opened in binary mode).

	    Returns:
	        The first element, or ``None`` when the parser yields no elements.
	    """
	    with open(filename, 'rb') as stream:
	        # next() with a default covers the empty-array case explicitly.
	        return next(iter(ijson.items(stream, 'item')), None)

	def get_obj_count(filename):
	    """Count the elements streamed from the JSON array in ``filename``.

	    Elements are consumed one at a time through ``ijson`` and discarded
	    immediately, so the whole file is never held in memory.

	    Args:
	        filename: Path of the JSON file to read (opened in binary mode).

	    Returns:
	        The number of elements produced under the ``'item'`` prefix.
	    """
	    with open(filename, 'rb') as stream:
	        return sum(1 for _ in ijson.items(stream, 'item'))

	def modify_json_values(input_filename_1, input_filename_2, output_filename):
	    """Copy the JSON array of file 1 to the output, patching matched entries.

	    Every entry nested in an item of ``input_filename_1`` is matched by its
	    ``'id'`` against the records of ``input_filename_2``. On a match the ID
	    is printed and, when the entry's ``'from'`` equals ``'gpt'``, the entry's
	    ``'gpt'`` key is set to the matching record's ``'description'``. Each
	    item is then written to ``output_filename`` as one element of a JSON
	    array.

	    The records of file 2 are indexed by ID up front. ``ijson.items`` returns
	    a one-shot iterator, so scanning it inside the nested loop would only
	    ever compare the very first entry and silently skip every later one.

	    Args:
	        input_filename_1: Path of the JSON array to stream and patch.
	        input_filename_2: Path of the JSON array supplying the descriptions.
	        output_filename: Path of the UTF-8 JSON file to create or overwrite.

	    Raises:
	        KeyError: If an entry has no ``'id'``, a matched entry has no
	            ``'from'``, or the record matched by a ``'gpt'`` entry has no
	            ``'description'``.
	    """
	    with open(input_filename_1, 'rb') as input_file_1, \
	            open(input_filename_2, 'rb') as input_file_2, \
	            open(output_filename, 'w', encoding='utf-8') as output_file:
	        # Build the lookup once. Later duplicates overwrite earlier ones,
	        # matching the "last matching record wins" result of a full scan.
	        records_by_id = {}
	        for record in ijson.items(input_file_2, 'item'):
	            records_by_id[record.get('id')] = record

	        output_file.write('[') # Start the JSON array in the output file
	        first_item = True
	        for obj1 in ijson.items(input_file_1, 'item'):
	            if first_item:
	                first_item = False
	            else:
	                output_file.write(', ') # Separator between array elements

	            # NOTE(review): assumes each item iterates as dicts carrying
	            # 'id' and 'from' -- TODO confirm against the dataset; if an
	            # item is itself a dict, this loop yields its string keys.
	            for conversation in obj1:
	                match = records_by_id.get(conversation['id'])
	                if match is None:
	                    continue
	                print("Match found for ID:", conversation.get('id'))
	                # Only entries authored by 'gpt' receive the description.
	                if conversation['from'] == 'gpt':
	                    conversation['gpt'] = match['description']

	            # NOTE(review): ijson yields decimal.Decimal for non-integer
	            # numbers by default, which json.dump cannot serialize --
	            # confirm the inputs contain none.
	            json.dump(obj1, output_file, ensure_ascii=False)

	        output_file.write(']') # Close the JSON array

	# Build the patched dataset, then sanity-check it by re-reading the output.
	modify_json_values(original_dataset_path_1, original_dataset_path_2, new_dataset_path)

	first_object = process_json_file(new_dataset_path)
	print("first object in new dataset", first_object)

	object_count = get_obj_count(new_dataset_path)
	print("new dataset count", object_count)