# kaysush/generate_submission.py

## generate_submission.py
import json
import pandas as pd
from os import listdir
from os.path import isfile, join

# Directory that holds the JSONL result files to be converted.
results_base_path = 'jsonl_results'

# Names of the regular files found directly inside the results directory;
# entries that are not files (e.g. sub-directories) are left out.
onlyfiles = list(
    filter(
        lambda entry: isfile(join(results_base_path, entry)),
        listdir(results_base_path),
    )
)

# Collects one {'id': ..., 'label': ...} record per parsed result line.
results = []


def get_file_index(file_name):
    """Return the integer index encoded in a file path.

    The final "/"-separated component is taken, everything from its first
    "." onwards is dropped, and the remainder is converted with ``int``;
    e.g. ``"a/b/42.jpg"`` gives ``42``.

    Raises:
        ValueError: if the remaining text is not a valid integer.
    """
    base_name = file_name.rsplit("/", 1)[-1]
    stem, _, _ = base_name.partition(".")
    return int(stem)


def parse_lines(results, lines):
    """Parse JSONL prediction lines and append one record per line to *results*.

    Each non-blank line is decoded as JSON. The record id is derived from
    ``instance.content`` via ``get_file_index``; the label is 1 when the first
    entry of ``prediction.displayNames`` is "dog", otherwise 0.

    Args:
        results: list extended in place with ``{'id': ..., 'label': ...}`` dicts.
        lines: iterable of strings, each holding one JSON document.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError / IndexError: if a document lacks the expected fields.
    """
    for line in lines:
        # Blank lines (e.g. a trailing newline at the end of a file) are not
        # JSON documents and would make json.loads raise.
        if not line.strip():
            continue
        json_data = json.loads(line)
        file_index = get_file_index(json_data["instance"]["content"])
        # Only the first display name is consulted.
        prediction_string = json_data["prediction"]["displayNames"][0]
        prediction = 1 if prediction_string == "dog" else 0
        results.append({
            'id': file_index,
            'label': prediction,
        })

# Read every result file and collect its predictions into ``results``.
for file in onlyfiles:
    print(f"Processing file : {file}")
    file_path = f"{results_base_path}/{file}"
    # The context manager closes the handle as soon as the lines are read,
    # instead of leaving one open file per iteration.
    with open(file_path, 'r') as result_file:
        result_lines = result_file.readlines()
    parse_lines(results, result_lines)


# Build the submission table. Naming the columns explicitly keeps the frame
# well-formed when ``results`` is empty; without it ``df['id']`` raises
# KeyError and no CSV is written.
df = pd.DataFrame(results, columns=['id', 'label'])
df['id'] = df['id'].astype('int')
# Order rows by ascending id before writing.
df.sort_values(by=['id'], inplace=True)
df.to_csv('results.csv', index=False)
	import json
	import pandas as pd
	from os import listdir
	from os.path import isfile, join

	# Directory that holds the JSONL result files to be converted.
	results_base_path = 'jsonl_results'

	# Names of the regular files found directly inside the results directory;
	# entries that are not files (e.g. sub-directories) are left out.
	onlyfiles = list(
		filter(
			lambda entry: isfile(join(results_base_path, entry)),
			listdir(results_base_path),
		)
	)

	# Collects one {'id': ..., 'label': ...} record per parsed result line.
	results = []


	def get_file_index(file_name):
		"""Return the integer index encoded in a file path.

		The final "/"-separated component is taken, everything from its first
		"." onwards is dropped, and the remainder is converted with ``int``;
		e.g. ``"a/b/42.jpg"`` gives ``42``.

		Raises:
			ValueError: if the remaining text is not a valid integer.
		"""
		last_part = file_name.split("/")[-1]
		# Named ``stem`` rather than ``id`` so the builtin is not shadowed.
		stem = last_part.split(".")[0]
		return int(stem)


	def parse_lines(results, lines):
		"""Parse JSONL prediction lines and append one record per line to *results*.

		Each non-blank line is decoded as JSON. The record id is derived from
		``instance.content`` via ``get_file_index``; the label is 1 when the
		first entry of ``prediction.displayNames`` is "dog", otherwise 0.

		Args:
			results: list extended in place with ``{'id': ..., 'label': ...}`` dicts.
			lines: iterable of strings, each holding one JSON document.

		Raises:
			json.JSONDecodeError: if a non-blank line is not valid JSON.
			KeyError / IndexError: if a document lacks the expected fields.
		"""
		for line in lines:
			# Blank lines (e.g. a trailing newline at the end of a file) are
			# not JSON documents and would make json.loads raise.
			if not line.strip():
				continue
			json_data = json.loads(line)
			file_index = get_file_index(json_data["instance"]["content"])
			# Only the first display name is consulted.
			prediction_string = json_data["prediction"]["displayNames"][0]
			prediction = 1 if prediction_string == "dog" else 0
			results.append({
				'id': file_index,
				'label': prediction,
			})

	# Read every result file and collect its predictions into ``results``.
	for file in onlyfiles:
		print(f"Processing file : {file}")
		file_path = f"{results_base_path}/{file}"
		# The context manager closes the handle as soon as the lines are read,
		# instead of leaving one open file per iteration.
		with open(file_path, 'r') as result_file:
			result_lines = result_file.readlines()
		parse_lines(results, result_lines)




	# Build the submission table. Naming the columns explicitly keeps the frame
	# well-formed when ``results`` is empty; without it ``df['id']`` raises
	# KeyError and no CSV is written.
	df = pd.DataFrame(results, columns=['id', 'label'])
	df['id'] = df['id'].astype('int')
	# Order rows by ascending id before writing.
	df.sort_values(by=['id'], inplace=True)
	df.to_csv('results.csv', index=False)