Created
May 16, 2024 19:47
-
-
Save strawberrymelonpanda/2a94fa9b0872c250690612e263b159a4 to your computer and use it in GitHub Desktop.
Llama.cpp multiple choice tasks encoder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdint>
#include <cstdio>
#include <exception>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "json.hpp"
namespace { | |
struct Answers { | |
std::vector<std::string> answers; | |
std::vector<int> labels; | |
void serialize(std::ostream& out) const { | |
uint32_t n = answers.size(); | |
out.write((char *)&n, sizeof(n)); | |
for (auto& a : answers) { | |
uint32_t m = a.size(); | |
out.write((char *)&m, sizeof(m)); | |
out.write(a.data(), m); | |
} | |
out.write((char *)labels.data(), labels.size() * sizeof(int)); | |
} | |
bool deserialize(std::istream& in) { | |
int n; | |
in.read((char *)&n, sizeof(n)); | |
if (in.fail() || n < 0) { | |
return false; | |
} | |
answers.resize(n); | |
labels.resize(n); | |
for (auto& a : answers) { | |
uint32_t m; | |
in.read((char *)&m, sizeof(m)); | |
a.resize(m); | |
in.read((char *)a.data(), m); | |
} | |
in.read((char *)labels.data(), n * sizeof(int)); | |
return !in.fail(); | |
} | |
void fromJson(const nlohmann::json& j) { | |
for (auto& elem : j["answers"]) { | |
answers.push_back(elem.get<std::string>()); | |
} | |
for (auto& elem : j["labels"]) { | |
labels.push_back(elem.get<int>()); | |
} | |
} | |
}; | |
struct MultiplChoice { | |
std::string question; | |
Answers singleCorrect; | |
Answers multipleCorrect; | |
void serialize(std::ostream& out) const { | |
uint32_t n = question.size(); | |
out.write((char *)&n, sizeof(n)); | |
out.write(question.data(), n); | |
singleCorrect.serialize(out); | |
multipleCorrect.serialize(out); | |
} | |
bool deserialize(std::istream& in) { | |
uint32_t n; | |
in.read((char *)&n, sizeof(n)); | |
if (in.fail() || n < 0) { | |
return false; | |
} | |
question.resize(n); | |
in.read((char *)question.data(), n); | |
return singleCorrect.deserialize(in) && multipleCorrect.deserialize(in); | |
} | |
void fromJson(const nlohmann::json& j) { | |
question = j["question"].get<std::string>(); | |
singleCorrect.fromJson(j["single_correct"]); | |
multipleCorrect.fromJson(j["multiple_correct"]); | |
} | |
}; | |
void serialize(std::ostream& out, const std::vector<MultiplChoice>& data) { | |
uint32_t n = data.size(); | |
out.write((char *)&n, sizeof(n)); | |
if (data.empty()) return; | |
std::vector<uint32_t> pos(data.size(), 0); | |
out.write((char *)pos.data(), pos.size() * sizeof(pos[0])); | |
int i = 0; | |
for (auto& d : data) { | |
pos[i++] = out.tellp(); | |
d.serialize(out); | |
} | |
out.seekp(sizeof(n), std::ios::beg); | |
out.write((char *)pos.data(), pos.size() * sizeof(pos[0])); | |
} | |
void encode(const char* jsonFile, const char* binFile) { | |
std::ifstream jsonIn(jsonFile); | |
nlohmann::json jsonData; | |
jsonIn >> jsonData; | |
std::vector<MultiplChoice> data; | |
for (auto& elem : jsonData) { | |
MultiplChoice mc; | |
mc.fromJson(elem); | |
data.push_back(mc); | |
} | |
std::ofstream binOut(binFile, std::ios::binary); | |
serialize(binOut, data); | |
} | |
} | |
/// Command-line entry point: `<program> input.json output.bin`.
///
/// @return 0 on success; 1 when arguments are missing or the conversion
///         throws an exception derived from std::exception.
int main(int argc, char **argv) {
    if (argc < 3) {
        printf("Usage: %s input.json output.bin\n", argv[0]);
        return 1;
    }
    try {
        encode(argv[1], argv[2]);
    } catch (const std::exception& e) {
        // Report the failure and exit with an error code instead of letting
        // the exception escape main and terminate the process.
        fprintf(stderr, "%s: %s\n", argv[0], e.what());
        return 1;
    }
    return 0;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import json | |
import random | |
# Usage: python tojson.py <input_file> <output_file> | |
# Adds a question with the correct answer index shuffled | |
def addQuestion(data, question, answers, labels):
    """Append one task record for `question` to `data`.

    The answers are put into a random order and `labels` is reordered with the
    same permutation, so each label stays attached to its answer. The record
    is stored under "single_correct"; "multiple_correct" is left empty.
    """
    order = list(range(len(answers)))
    random.shuffle(order)
    record = {
        "multiple_correct": {"answers": [], "labels": []},
        "question": f"Question: \"{question}\" Answer:",
        "single_correct": {
            "answers": [answers[k] for k in order],
            "labels": [labels[k] for k in order],
        },
    }
    data.append(record)
# Adds the question once for each possible answer,
# shuffled so that the correct answer appears once in each possible position
def addMultipleQuestions(data, question, answers, labels):
    """Append `len(answers)` task records for `question` to `data`.

    In the i-th record the answer labelled 1 sits at position i and the other
    answers are placed around it in random order. `labels` is reordered
    together with the answers. Raises ValueError when `answers` is non-empty
    but no label equals 1.
    """
    total = len(answers)
    for slot in range(total):
        correct = labels.index(1)
        others = [k for k in range(total) if k != correct]
        random.shuffle(others)
        # Put the correct answer at `slot`, keeping the shuffled rest around it.
        order = others[:slot] + [correct] + others[slot:]
        record = {
            "multiple_correct": {"answers": [], "labels": []},
            "question": f"Question: \"{question}\" Answer:",
            "single_correct": {
                "answers": [answers[k] for k in order],
                "labels": [labels[k] for k in order],
            },
        }
        data.append(record)
# Expects the correct answer to be A1 | |
def convert_to_json(file):
    """Parse the plain-text question file `file` and return it as a JSON string.

    Input format, one item per line, with question sets separated by blank
    lines:
      - "Q:] <text>"   the question
      - "A1:] <text>"  the correct answer (label 1)
      - any other non-blank line is taken as a wrong answer (label 0); its
        first five characters are dropped as the prefix

    Each set is passed to addQuestion(). Blank lines that do not close a set
    (leading, repeated or trailing ones) are ignored, so no empty records are
    emitted.
    """
    data = []
    question = ""
    answers = []
    labels = []
    with open(file, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Only close a set when something has actually been collected.
                if question or answers:
                    addQuestion(data, question, answers, labels)
                question = ""
                answers = []
                labels = []
            elif line.startswith("Q:] "):
                question = line[4:].strip()
            elif line.startswith("A1:] "):
                answers.append(line[5:].strip())
                labels.append(1)
            else:
                answers.append(line[5:].strip())
                labels.append(0)
    # Pick up the last set when the file does not end with a blank line.
    if question or answers:
        addQuestion(data, question, answers, labels)
    return json.dumps(data, indent=4, ensure_ascii=False)
def write_file(filename, data):
    """Write the string `data` to `filename`, replacing any existing file.

    The file is written as UTF-8 regardless of the platform's default
    encoding, because the JSON produced with ensure_ascii=False contains
    non-ASCII characters verbatim.
    """
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data)
if __name__ == "__main__":
    # Exactly two arguments are required: the plain-text source file and the
    # JSON file to create.
    if len(sys.argv) != 3:
        print("Usage: python tojson.py <input_file> <output_file>")
        sys.exit(1)

    input_file, output_file = sys.argv[1:]
    json_data = convert_to_json(input_file)
    write_file(output_file, json_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
encode.cpp
will convert a JSON file in the proper format to a .bin
file usable by llama.cpp's --binary-file and --multiple-choice flags. It expects a format of:
Which is the format you'll get if using the convert program here.
Like with that program:
To further simplify the process, I'm including
tojson.py
, which can turn a simpler plain-text format into the proper JSON, which can then be converted to binary. It expects the following format:
With a newline between each question set, and the correct answer as the first option. (The answer order will be shuffled in the JSON.) As an experiment, I've included a function
addMultipleQuestions
which you can replace the two calls to addQuestion
with - which will include the same question multiple times, once for each possible answer, with the correct answer moved to each possible position. So far I've not noticed any strong difference in scoring, and it costs extra processing time, so it's not the default. Use like: