Skip to content

Instantly share code, notes, and snippets.

@strawberrymelonpanda
Created May 16, 2024 19:47
Show Gist options
  • Save strawberrymelonpanda/2a94fa9b0872c250690612e263b159a4 to your computer and use it in GitHub Desktop.
Save strawberrymelonpanda/2a94fa9b0872c250690612e263b159a4 to your computer and use it in GitHub Desktop.
Llama.cpp multiple choice tasks encoder
#include <cstdint>
#include <cstdio>
#include <exception>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "json.hpp"
namespace {
// A set of candidate answers with one integer label per answer, plus the
// binary and JSON conversions used by the encoder.
//
// Binary layout produced by serialize():
//   uint32 answer count
//   for each answer: uint32 byte length, then the raw bytes (no terminator)
//   the labels as a contiguous array of int
struct Answers {
    std::vector<std::string> answers;
    std::vector<int> labels;

    // Writes this answer set to `out` in the layout described above.
    // deserialize() reads back exactly one label per answer, so `answers`
    // and `labels` are expected to have the same length.
    void serialize(std::ostream& out) const {
        const uint32_t n = static_cast<uint32_t>(answers.size());
        out.write(reinterpret_cast<const char *>(&n), sizeof(n));
        for (const auto& a : answers) {
            const uint32_t m = static_cast<uint32_t>(a.size());
            out.write(reinterpret_cast<const char *>(&m), sizeof(m));
            out.write(a.data(), m);
        }
        out.write(reinterpret_cast<const char *>(labels.data()),
                  static_cast<std::streamsize>(labels.size() * sizeof(int)));
    }

    // Reads an answer set previously written by serialize().
    // Returns false as soon as any read fails or the stored count is
    // negative; the contents of this object are unspecified in that case.
    bool deserialize(std::istream& in) {
        int n = 0;
        in.read(reinterpret_cast<char *>(&n), sizeof(n));
        if (in.fail() || n < 0) {
            return false;
        }
        answers.resize(n);
        labels.resize(n);
        for (auto& a : answers) {
            uint32_t m = 0;
            in.read(reinterpret_cast<char *>(&m), sizeof(m));
            // Stop before resizing: on a short read `m` carries no
            // meaningful length.
            if (in.fail()) {
                return false;
            }
            a.resize(m);
            in.read(&a[0], m);
            if (in.fail()) {
                return false;
            }
        }
        in.read(reinterpret_cast<char *>(labels.data()),
                static_cast<std::streamsize>(n) * sizeof(int));
        return !in.fail();
    }

    // Appends the entries of j["answers"] and j["labels"] to this object.
    // Uses at() so that a missing key raises a json exception instead of
    // invoking undefined behaviour on a const json value.
    void fromJson(const nlohmann::json& j) {
        for (const auto& elem : j.at("answers")) {
            answers.push_back(elem.get<std::string>());
        }
        for (const auto& elem : j.at("labels")) {
            labels.push_back(elem.get<int>());
        }
    }
};
// One multiple-choice task: the question text plus two answer sets.
//
// Binary layout produced by serialize():
//   uint32 question byte length, the raw question bytes,
//   then singleCorrect and multipleCorrect, each in Answers' own layout.
struct MultiplChoice {
    std::string question;
    Answers singleCorrect;
    Answers multipleCorrect;

    // Writes the task to `out` in the layout described above.
    void serialize(std::ostream& out) const {
        const uint32_t n = static_cast<uint32_t>(question.size());
        out.write(reinterpret_cast<const char *>(&n), sizeof(n));
        out.write(question.data(), n);
        singleCorrect.serialize(out);
        multipleCorrect.serialize(out);
    }

    // Reads a task previously written by serialize().
    // Returns false if the question or either answer set cannot be read.
    bool deserialize(std::istream& in) {
        // The length is unsigned, so only the stream state needs checking.
        uint32_t n = 0;
        in.read(reinterpret_cast<char *>(&n), sizeof(n));
        if (in.fail()) {
            return false;
        }
        question.resize(n);
        in.read(&question[0], n);
        if (in.fail()) {
            return false;
        }
        return singleCorrect.deserialize(in) && multipleCorrect.deserialize(in);
    }

    // Fills the task from a JSON object holding the keys "question",
    // "single_correct" and "multiple_correct". Uses at() so that a missing
    // key raises a json exception instead of invoking undefined behaviour
    // on a const json value.
    void fromJson(const nlohmann::json& j) {
        question = j.at("question").get<std::string>();
        singleCorrect.fromJson(j.at("single_correct"));
        multipleCorrect.fromJson(j.at("multiple_correct"));
    }
};
void serialize(std::ostream& out, const std::vector<MultiplChoice>& data) {
uint32_t n = data.size();
out.write((char *)&n, sizeof(n));
if (data.empty()) return;
std::vector<uint32_t> pos(data.size(), 0);
out.write((char *)pos.data(), pos.size() * sizeof(pos[0]));
int i = 0;
for (auto& d : data) {
pos[i++] = out.tellp();
d.serialize(out);
}
out.seekp(sizeof(n), std::ios::beg);
out.write((char *)pos.data(), pos.size() * sizeof(pos[0]));
}
void encode(const char* jsonFile, const char* binFile) {
std::ifstream jsonIn(jsonFile);
nlohmann::json jsonData;
jsonIn >> jsonData;
std::vector<MultiplChoice> data;
for (auto& elem : jsonData) {
MultiplChoice mc;
mc.fromJson(elem);
data.push_back(mc);
}
std::ofstream binOut(binFile, std::ios::binary);
serialize(binOut, data);
}
}
// Entry point: encode <input.json> <output.bin>.
// Returns 0 on success, and 1 on a usage error or when encoding fails.
int main(int argc, char **argv) {
    if (argc < 3) {
        printf("Usage: %s input.json output.bin\n", argv[0]);
        return 1;
    }
    try {
        encode(argv[1], argv[2]);
    } catch (const std::exception& e) {
        // Report parse and I/O failures with a non-zero exit status instead
        // of letting the exception terminate the process.
        fprintf(stderr, "%s: %s\n", argv[0], e.what());
        return 1;
    }
    return 0;
}
import sys
import json
import random
# Usage: python tojson.py <input_file> <output_file>
# Adds a question with the correct answer index shuffled
def addQuestion(data, question, answers, labels):
    """Append one task for `question` to `data` with its answers shuffled.

    `answers` and `labels` are parallel lists; the same random permutation is
    applied to both so each answer keeps its label. The question text is
    wrapped as 'Question: "<question>" Answer:' and the "multiple_correct"
    section is always left empty.
    """
    order = list(range(len(answers)))
    random.shuffle(order)
    entry = {
        "multiple_correct": {"answers": [], "labels": []},
        "question": f"Question: \"{question}\" Answer:",
        "single_correct": {
            "answers": [answers[k] for k in order],
            "labels": [labels[k] for k in order],
        },
    }
    data.append(entry)
# Adds the question once for each answer position,
# shuffled so that the correct answer appears exactly once in each possible position
def addMultipleQuestions(data, question, answers, labels):
    """Append `question` to `data` once per answer position.

    The i-th appended task has the correct answer (the first entry whose
    label is 1) at index i, with the remaining answers in random order.
    `answers` and `labels` are parallel lists and stay paired.

    Nothing is appended when `answers` is empty.

    Raises:
        ValueError: if `answers` is non-empty but no label equals 1.
    """
    if not answers:
        return
    if 1 not in labels:
        raise ValueError(f"no answer is labelled correct for question: {question!r}")

    # The position of the correct answer does not change between iterations.
    correct_answer_index = labels.index(1)
    other_indices = [j for j in range(len(answers)) if j != correct_answer_index]

    for i in range(len(answers)):
        shuffled_indices = list(other_indices)
        random.shuffle(shuffled_indices)
        shuffled_indices.insert(i, correct_answer_index)
        data.append({
            "multiple_correct": {"answers": [], "labels": []},
            "question": f"Question: \"{question}\" Answer:",
            "single_correct": {
                "answers": [answers[j] for j in shuffled_indices],
                "labels": [labels[j] for j in shuffled_indices],
            },
        })
# Expects the correct answer to be A1
def convert_to_json(file):
    """Convert a plain-text question file into a JSON string.

    The file is read as UTF-8. Question sets are separated by blank lines.
    Within a set, a line starting with "Q:] " gives the question, a line
    starting with "A1:] " gives the correct answer (label 1), and every other
    non-blank line is taken as a wrong answer (label 0) after dropping its
    first five characters.

    Blank lines that do not close a question set (leading, repeated or
    trailing ones) are ignored, so no empty entries are written.

    Returns the JSON text, indented by four spaces with non-ASCII characters
    kept as-is.
    """
    data = []
    question = ""
    answers = []
    labels = []
    with open(file, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Only emit a task if something was collected since the last
                # separator; otherwise this is just an extra blank line.
                if question or answers:
                    addQuestion(data, question, answers, labels)
                question = ""
                answers = []
                labels = []
            elif line.startswith("Q:] "):
                question = line[4:].strip()
            elif line.startswith("A1:] "):
                answers.append(line[5:].strip())
                labels.append(1)
            else:
                answers.append(line[5:].strip())
                labels.append(0)
    # Pick up the last one, if the file did not end with a blank line
    if question or answers:
        addQuestion(data, question, answers, labels)
    return json.dumps(data, indent=4, ensure_ascii=False)
def write_file(filename, data):
    """Write the text `data` to `filename` as UTF-8, replacing any existing file.

    The encoding is fixed rather than left to the platform default because
    the JSON is produced with ensure_ascii=False and may contain non-ASCII
    characters.
    """
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(data)
if __name__ == "__main__":
    # Exactly two arguments are required: the text input and the JSON output.
    if len(sys.argv) != 3:
        print("Usage: python tojson.py <input_file> <output_file>")
        sys.exit(1)

    input_file, output_file = sys.argv[1:]
    json_data = convert_to_json(input_file)
    write_file(output_file, json_data)
@strawberrymelonpanda
Copy link
Author

strawberrymelonpanda commented May 16, 2024

encode.cpp will convert a JSON file in the proper format to a .bin file usable by llama.cpp's --binary-file and --multiple-choice flags.

It expects a format of:

[
	{ 
            "multiple_correct": {"answers": [], "labels": [] },
	    "question": "Question: \"QUESTION TEXT\" Answer:",
		"single_correct": {
			"answers": [
				"ANSWER TEXT 1",
				"ANSWER TEXT 2",
				"ANSWER TEXT 3",
				"ANSWER TEXT 4"
			],
			"labels": [0,0,0,1]
		}
	},
        { 
            "multiple_correct": {"answers": [], "labels": [] },
	    "question": "Question: \"QUESTION TEXT\" Answer:",
		"single_correct": {
			"answers": [
				"ANSWER TEXT 1",
				"ANSWER TEXT 2",
				"ANSWER TEXT 3",
				"ANSWER TEXT 4"
			],
			"labels": [0,0,0,1]
		}
	}
]

Which is the format you'll get if using the convert program here.

Like with that program:

g++ -o encode encode.cpp
./encode arc-easy-validation.json arc-easy-validation.bin
./perplexity -m model -bf arc-easy-validation.bin --multiple-choice

To further simplify the process, I'm including tojson.py, which can turn a simpler plain-text format into the proper JSON, which can then be converted to binary. It expects the following format:

Q:] QUESTION TEXT
A1:] CORRECT ANSWER TEXT
A2:] ANSWER
A3:] ANSWER
A4:] ANSWER

Q:] QUESTION TEXT
A1:] CORRECT ANSWER TEXT
A2:] ANSWER
A3:] ANSWER
A4:] ANSWER

Leave a blank line between question sets, and put the correct answer first. (The answer order will be shuffled in the JSON.) As an experiment, I've included a function, addMultipleQuestions, which you can substitute for the two calls to addQuestion. It adds the same question once per answer, with the correct answer placed in each possible position in turn. So far I've not noticed any strong difference in scoring, and it costs extra processing time, so it's not the default.

Use like:

python tojson.py custom-test.txt custom-test.json
./encode custom-test.json custom-test.bin
./perplexity -m model -bf custom-test.bin --multiple-choice

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment