Last active
August 1, 2023 06:10
-
-
Save AndyGrant/b3a70ea5f5ff4285aa60076381ee30c4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <argp.h> | |
#include <stdbool.h> | |
#include <stdint.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <dirent.h> | |
#include <sys/stat.h> | |
#include <sys/types.h> | |
/// printf-style template for the names of the interim (partial) files. The
/// single %d is replaced with the zero-based index of the interim file. The
/// name is relative, so the files are created in the current directory.
const char *const format = "tmp.byteshuf.partial.%d";
/// One sample. Samples are read from and written to files as raw bytes in
/// units of sizeof(Data), so the order and types of the fields define the
/// on-disk record layout and must not be changed.
typedef struct Data {
    uint64_t occupied;   // 8-byte occupancy bitboard ( No Kings )
    int16_t  eval;       // 2-byte int for the target evaluation
    uint8_t  result;     // 1-byte int for result. { L=0, D=1, W=2 }
    uint8_t  turn;       // 1-byte int for the side-to-move flag
    uint8_t  wking;      // 1-byte int for the White King Square
    uint8_t  bking;      // 1-byte int for the Black King Square
    uint8_t  packed[15]; // 1-byte int per two non-King pieces
} Data;
/// Command-line settings, filled in by parse_opt(). The field order matters:
/// main() initialises this struct positionally.
struct arguments {
    bool verbose;                     // Print progress messages when true
    size_t per_file, chunk_size;      // Samples per output file (0 = one single
                                      // output file); samples per interim file
    char *input, *output, *directory; // Input file, output file name, and input
                                      // directory; NULL when not provided
};
/// Option table handed to argp. Columns are: long name, short key, argument
/// placeholder shown in --help (0 = the option takes no argument), flags, help
/// text, and help group. The table is terminated by an all-zero entry.
static struct argp_option options[] = {
    { "verbose",    'v', 0,      0, "Produce verbose output",                 0 },
    { "directory",  'd', "DIR",  0, "Directory of files of unshuffled bytes", 1 },
    { "input",      'i', "FILE", 0, "Source file of unshuffled bytes",        1 },
    { "output",     'o', "FILE", 0, "Output file for shuffled bytes",         1 },
    { "per-file",   'n', "N",    0, "Samples per output file (33554432)",     2 },
    { "chunk-size", 's', "N",    0, "Samples per interim file (33554432)",    2 },
    { 0 },
};
static error_t parse_opt(int key, char *arg, struct argp_state *state) { | |
struct arguments *arguments = state->input; | |
switch (key) { | |
case 'v': arguments->verbose = true; break; | |
case 'd': arguments->directory = strdup(arg); break; | |
case 'i': arguments->input = strdup(arg); break; | |
case 'o': arguments->output = strdup(arg); break; | |
case 'n': arguments->per_file = atoll(arg); break; | |
case 's': arguments->chunk_size = atoll(arg); break; | |
case ARGP_KEY_ARG: return 0; | |
default: return ARGP_ERR_UNKNOWN; | |
} | |
return 0; | |
} | |
/// Returns true when fname names an existing directory. Returns false when it
/// names anything else or when stat() fails (for example, no such path).
static bool is_directory(const char *fname) {

    struct stat path_stat;

    // On failure stat() leaves path_stat unspecified, so it must not be read
    if (stat(fname, &path_stat) != 0)
        return false;

    return S_ISDIR(path_stat.st_mode);
}
static FILE* open_file_in_directory(struct arguments *args, const char *fname) { | |
char tempfile[512]; | |
sprintf(tempfile, "%s%s", args->directory, fname); | |
return fopen(tempfile, "rb"); | |
} | |
/// Returns the next value of a 64-bit xorshift-and-multiply pseudo-random
/// generator. The state starts from a fixed seed and is kept in a function
/// local static, so the sequence is identical on every run of the program.
static uint64_t rand64(void) {

    // http://vigna.di.unimi.it/ftp/papers/xorshift.pdf

    static uint64_t seed = 1070372ull;

    seed ^= seed >> 12;
    seed ^= seed << 25;
    seed ^= seed >> 27;

    return seed * 2685821657736338717ull;
}
/// Exchanges the samples at indices x and y of data. The indices are size_t
/// so that positions beyond INT_MAX are not truncated on the way in.
static void swap_data(Data *data, size_t x, size_t y) {
    const Data temp = data[x];
    data[x] = data[y];
    data[y] = temp;
}
/// Shuffles the first count samples of data in place and writes them to the
/// next interim file, whose name is built from the global format template and
/// the current value of *nfiles. *nfiles is always incremented, even when
/// count is zero and no file is written.
///
/// Exits the program when the interim file cannot be created or fully written.
static void shuffle_and_save(struct arguments *args, size_t count, Data *data, int *nfiles) {

    char tempfile[512];
    snprintf(tempfile, sizeof(tempfile), format, (*nfiles)++);

    // Fisher-Yates shuffle: the last unsettled slot is exchanged with a slot
    // chosen from the unsettled prefix only. Picking from the whole array for
    // every slot would not produce all permutations with equal probability.
    for (size_t i = count; i > 1; i--)
        swap_data(data, i - 1, rand64() % i);

    if (count) {

        FILE *fout = fopen(tempfile, "wb");

        if (fout == NULL) {
            perror(tempfile);
            exit(EXIT_FAILURE);
        }

        const size_t written = fwrite(data, sizeof(Data), count, fout);

        // fclose() flushes, so a full disk may only show up here
        if (fclose(fout) != 0 || written != count) {
            fprintf(stderr, "Unable to write interim file: %s\n", tempfile);
            exit(EXIT_FAILURE);
        }
    }

    if (args->verbose)
        printf("Read %zu entries and saved to %s\n", count, tempfile);
}
/// Reads samples from fin into data, starting after the leftovers samples that
/// are already buffered from earlier input. Every time the buffer reaches
/// args->chunk_size samples it is shuffled and saved as an interim file.
///
/// Returns the number of samples left in the buffer that have not been saved
/// yet. Samples carried in through leftovers are kept even when fin yields
/// nothing at all (for example, an empty file).
static size_t process_input_file(struct arguments *args, FILE *fin, Data *data, size_t leftovers, int *nfiles) {

    for (;;) {

        const size_t read_size = args->chunk_size - leftovers;
        const size_t entries_read = fread(&data[leftovers], sizeof(Data), read_size, fin);

        // Short read: the input is exhausted, so everything buffered so far
        // (old leftovers plus what was just read) is carried to the caller
        if (entries_read != read_size)
            return leftovers + entries_read;

        shuffle_and_save(args, args->chunk_size, data, nfiles);
        leftovers = 0;

        // A "full" read of zero samples can only happen when there was no room
        // left to read into; stop instead of looping forever
        if (!entries_read)
            return 0;
    }
}
static size_t read_all_input_files(struct arguments *args, int *nfiles) { | |
size_t leftovers = 0; | |
Data *data = malloc(sizeof(Data) * args->chunk_size); | |
if (args->directory != NULL) { | |
struct dirent *dirent; | |
DIR *directory = opendir(args->directory); | |
if (!directory) { | |
printf("Unable to read directory: %s\n", args->directory); | |
exit(EXIT_FAILURE); | |
} | |
while ((dirent = readdir(directory)) != NULL) { | |
if (dirent->d_name[0] == '.') | |
continue; | |
FILE *fin = open_file_in_directory(args, dirent->d_name); | |
leftovers = process_input_file(args, fin, data, leftovers, nfiles); | |
fclose(fin); | |
} | |
closedir(directory); | |
} | |
if (args->input != NULL) { | |
FILE *fin = fopen(args->input, "rb"); | |
leftovers = process_input_file(args, fin, data, leftovers, nfiles); | |
fclose(fin); | |
} | |
if (leftovers) | |
shuffle_and_save(args, leftovers, data, nfiles); | |
return leftovers; | |
} | |
static FILE *open_output_file(struct arguments *args, int out_idx) { | |
/// When using a single output file, simply open the output filename | |
/// that was provided. Otherwise, open files appending .%d for the out_idx | |
if (!args->per_file) | |
return fopen(args->output, "wb"); | |
char fname[512]; | |
sprintf(fname, "%s.%d", args->output, out_idx); | |
return fopen(fname, "wb"); | |
} | |
static void pop_and_save(FILE *fout, FILE **partials, size_t *remaining, size_t total_remaining) { | |
/// Randomly select a partial file, weighted based on the remaining entries in each | |
/// partial file, and then pop off the top entry from that file and save it. Close | |
/// partial files once all of the entries in the file have been popped off | |
Data data; | |
int input_idx = 0; | |
uint64_t pop = rand64() % total_remaining; | |
while (pop >= remaining[input_idx]) | |
pop -= remaining[input_idx++]; | |
if (fread(&data, sizeof(Data), 1, partials[input_idx]) != (size_t) 1) | |
printf("Error trying to read files...\n"); | |
fwrite(&data, sizeof(Data), 1, fout); | |
if (!--remaining[input_idx]) | |
fclose(partials[input_idx]); | |
} | |
static void close_output_file(struct arguments *args, int out_idx, FILE *fout, size_t total, size_t saved) { | |
/// Close file and report progress on total outputs when in verbose mode | |
if (args->verbose && !args->per_file) | |
printf("Finished writing to %s (%zd of %zd)\n", args->output, saved, total); | |
else if (args->verbose) | |
printf("Finished writing to %s.%d (%zd of %zd)\n", args->output, out_idx, saved, total); | |
fclose(fout); | |
} | |
static void output_from_partials(struct arguments *args, FILE **partials, size_t *remaining, int nfiles) { | |
/// While there are still entries in the data files, grab one at a time | |
/// randomly and save it to the output file. Use multiple, fixed length, | |
/// output files unless told otherwise. Randomization is a function of the | |
/// number of remaining entries in a given partial file at the current time | |
const size_t total_entries = remaining[nfiles - 1] + args->chunk_size * (nfiles - 1); | |
const size_t entries_per = !args->per_file ? total_entries : args->per_file; | |
const int cnt_output_files = !args->per_file ? 1 : 1 + (total_entries - 1) / args->per_file; | |
size_t total_saved = 0; | |
for (int out_idx = 0; out_idx < cnt_output_files; out_idx++) { | |
FILE *fout = open_output_file(args, out_idx); | |
for (size_t i = 0; i < entries_per && total_entries != total_saved; i++, total_saved++) | |
pop_and_save(fout, partials, remaining, total_entries - total_saved); | |
close_output_file(args, out_idx, fout, total_entries, total_saved); | |
} | |
} | |
int main(int argc, char *argv[]) { | |
struct argp argp = { options, parse_opt, "", "" }; | |
struct arguments arguments = { false, 33554432, 33554432, NULL, NULL, NULL }; | |
argp_parse(&argp, argc, argv, 0, 0, &arguments); | |
if (arguments.verbose) { | |
const int file_size = sizeof(Data) * arguments.chunk_size / (1024 * 1024); | |
printf("Using tempfiles of size %dMB\n", file_size); | |
printf("Saving %zd entries per output file\n", arguments.per_file); | |
printf("Storing %zd entries per interim file\n", arguments.chunk_size); | |
} | |
int nfiles = 0; | |
FILE **partials; | |
size_t *remaining; | |
size_t leftovers = read_all_input_files(&arguments, &nfiles); | |
partials = malloc(nfiles * sizeof(FILE* )); | |
remaining = malloc(nfiles * sizeof(size_t)); | |
for (int i = 0; i < nfiles; i++) { | |
char tempfile[512]; | |
sprintf(tempfile, format, i); | |
partials[i] = fopen(tempfile, "rb"); | |
remaining[i] = arguments.chunk_size; | |
} | |
if (leftovers) remaining[nfiles-1] = leftovers; | |
output_from_partials(&arguments, partials, remaining, nfiles); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment