rbnelr/gist:0534beeec324dfe88fdbd5d8be3b28e5

## gistfile1.txt
#define _CRT_SECURE_NO_WARNINGS

// img loading
#define STB_IMAGE_IMPLEMENTATION

#define STBI_ONLY_BMP	1
#define STBI_ONLY_PNG	1
#define STBI_ONLY_TGA	1
#define STBI_ONLY_JPEG	1

#include "deps/stb/stb_image.h"


#include "../imgl/imgl.hpp"
using namespace imgl::basic_typedefs;
using namespace imgl::math;
typedef imgl::s32v2		iv2;
typedef imgl::fv2		v2;

typedef imgl::rgba8		rgba8;
typedef imgl::rgbf		rgbf;
typedef imgl::rgbaf		rgbaf;

using imgl::lerp;
using imgl::map;

#include <vector>
#include <memory>
using std::unique_ptr;
using std::make_unique;

#include <unordered_map>
#include <algorithm>

// Slot id handed to the next texture streamed this frame (reset to 0 in end_stream_textures).
int stream_textures_cur = 0;
// Textures created by imgui_stream_texture, keyed by slot id; begin_user_texture moves entries out of this map.
std::unordered_map<int, unique_ptr<imgl::Texture2D>> stream_textures;

// End-of-frame cleanup for streamed textures: restart the slot numbering and drop whatever is still in the map.
void end_stream_textures () {
	stream_textures_cur = 0;
	// anything left here was created in imgui_stream_texture but never handed out by begin_user_texture
	// (for ex. the image was off-screen), so delete it now
	stream_textures.clear();
}

// Callback handed to imgl: looks up the streamed texture whose slot id was smuggled through `id`,
// removes it from stream_textures and returns ownership of it to the caller.
BEGIN_USER_TEXTURE_CALLBACK(begin_user_texture) {
	auto it = stream_textures.find((int)(uptr)id);
	assert(it != stream_textures.end());

	auto tex = std::move( it->second );

	// erase through the iterator we already have: avoids a second hash lookup and avoids passing
	// a reference to the key that lives inside the very node being destroyed
	stream_textures.erase(it);

	return tex;
}
// no end_user_texture, we want the lib to simply drop (delete) the texture after displaying it

// Uploads `pixels` (size.x * size.y linear rgbf values) as a one-frame texture and emits an ImGui::Image for it.
//  display_size.x == 0 means "use the remaining content width"; display_size.y is used as is.
//  The texture is stored under the current slot id, which is passed to ImGui as the texture id.
void imgui_stream_texture (rgbf const* pixels, iv2 size, v2 display_size, bool nearest_filter=true) {
	auto const filter = nearest_filter ? imgl::FILTER_NEAREST : imgl::FILTER_BILINEAR;

	stream_textures[stream_textures_cur] = imgl::imgui_make_user_texture(imgl::PF_LRGBF, pixels, size, filter);

	f32 const display_w = display_size.x ? display_size.x : ImGui::GetContentRegionAvailWidth();

	// the slot id itself is the texture id, so cast to ImTextureID (not to a pointer to ImTextureID)
	ImGui::Image((ImTextureID)(uptr)stream_textures_cur, ImVec2(display_w, display_size.y));

	++stream_textures_cur;
}
// Greyscale overload: converts size.x * size.y bytes to linear rgbf and forwards to the rgbf overload.
void imgui_stream_texture (u8 const* pixels, iv2 size, v2 display_size, bool nearest_filter=true) {
	int const pixel_count = size.x * size.y;

	auto linear = make_unique<rgbf[]>( pixel_count );

	for (int p=0; p<pixel_count; ++p) {
		// byte value -> [0,1] -> linear
		linear[p] = rgbf( imgl::to_linear((f32)pixels[p] / 255.0f) );
	}

	imgui_stream_texture(linear.get(), size, display_size, nearest_filter);
}

/*
	Trying to implement a neural network evaluator and trainer after having watched 3blue1brown's neural network videos:
		https://www.youtube.com/watch?v=aircAruvnKk

	shorthand definitions:

	nn = neural network

	first layer:	each neuron only has activation (pixel value [0,1] of input image)
	hidden layers:	each neuron has weights[layer[i-1].neurons.count] + bias + activation (when evaluating)
	final layer:	like hidden layers, but we define a meaning for its neurons' activations (and an expected activation for each input set)

	input layer:	the first layer
	real neuron:	all neurons which have weights and a bias, i.e. not the neurons of the first layer, since they are just conceptual neurons; they do not process signals, they define the source signal (I guess you could call them sensors)
	real layer:		see above, each layer except the input layer
	output layer:	the last layer

*/

// Reverses the byte order of a 32 bit integer (the dataset files store their header fields MSB first).
// The shifts are done on an unsigned copy: left-shifting a negative signed value is undefined behavior
// before C++20 and right-shifting one is implementation-defined.
s32 bswap (s32 i) {
	unsigned int const v = (unsigned int)i;

	return (s32)(	((v << 24) & 0xff000000u)|
					((v <<  8) & 0x00ff0000u)|
					((v >>  8) & 0x0000ff00u)|
					((v >> 24) & 0x000000ffu));
}

// In-memory view of a label file: `raw` owns the file bytes, `header` and `labels` point into them
// (both are set up by load_label_data).
struct Label_Data {
	imgl::Blob	raw;
	/*
	[offset] [type]          [value]          [description]
	0000     32 bit integer  0x00000801(2049) magic number (MSB first)
	0004     32 bit integer  60000            number of items
	0008     unsigned byte   ??               label
	0009     unsigned byte   ??               label
	........
	xxxx     unsigned byte   ??               label
	*/
	// Overlaid directly on the first bytes of the file, so the member order and sizes must match the table above.
	struct Header {
		s32	magic_number; // 0x00000801
		s32 items_count;
		// u8 labels[items_count];
	};

	Header*	header; // points at the start of raw.data
	u8*		labels; // first byte after the header
};
// Loads a label file into `data` and converts its header fields to host byte order (in place).
// Returns false when the file cannot be read, is too small to contain the header, has the wrong
// magic number, or claims more labels than the file actually contains.
bool load_label_data (cstr filepath, Label_Data* data) {
	if (!imgl::load_binary_file(filepath, &data->raw))
		return false;

	data->header = (Label_Data::Header*)data->raw.data;
	data->labels = (u8*)(data->header +1);

	// never read (or byte-swap) header fields that are not backed by file bytes
	unsigned long long const file_size = (unsigned long long)data->raw.size;
	if (file_size < sizeof(Label_Data::Header))
		return false;

	data->header->magic_number = bswap(data->header->magic_number);
	data->header->items_count = bswap(data->header->items_count);

	if (data->header->magic_number != 0x00000801)
		return false;

	// the item count comes from the file, make sure every label it promises is really there
	unsigned long long const payload_size = file_size -sizeof(Label_Data::Header);
	if (data->header->items_count < 0 || (unsigned long long)data->header->items_count > payload_size)
		return false;

	return true;
}

// In-memory view of an image file: `raw` owns the file bytes, `header` and `pixels` point into them
// (both are set up by load_image_data).
struct Image_Data {
	imgl::Blob	raw;
	/*
	[offset] [type]          [value]          [description]
	0000     32 bit integer  0x00000803(2051) magic number
	0004     32 bit integer  60000            number of images
	0008     32 bit integer  28               number of rows
	0012     32 bit integer  28               number of columns
	0016     unsigned byte   ??               pixel
	0017     unsigned byte   ??               pixel
	........
	xxxx     unsigned byte   ??               pixel
	*/
	// Overlaid directly on the first bytes of the file.
	struct Header {
		s32	magic_number; // 0x00000803
		s32 img_count;
		iv2 img_size;
		// u8 pixels[img_count * img_size.y * img_size.x];
	};

	Header*	header;
	u8*		pixels;

	// Pointer to the first pixel of image number img_indx (images are stored back to back).
	u8* get_pixels (int img_indx) {
		int const pixels_per_img = header->img_size.x * header->img_size.y;
		return pixels +(img_indx * pixels_per_img);
	}
};
// Loads an image file into `data` and converts its header fields to host byte order (in place).
// Returns false when the file cannot be read, is too small to contain the header, has the wrong
// magic number, has a negative count/size, or claims more pixels than the file actually contains.
bool load_image_data (cstr filepath, Image_Data* data) {
	if (!imgl::load_binary_file(filepath, &data->raw))
		return false;

	data->header = (Image_Data::Header*)data->raw.data;
	data->pixels = (u8*)(data->header +1);

	// never read (or byte-swap) header fields that are not backed by file bytes
	unsigned long long const file_size = (unsigned long long)data->raw.size;
	if (file_size < sizeof(Image_Data::Header))
		return false;

	data->header->magic_number = bswap(data->header->magic_number);
	data->header->img_count = bswap(data->header->img_count);
	data->header->img_size.x = bswap(data->header->img_size.x);
	data->header->img_size.y = bswap(data->header->img_size.y);

	if (data->header->magic_number != 0x00000803)
		return false;

	if (data->header->img_count < 0 || data->header->img_size.x < 0 || data->header->img_size.y < 0)
		return false;

	// counts come from the file, make sure every pixel they promise is really there
	// (three values below 2^31 would overflow 64 bits when multiplied, so compare by division)
	unsigned long long const payload_size = file_size -sizeof(Image_Data::Header);
	unsigned long long const pixels_per_img = (unsigned long long)data->header->img_size.x * (unsigned long long)data->header->img_size.y;
	if (pixels_per_img != 0 && (unsigned long long)data->header->img_count > payload_size / pixels_per_img)
		return false;

	return true;
}

// The currently loaded dataset (filled by load_training_data).
Label_Data training_data_label;
Image_Data training_data_images;

// Indices of all inputs, used to draw every input once per epoch in random order without a full shuffle:
// get_random_input_index_unique_in_epoch picks a random slot among the not yet used ones and swaps it with
// the last not yet used slot, so the range to pick from shrinks by one per draw.
// This changes the order of the array, but we dont care, since we only want random subsets of it anyway.
std::vector<int>	random_input_buffer;
int					epoch_inputs_processed; // inputs drawn so far in the current epoch
int					epoch; // number of finished epochs

int					correct_evals_in_last_epoch;
int					correct_evals_in_epoch; // counted up by input_classified
std::vector<int>	correct_evals_in_epochs; // one entry per finished epoch

int		minibatch_size = 50; // inputs averaged per gradient in calc_gradient

// Restarts the epoch bookkeeping. epoch_of_loaded_nn is the number of epochs the network has already
// completed; the history gets that many entries, all zero.
void reset_epoch (int epoch_of_loaded_nn=0) {
	correct_evals_in_epochs.assign(epoch_of_loaded_nn, 0);
	correct_evals_in_last_epoch = correct_evals_in_epoch = 0;
	epoch_inputs_processed = 0;
	epoch = epoch_of_loaded_nn;
}
// Closes the current epoch: archives its correct-evaluation count and starts counting from zero again.
void epoch_finished () {
	correct_evals_in_epochs.push_back(correct_evals_in_epoch);
	correct_evals_in_last_epoch = correct_evals_in_epoch;

	correct_evals_in_epoch = 0;
	epoch_inputs_processed = 0;
	++epoch;
}

// Per input: was it classified correctly the last time it was evaluated (maintained by input_classified).
std::vector<bool>	inputs_correct;
int					correct_inputs; // number of true entries in inputs_correct
bool				correct_inputs_changed = true; // tells imgui_nn_debug_view to re-sort its list

// Loads the label and image files of a dataset and resets the per-input bookkeeping.
// Returns false if either file fails to load or if the two files do not describe the same number of
// inputs (get_input indexes both with the same index, so a mismatch would read out of bounds).
bool load_training_data (cstr labels, cstr images) {
	if (!load_label_data(labels, &training_data_label)) {
		return false;
	}

	if (!load_image_data(images, &training_data_images)) {
		return false;
	}

	int const items_count = training_data_label.header->items_count;

	if (items_count < 0 || items_count != training_data_images.header->img_count) {
		return false;
	}

	// identity permutation, gets reordered while drawing random inputs
	random_input_buffer.resize(items_count);

	for (int i=0; i<items_count; ++i) {
		random_input_buffer[i] = i;
	}

	inputs_correct.assign(items_count, false);
	correct_inputs = 0;
	correct_inputs_changed = true;

	return true;
}

#include <random>

// Shared random number generator, used for drawing inputs and for the random weight initialization.
std::default_random_engine			random_engine;

int get_random_input_index_unique_in_epoch () {
	if (epoch_inputs_processed == (int)random_input_buffer.size()) {
		epoch_finished();
	}

	int epoch_inputs_remaining = (int)random_input_buffer.size() -epoch_inputs_processed;

	std::uniform_int_distribution<int>	distribution (0, epoch_inputs_remaining -1);

	int random_indx = distribution(random_engine);

	int random_input_index = random_input_buffer[random_indx];

	std::swap(random_input_buffer[random_indx], random_input_buffer[epoch_inputs_remaining -1]); // this input cannot be picked again in the next interation, since we will not include it in the next random index generarion, this changes the order of the random_input_buffer array, but we dont care, since we only want random subsets of it

	epoch_inputs_processed++;

	return random_input_index;
}

#define DBG_USE_STD_VECTOR 1

// Fixed-length array of f32. Storage is a std::vector when DBG_USE_STD_VECTOR is set, else a unique_ptr array.
// A default constructed Vector has length 0; use Vector::alloc to get one with elements.
class Vector {

	#if DBG_USE_STD_VECTOR
	std::vector<f32>	_data;
	#else
	unique_ptr<f32[]>	_data;
	#endif
	int					_sz = 0; // was left uninitialized for default constructed Vectors

public:

	// Creates a Vector with len elements.
	static Vector alloc (int len) {
		Vector vec;
		vec._sz = len;
		#if DBG_USE_STD_VECTOR
		vec._data.resize(len);
		#else
		vec._data = make_unique<f32[]>(len);
		#endif
		return vec;
	}

	int length () const {	return _sz; }

	// Pointer to the first element. Does not index into the storage, so it is also valid to call on an empty Vector.
	f32* data () {
		#if DBG_USE_STD_VECTOR
		return _data.data();
		#else
		return _data.get();
		#endif
	}
	f32 const* data () const {
		#if DBG_USE_STD_VECTOR
		return _data.data();
		#else
		return _data.get();
		#endif
	}
	int size_bytes () const { return _sz * (int)sizeof(f32); }

	// Sets every element to val.
	void clear (f32 val) {
		for (int i=0; i<_sz; ++i)
			_data[i] = val;
	}

	// Element access, bounds are only checked via assert.
	inline f32 const&	at (int indx) const {
		assert(indx >= 0 && indx < _sz);
		return _data[indx];
	}
	inline f32&			at (int indx) {
		assert(indx >= 0 && indx < _sz);
		return  _data[indx];
	}

};

// Element order of _Matrix storage: ROW_MAJOR indexes as y * width + x, COLUMN_MAJOR as x * height + y.
enum e_matrix_layout { ROW_MAJOR=0, COLUMN_MAJOR };

// Fixed-size 2d array of f32 with a compile-time selectable storage order.
// Storage is a std::vector when DBG_USE_STD_VECTOR is set, else a unique_ptr array.
// A default constructed matrix has size (0,0); use alloc to get one with elements.
template <e_matrix_layout LAYOUT>
class _Matrix {

	#if DBG_USE_STD_VECTOR
	std::vector<f32>	_data;
	#else
	unique_ptr<f32[]>	_data;
	#endif
	iv2					_sz = iv2(0,0);

public:

	// Creates a matrix that is x elements wide and y elements high.
	static _Matrix alloc (int x, int y) {
		_Matrix mat;
		mat._sz = iv2(x,y);
		#if DBG_USE_STD_VECTOR
		mat._data.resize(mat._sz.x * mat._sz.y);
		#else
		mat._data = make_unique<f32[]>( mat._sz.x * mat._sz.y );
		#endif
		return mat;
	}
	static _Matrix alloc (iv2 size) {	return alloc(size.x,size.y); }

	iv2 size () const {		return _sz; }
	int length () const {	return _sz.x * _sz.y; } // total element count

	// Pointer to the first element. Does not index into the storage, so it is also valid to call on an empty matrix.
	f32* data () {
		#if DBG_USE_STD_VECTOR
		return _data.data();
		#else
		return _data.get();
		#endif
	}
	f32 const* data () const {
		#if DBG_USE_STD_VECTOR
		return _data.data();
		#else
		return _data.get();
		#endif
	}
	int size_bytes () const { return length() * (int)sizeof(f32); }

	// Sets every element to val.
	void clear (f32 val) {
		for (int i=0; i<length(); ++i)
			_data[i] = val;
	}

	// Element access, bounds are only checked via assert.
	inline f32 const&	at (int x, int y) const {
		assert(all(iv2(x,y) >= 0 && iv2(x,y) < _sz));
		return LAYOUT == ROW_MAJOR ? _data[y * _sz.x + x] : _data[x * _sz.y + y];
	}
	inline f32&			at (int x, int y) {
		assert(all(iv2(x,y) >= 0 && iv2(x,y) < _sz));
		return LAYOUT == ROW_MAJOR ? _data[y * _sz.x + x] : _data[x * _sz.y + y];
	}

	inline f32 const&	at (iv2 pos) const {	return at(pos.x,pos.y); }
	inline f32&			at (iv2 pos) {			return at(pos.x,pos.y); }

};

// The matrix type used throughout this file (row major storage).
typedef _Matrix<ROW_MAJOR> Matrix;

// The neural network data needed to process input into output (all neurons with weights and biases), this is what we are training
// One real layer (hidden or output): its trainable parameters plus the scratch data of the last
// evaluation and the gradient accumulated for the current minibatch. All members are sized in build_nn.
struct Layer {
	int			neurons_count; // neurons in this layer
	int			weights_count; // weights per neuron == number of activations feeding this layer

	Matrix				weights;		// x: weights	y: neurons
	Vector				biases;			//				   neurons

	// written by eval_nn
	Vector				inp_activations;

	Vector				inp_z; // activation before applying activation func

	// written by calc_gradient_via_backprop
	Vector				delta;

	// accumulated by calc_gradient_via_backprop, consumed by apply_gradient
	Matrix				weights_gradient;
	Vector				biases_gradient;
};

// input is identified via a index
// Index (into the dataset) of the input currently loaded by get_input.
int								input_indx;

Vector							input_buffer; // pixels of the current input scaled to [0,1]
u8								inp_label; // label of the current input

// Makes dataset entry indx the current input: remembers its index and label and
// copies its pixels, scaled from bytes to [0,1], into input_buffer.
void get_input (int indx) {
	input_indx = indx;
	inp_label = training_data_label.labels[indx];

	int const pixel_count = training_data_images.header->img_size.x * training_data_images.header->img_size.y;
	u8 const* src = training_data_images.get_pixels(indx);

	for (int p=0; p<pixel_count; ++p)
		input_buffer.at(p) = (f32)src[p] / 255;
}
// Loads a random input that has not been used yet in the current epoch.
void get_random_input_from_minibatch () {
	int const indx = get_random_input_index_unique_in_epoch();
	get_input(indx);
}

void input_classified (int label) {

	bool correct = label == inp_label;

	bool was_correct = inputs_correct[input_indx];
	if (was_correct != correct) {

		correct_inputs += correct ? +1 : -1;

		inputs_correct[input_indx] = correct;

		correct_inputs_changed = true;
	}

	if (correct)
		correct_evals_in_epoch++;
}

//
// The network: hidden layers + output layer. The input layer is not stored here since it has
// no weights and biases, its activations live in input_buffer.
std::vector<Layer>				layers; // hidden layers + output layer (the first layer actually represents the input and has no weights and biases, just the activations)

// Target activation of an output neuron: 1 for the neuron whose index equals the label, 0 for all others.
f32 expected_output (int output_neuron_indx, u8 input_label) {
	if (output_neuron_indx == (int)input_label)
		return 1.0f;
	return 0.0f;
}

//
// (Re)creates input_buffer and all layers for the given input image size and neuron count per layer.
// Only allocates; the weights and biases are not given meaningful values here.
void build_nn (iv2 input_size, std::vector<int> const& layers_neuron_count) {

	int const input_length = input_size.x * input_size.y;

	input_buffer = Vector::alloc(input_length);

	layers.resize( (int)layers_neuron_count.size() );

	// every neuron has one weight per activation of the layer before it
	int feeding_activations = input_length;

	for (int i=0; i<(int)layers.size(); ++i) {
		Layer& layer = layers[i];

		layer.neurons_count = layers_neuron_count[i];
		layer.weights_count = feeding_activations;
		feeding_activations = layer.neurons_count;

		iv2 const weights_size = iv2(layer.weights_count, layer.neurons_count);

		layer.weights =				Matrix::alloc(weights_size);
		layer.biases =				Vector::alloc(layer.neurons_count);

		layer.inp_activations =		Vector::alloc(layer.neurons_count);

		layer.inp_z =				Vector::alloc(layer.neurons_count);
		layer.delta =				Vector::alloc(layer.neurons_count);

		layer.weights_gradient =	Matrix::alloc(weights_size);
		layer.biases_gradient =		Vector::alloc(layer.neurons_count);
	}
}

//
// Parameters of the distribution the initial weights and biases are drawn from.
f32 mean = 1 ? 0 : 0.5f; // hand-edited switch, currently selects 0
f32 deviation = 1;

std::normal_distribution<f32>	normal_distribution (mean, deviation);

// How random_value scales the random numbers.
enum init_mode {
	RIM_NAIVE=0,
	RIM_XAVIER,
};
// Display names, indexed by init_mode.
cstr init_mode_str[] = {
	"NAIVE",
	"XAVIER",
};

// Draws one random initial value.
//  RIM_NAIVE:			plain sample of normal_distribution
//  everything else:	sample scaled by sqrt(1 / neuron_count)
f32 random_value (init_mode mode, int neuron_count) {
	if (mode == RIM_NAIVE)
		return normal_distribution(random_engine);

	// RIM_XAVIER (also used for unknown modes)
	return sqrt(1.0f / neuron_count) * normal_distribution(random_engine);
}

// Fills all weights and biases of the network with random values.
// The scale of a weight is derived from the number of inputs feeding its neuron (weights_count),
// which is what keeps the weighted sum of a neuron at roughly unit variance; the code used to pass
// the layer's own neuron count instead, which leaves layers with many inputs (like the first one) saturated.
void init_nn_with_random_weights (init_mode mode) {
	for (auto& l : layers) {
		for (int n=0; n<l.neurons_count; ++n) {
			for (int w=0; w<l.weights_count; ++w) {
				l.weights.at(w,n) = random_value(mode, l.weights_count);
			}
		}
		// biases are not summed over the inputs, their scaling is left as it was
		for (int n=0; n<l.neurons_count; ++n) {
			l.biases.at(n) = random_value(mode, l.neurons_count);
		}
	}
}

//
// Logistic function 1 / (1 + e^-x).
// Always evaluates exp() of a non-positive number: e^x overflows to infinity for large positive x,
// which made the old form e^x / (e^x + 1) return inf/inf = NaN instead of 1.
f32 sigmoid (f32 x) {
	if (x >= 0) {
		//		1   / (e^(-x) +1)
		f32 tmp = exp(-x);
		return 1 / (tmp +1);
	} else {
		// OR:	e^x / (e^x    +1)
		f32 tmp = exp(x);
		return tmp / (tmp +1);
	}
}
// Rectified linear unit: the larger of x and 0.
f32 ReLU (f32 x) {
	f32 const lower_bound = 0.0f;
	return max(x, lower_bound);
}

// The activation function used by every neuron (switch by swapping the commented line).
f32 activation_func (f32 x) {
	f32 const activation = sigmoid(x);
	//f32 const activation = ReLU(x);
	return activation;
}

// Derivative of activation_func at x, for the sigmoid: s(x) * (1 - s(x)).
f32 activation_func_derivative (f32 x) {
	f32 const s = sigmoid(x);
	return s * (1 -s);
	//return x >= 0 ? 1 : 0;
}

//
// Index of the output neuron with the highest activation (the first one on ties), -1 if none compares greater than -INF.
int classify (Vector const& output_layer_activations) {
	int best_indx = -1;
	f32 best_activation = -INF;

	int const count = output_layer_activations.length();
	for (int n=0; n<count; ++n) {
		f32 const candidate = output_layer_activations.at(n);

		if (!(candidate > best_activation))
			continue;

		best_activation = candidate;
		best_indx = n;
	}

	return best_indx;
}

void eval_nn () {

	for (int layer_i=0; layer_i<(int)layers.size(); ++layer_i) {
		auto& l = layers[layer_i];

		auto& prev_layer_activations = layer_i == 0 ? input_buffer : layers[layer_i -1].inp_activations;

		for (int n=0; n<l.biases.length(); ++n) {

			{ // eval neuron
				f32 total = 0;
				for (auto w=0; w<prev_layer_activations.length(); ++w)
					total += l.weights.at(w,n) * prev_layer_activations.at(w);

				total += l.biases.at(n);

				l.inp_z.at(n) = total;
				l.inp_activations.at(n) = activation_func(total);
			}
		}
	}

	input_classified( classify(layers.back().inp_activations) );

}

//
// Backpropagation for the current input (eval_nn must have been run for it): computes the delta of
// every neuron, output layer first, and adds the resulting bias and weight gradients, scaled by
// avg_factor, onto the gradients accumulated so far.
void calc_gradient_via_backprop (f32 avg_factor) {

	// activations that fed layer layer_i during evaluation
	auto feeding_of = [] (int layer_i) -> Vector& {
		return layer_i > 0 ? layers[layer_i -1].inp_activations : input_buffer;
	};
	// adds the gradient contribution of neuron n (with the given delta) of layer l
	auto accumulate = [avg_factor] (Layer& l, Vector& feeding, int n, f32 delta) {
		l.biases_gradient.at(n) += delta * avg_factor;

		for (int w=0; w<l.weights_count; ++w)
			l.weights_gradient.at(w,n) += delta * feeding.at(w) * avg_factor;
	};

	int const output_i = (int)layers.size() -1;

	{ // output layer: delta comes straight from the cost
		Layer& output_layer = layers.back();
		Vector& feeding = feeding_of(output_i);

		for (int n=0; n<output_layer.neurons_count; ++n) {
			f32 const cost_derivative = output_layer.inp_activations.at(n) - expected_output(n, inp_label);

			output_layer.delta.at(n) = cost_derivative * activation_func_derivative( output_layer.inp_z.at(n) );

			accumulate(output_layer, feeding, n, output_layer.delta.at(n));
		}
	}

	// hidden layers, back to front: delta is the deltas of the following layer pulled back through its weights
	for (int i=output_i -1; i>=0; --i) {
		Layer& l = layers[i];
		Layer& following = layers[i +1];
		Vector& feeding = feeding_of(i);

		for (int n=0; n<l.neurons_count; ++n) {

			f32 delta = 0;
			for (int j=0; j<following.neurons_count; ++j)
				delta += following.weights.at(n,j) * following.delta.at(j);

			delta *= activation_func_derivative( l.inp_z.at(n) );

			l.delta.at(n) = delta;

			accumulate(l, feeding, n, delta);
		}
	}
}

// Factor the gradient is multiplied with before it is subtracted from the weights and biases in apply_gradient.
f32 learning_rate = 0.1f;

void calc_gradient () {

	{
		for (auto& l : layers) {
			l.inp_activations.clear(0);
			l.biases_gradient.clear(0);
			l.weights_gradient.clear(0);
		}
	}

	for (int i=0; i<minibatch_size; ++i) {

		get_random_input_from_minibatch();

		eval_nn();

		calc_gradient_via_backprop(1.0f / (f32)minibatch_size);
	}
}

void apply_gradient () {
	for (int i=0; i<(int)layers.size(); ++i) {
		auto& l = layers[i];

		for (int n=0; n<l.neurons_count; ++n) {
			for (int w=0; w<l.weights_count; ++w) {
				l.weights.at(w,n) -= l.weights_gradient.at(w,n) * learning_rate;
			}
		}
		for (int n=0; n<l.neurons_count; ++n) {
			l.biases.at(n) -= l.biases_gradient.at(n) * learning_rate;
		}
	}
}

//
// Tuning of the value -> color mapping in get_weight_col (both editable in the "Control" window).
f32 color_visualize_range = 1; // values are divided by this before being mapped
f32 color_visualize_power_scale = 1; // exponent handed to power_scale

// normalize(x) times length(x) raised to `power`.
f32 power_scale (f32 x, f32 power) {
	auto const direction = normalize(x);
	auto const scaled_length = pow(length(x), power);
	return direction * scaled_length;
}

// Color of the visualization gradient at position t. The color is built in hsl and converted to rgb;
// saturation and lightness grow with the distance of t from 0.5.
rgbf weight_col_gradient (f32 t) {

	f32 highlight = abs(t * 2 -1); // distance from the middle of the gradient, 1 at t=0 and t=1
	auto const eased = pow(highlight, 1.0f/4);

	rgbf hsl;
	hsl.x = lerp(45, 75 -360, t) / 360; // map val from [-inf,+inf] -> [orange, (red,magenta,blue,cyan,green), green-yellow]
	hsl.y = lerp(   0,   1, eased);
	hsl.z = lerp(0.5f,0.8f, eased);

	return imgl::hsl_to_rgb(hsl);
}

// Visualization color of a value.
//  zero_one_range:	val is mapped linearly onto the upper half of the gradient
//  else:			val is divided by color_visualize_range, power scaled and squashed with the sigmoid
rgbf get_weight_col (f32 val, bool zero_one_range=false) {

	f32 t;
	if (zero_one_range)
		t = lerp(0.5f,1,val);
	else
		t = sigmoid( power_scale(val / color_visualize_range, color_visualize_power_scale) );

	return weight_col_gradient(t);
}
// Same as get_weight_col, but returned as an opaque ImVec4 for the ImGui text functions.
ImVec4 get_weight_col_im (f32 val, bool zero_one_range=false) {
	rgbf const col = get_weight_col(val, zero_one_range);
	return ImVec4(col.x, col.y, col.z, 1);
}

//
// Shows size.x * size.y values as an image, one visualization-colored pixel per value (nearest filtered).
void imgui_visualize_as_image (f32 const* values, iv2 size, v2 display_size, bool zero_one_range=false) {
	int const value_count = size.x * size.y;

	auto colors = make_unique<rgbf[]>( value_count );

	for (int v=0; v<value_count; ++v)
		colors[v] = imgl::to_linear(get_weight_col(values[v], zero_one_range));

	imgui_stream_texture(colors.get(), size, display_size, true);
}

// Lays out element_count elements (reached through the random access iterator `it`) in a grid with as
// many columns as fit into the available width (at least one). Rows are run through ImGuiListClipper and
// draw_element is only called for the rows it reports.
template <typename F, typename IT>
static void display_as_grid (f32 element_w, F draw_element, IT it, int element_count) {
	int const per_row = max((int)floor(ImGui::GetContentRegionAvailWidth() / element_w), 1);
	int const row_count = (element_count +(per_row -1)) / per_row; // also count partial filled row

	ImGuiListClipper clipper (row_count);
	while (clipper.Step()) {
		for (int row=clipper.DisplayStart; row<clipper.DisplayEnd; row++) {
			int const row_first = row * per_row;

			// stops early on the last (partial filled) row
			for (int col=0; col<per_row && (row_first +col) != element_count; ++col) {
				if (col > 0) ImGui::SameLine();

				draw_element(*(it +(row_first +col)));
			}
		}
	}
}

// A per-neuron vector together with the name it is displayed under in imgui_show_values.
// Holds a reference, so the Vector must outlive the Vector_Value.
struct Vector_Value {
	Vector const&	v;
	cstr			name;
};
// Displays a per-neuron matrix (x: weights, y: neurons) together with any number of per-neuron vectors.
//  m_name:						name displayed for the matrix
//  vs:							vectors to display, each must have one value per neuron
//  show_in_input_image_form:	every matrix row is shown as an image of the dataset's image size, followed
//								by a tree with the raw numbers arranged like the image
//  zero_one_range:				forwarded to the value -> color mapping
void imgui_show_values (Matrix const& m, cstr m_name, std::vector<Vector_Value> const& vs, bool show_in_input_image_form=false, bool zero_one_range=false) {
	int neurons_count = m.size().y;
	int weights_count = m.size().x;
	for (auto& v : vs) assert(neurons_count == v.v.length());

	if (show_in_input_image_form) {

		v2 img_size = 100;

		{ // column with the row names
			ImGui::BeginGroup();

			ImGui::Text("");

			for (auto& v : vs)
				ImGui::Text("%s: ", v.name);

			ImGui::Text("%s:", m_name);

			ImGui::EndGroup();
		}
		// one column per neuron: index, vector values, matrix row as image
		for (int n=0; n<neurons_count; ++n) {
			ImGui::SameLine();

			ImGui::BeginGroup();

			ImGui::Text("%d", n);

			for (auto& v : vs)
				ImGui::TextColored(get_weight_col_im(v.v.at(n), zero_one_range), "% .4f", v.v.at(n));

			imgui_visualize_as_image(&m.at(0,n), training_data_images.header->img_size, img_size, zero_one_range);

			ImGui::EndGroup();
		}

		// raw numbers of every neuron
		for (int n=0; n<neurons_count; ++n) {
			if (!ImGui::TreeNodeEx(imgl::prints("Neuron %d", n).c_str(), ImGuiTreeNodeFlags_DefaultOpen))
				continue;

			for (auto& v : vs) {
				ImGui::Text("%s: ", v.name);
				ImGui::SameLine();
				ImGui::TextColored(get_weight_col_im(v.v.at(n), zero_one_range), "% .4f", v.v.at(n));
			}

			ImGui::Text("%s:", m_name);

			for (int y=0; y<training_data_images.header->img_size.y; ++y) {
				for (int x=0; x<training_data_images.header->img_size.x; ++x) {
					if (x > 0) ImGui::SameLine();

					int pos = y * training_data_images.header->img_size.x + x;

					ImGui::TextColored(get_weight_col_im(m.at(pos,n), zero_one_range), "% .4f", m.at(pos,n));
				}
			}

			ImGui::TreePop();
		}

	} else {

		// one column per vector
		for (int i=0; i<(int)vs.size(); ++i) {
			auto& v = vs[i];

			if (i > 0) ImGui::SameLine();

			ImGui::BeginGroup();
			ImGui::Text("%s", v.name); // the name is data, never use it as the format string

			for (int n=0; n<neurons_count; ++n) {
				ImGui::TextColored(get_weight_col_im(v.v.at(n), zero_one_range), "% .4f  ", v.v.at(n));
			}
			ImGui::EndGroup();
		}

		auto sz = ImGui::GetItemRectSize(); // size of the last vector column, the matrix view is made as high

		// the matrix in a horizontally scrollable child, one text row per neuron
		ImGui::SameLine();
		ImGui::BeginGroup();
		ImGui::BeginChild(m_name, ImVec2(0, sz.y +20), false, ImGuiWindowFlags_HorizontalScrollbar);

		ImGui::Text("%s", m_name);

		for (int n=0; n<neurons_count; ++n) {
			for (int w=0; w<weights_count; ++w) {
				if (w > 0) ImGui::SameLine();

				ImGui::TextColored(get_weight_col_im(m.at(w,n), zero_one_range), "% .4f", m.at(w,n));
			}
		}
		ImGui::EndChild();
		ImGui::EndGroup();

	}
}

// Prints every value of v on its own line, colored by the value.
void imgui_show_values (Vector const& v, bool zero_one_range=false) {
	for (int n=0, count=v.length(); n<count; ++n) {
		f32 const val = v.at(n);
		ImGui::TextColored(get_weight_col_im(val, zero_one_range), "% .4f", val);
	}
}

void imgui_show_weights (bool* wnd_open) {
	ImGui::Begin("weights", wnd_open);

	for (int i=0; i<(int)layers.size(); ++i) {
		auto& l = layers[i];

		if (!ImGui::TreeNodeEx(imgl::prints("Layer %d", i +1).c_str(), ImGuiTreeNodeFlags_DefaultOpen))
			continue;

		imgui_show_values(l.weights, "weights", {{l.biases, "bias"}}, i == 0);

		ImGui::TreePop();
	}

	ImGui::End();
}
void imgui_show_weights_gradient (bool* wnd_open) {
	ImGui::Begin("weights_gradient", wnd_open);

	for (int i=0; i<(int)layers.size(); ++i) {
		auto& l = layers[i];

		if (!ImGui::TreeNodeEx(imgl::prints("Layer %d", i +1).c_str(), ImGuiTreeNodeFlags_DefaultOpen))
			continue;

		imgui_show_values(l.weights_gradient, "weights", {{l.biases_gradient, "bias"}}, i == 0);

		ImGui::TreePop();
	}

	ImGui::End();
}
void imgui_show_input_eval (bool* wnd_open) {
	ImGui::Begin("input eval", wnd_open);

	static int input_indx = 0;
	ImGui::InputInt("input_indx", &input_indx);

	get_input(input_indx);

	eval_nn();

	imgui_visualize_as_image(input_buffer.data(), training_data_images.header->img_size, 100, true);

	auto* prev_activation = &input_buffer;

	for (int i=0; i<(int)layers.size(); ++i) {
		auto& l = layers[i];

		auto weighted = Matrix::alloc(iv2(l.weights_count,l.neurons_count));
		for (int n=0; n<l.neurons_count; ++n) {
			for (int w=0; w<l.weights_count; ++w) {
				weighted.at(w,n) = l.weights.at(w,n) * prev_activation->at(w);
			}
		}
		prev_activation = &l.inp_activations;

		if (!ImGui::TreeNodeEx(imgl::prints("Layer %d", i +1).c_str(), ImGuiTreeNodeFlags_DefaultOpen))
			continue;

		imgui_show_values(weighted, "weighted", {{l.inp_activations, "activ"}}, i == 0, true);

		ImGui::TreePop();
	}

	ImGui::End();
}

// Draws all debug windows of the network for this frame: the visibility checkboxes (appended to the
// "Control" window), the optional weights / gradient / input evaluation windows, the "Learning" window
// with the error statistics, and the optional lists of correctly and wrongly classified inputs.
void imgui_nn_debug_view () {

	//ImGui::ShowDemoWindow();

	// which windows are shown, toggled by the checkboxes below (and by the windows' close buttons)
	static bool weights_open = true;
	static bool gradient_open = false;
	static bool eval_open = false;
	static bool correct_inputs_open = false;
	static bool incorrect_inputs_open = false;

	ImGui::Begin("Control");
	ImGui::Checkbox("Show Weights", &weights_open);
	ImGui::Checkbox("Show Weights Gradient", &gradient_open);
	ImGui::Checkbox("Show Input Evaluation", &eval_open);

	ImGui::Checkbox("Show Correct Inputs", &correct_inputs_open);
	ImGui::SameLine();
	ImGui::Checkbox("Show Wrong Inputs", &incorrect_inputs_open);
	ImGui::End();

	if (weights_open)		imgui_show_weights(&weights_open);
	if (gradient_open)		imgui_show_weights_gradient(&gradient_open);
	if (eval_open)			imgui_show_input_eval(&eval_open);

	{
		ImGui::Begin("Learning");

		// error = inputs that were not classified correctly, as count and as percentage
		ImGui::Value("epoch", epoch);
		ImGui::Text("last epoch error: %7d / %7d -> %7.3f %%", (int)random_input_buffer.size() -correct_evals_in_last_epoch, (int)random_input_buffer.size(),	(f32)((int)random_input_buffer.size() -correct_evals_in_last_epoch) / (f32)(int)random_input_buffer.size() * 100);
		ImGui::Text("this_epoch error: %7d / %7d -> %7.3f %%", epoch_inputs_processed -correct_evals_in_epoch, epoch_inputs_processed,							(f32)(epoch_inputs_processed -correct_evals_in_epoch) / (f32)epoch_inputs_processed * 100);

		{
			assert((int)correct_evals_in_epochs.size() == epoch);

			// value getter for the histogram: error percentage of epoch number indx, 0 for indices without data
			auto get = [] (void* data, int indx) -> f32 {
				if (!(indx >= 0 && indx < (epoch +1)))
					return 0;

				f32 error_rate;
				if (indx == epoch)
					error_rate = (f32)(epoch_inputs_processed -correct_evals_in_epoch) / (f32)epoch_inputs_processed; // show error rate in current (unfinished) epoch
				else
					error_rate = (f32)((int)random_input_buffer.size() -correct_evals_in_epochs[indx]) / (f32)(int)random_input_buffer.size();

				return error_rate * 100;
			};

			static int diagram_height = 100;
			static f32 y_height = 100;

			// always plots at least 5 bars
			ImGui::PlotHistogram("##epochs_error_percent", get, nullptr, max(epoch +1, 5), 0, "epochs_error_percent", 0,y_height, ImVec2(ImGui::GetContentRegionAvailWidth(), (f32)diagram_height));

			// context menu of the histogram to adjust its size and scale
			if (ImGui::BeginPopupContextItem("epochs_error_percent options")) {

				ImGui::DragInt("diagram_height", &diagram_height, 0.1f);
				ImGui::DragFloat("y_height", &y_height, 0.1f);

				ImGui::EndPopup();
			}
		}

		ImGui::End();
	}

	if (correct_inputs_open || incorrect_inputs_open) {
		v2 img_size = 100;

		// draws one input: its index, its label and its image
		auto show_input = [&] (int indx) {

			ImGui::BeginGroup();

			ImGui::Text("%d (%d)", indx, (int)training_data_label.labels[indx]);

			{
				int input_length = training_data_images.header->img_size.x * training_data_images.header->img_size.y;

				u8* pixels = &training_data_images.pixels[ indx * input_length ];

				imgui_stream_texture(pixels, training_data_images.header->img_size, img_size);
			}

			ImGui::EndGroup();
		};

		// input indices ordered: wrong inputs first, then correct inputs, each group ordered by label, then by index
		static std::vector<int> inputs_correct_sorted;

		// (re)build the index list whenever the number of inputs changed
		if (inputs_correct_sorted.size() != inputs_correct.size()) {
			inputs_correct_sorted.resize(inputs_correct.size());
			for (int i=0; i<inputs_correct.size(); ++i) {
				inputs_correct_sorted[i] = i;
			}
		}

		// only re-sort when input_classified (or a dataset load) flagged a change
		if (correct_inputs_changed) {
			std::sort(inputs_correct_sorted.begin(), inputs_correct_sorted.end(), [] (int l, int r) {
					if (		inputs_correct[l] !=        inputs_correct[r] )
						return (inputs_correct[l] ? 1:0) < (inputs_correct[r] ? 1:0);

					if (       training_data_label.labels[l] != training_data_label.labels[r] )
						return training_data_label.labels[l] <  training_data_label.labels[r];

					assert(l != r);
					return l < r;
				});
			correct_inputs_changed = false;
		}

		// the correct inputs are the last correct_inputs entries of the sorted list
		if (correct_inputs_open) {
			if (ImGui::Begin(imgl::prints("Correct Inputs (%6d)###Correct Inputs", correct_inputs).c_str(), &correct_inputs_open)) {
				display_as_grid(img_size.x +6, show_input, inputs_correct_sorted.end() -correct_inputs, correct_inputs);
			}
			ImGui::End();
		}

		// the wrong inputs are the entries before them
		if (incorrect_inputs_open) {
			if (ImGui::Begin(imgl::prints("Wrong Inputs (%6d)###Wrong Inputs", (int)inputs_correct.size() -correct_inputs).c_str(), &incorrect_inputs_open)) {
				display_as_grid(img_size.x +6, show_input, inputs_correct_sorted.begin(), (int)inputs_correct.size() -correct_inputs);
			}
			ImGui::End();
		}
	}
}

// File path used by save_nn and load_nn (editable in the "Control" window).
std::string saveload_filepath = "saves/trained/";

/*
	struct Filedata {
		char	magic_number[8] = "raznn"
		int		input_size;
		int		layers_count;

		int		completed_epochs;
		f32		last_error_rate;

		int		neuron_counts[layers_count];

		int		descr_len;
		char	descr[descr_len];

		Layer	layers[layers_count];
	}
	struct Layer {
		f32		bias[															neuron_counts[layer_i] ];
		f32		weights[ layer_i == 0 ? input_size : neuron_counts[layer_i -1], neuron_counts[layer_i] ];
	}
*/

// Fixed-size start of a saved network file; save_nn writes it and load_nn reads it as raw bytes.
struct File_Header {
	char	magic_number[8] = "raznn"; // the unused bytes are zero
	int		input_size; // number of input values (pixels per image)
	int		layers_count;

	int		completed_epochs;
	f32		last_error_rate;
};
void* _push_data (std::vector<byte>* buf, int size) {
	auto old_sz = buf->size();
	buf->resize(old_sz +size);
	return &(*buf)[old_sz];
}
// Appends size bytes, copied from data, to buf.
void push_data (std::vector<byte>* buf, void* data, int size) {
	void* dst = _push_data(buf, size);
	memcpy(dst, data, size);
}
void push_string (std::vector<byte>* buf, std::string const& str) {
	auto* size = (int*)_push_data(buf, (int)(sizeof(int) +str.size() * sizeof(char)));
	auto* chars = (char*)(size +1);

	*size = (int)str.size();
	memcpy(chars, str.c_str(), *size);
}

// Copies the next size bytes of [*cur, end) to data and advances *cur past them.
// Returns false (without copying or advancing) if size is negative or fewer than size bytes remain.
bool read_data (byte** cur, byte* end, void* data, int size) {
	// sizes can originate from file contents; a negative one would pass the check below
	// and then be converted to a huge unsigned byte count by memcpy
	if (size < 0)
		return false;

	if ((end -*cur) < size) // compared as pointer difference, no truncation to int
		return false;

	memcpy(data, *cur, size);

	*cur += size;
	return true;
}
// Reads a string as written by push_string (int length followed by that many characters) into *str.
// Returns false if the length is negative or the data ends before the string does.
bool read_string (byte** cur, byte* end, std::string* str) {
	int length;
	if (!read_data(cur,end, &length, sizeof(length))) return false;

	// the length comes from the file: validate it before allocating or copying anything
	if (length < 0 || (end -*cur) < length) return false;

	// copy into the string's characters; the code used to pass &str[0], which, str being a pointer,
	// is the address of the std::string object itself and so overwrote its internals
	str->assign((char const*)*cur, (size_t)length);
	*cur += length;

	return true;
}

// Neuron count of every real layer (the last entry is the output layer), editable in the "Control"
// window and used when the network is (re)built.
std::vector<int> layers_neuron_count = {
	16,
	16,
	10
};

bool save_nn () {

	std::vector<byte> data;

	File_Header header;

	header.input_size = training_data_images.header->img_size.x * training_data_images.header->img_size.y;
	header.layers_count = (int)layers.size();
	header.completed_epochs = epoch;
	header.last_error_rate = (f32)((int)random_input_buffer.size() -correct_evals_in_last_epoch) / (f32)(int)random_input_buffer.size();

	push_data(&data, &header, sizeof(File_Header));

	for (auto& l : layers) {
		push_data(&data, &l.neurons_count, sizeof(l.neurons_count));
	}

	push_string(&data, "");

	for (auto& l : layers) {
		push_data(&data, l.biases.data(), l.biases.size_bytes());
		push_data(&data, l.weights.data(), l.weights.size_bytes());
	}

	return imgl::write_fixed_size_binary_file(saveload_filepath.c_str(), data.data(), data.size());
}
bool load_nn () {

	imgl::Blob data;
	if (!imgl::load_binary_file(saveload_filepath.c_str(), &data))
		return false;
	byte* cur = (byte*)data.data;
	byte* end = cur +data.size;

	File_Header header;

	if (!read_data(&cur,end, &header, sizeof(File_Header))) return false;

	if (memcmp(header.magic_number, "raznn", 8) != 0) return false;

	if (header.input_size != (training_data_images.header->img_size.x * training_data_images.header->img_size.y)) return false;

	if (header.layers_count < 1) return false;
	std::vector<int>	layers_neuron_count (header.layers_count);

	reset_epoch(header.completed_epochs);

	for (auto& l : layers_neuron_count)
		read_data(&cur,end, &l, sizeof(l));

	std::string descr;
	if (!read_string(&cur,end, &descr)) return false;

	build_nn(training_data_images.header->img_size, layers_neuron_count);

	calc_gradient();

	for (int i=0; i<(int)layers_neuron_count.size(); ++i) {
		auto& l = layers[i];
		if (!read_data(&cur,end, l.biases.data(), l.biases.size_bytes())) return false;
		if (!read_data(&cur,end, l.weights.data(), l.weights.size_bytes())) return false;
	}

	return true;
}

#include "time.h"

int main () {

	imgl::begin_user_texture_callback = &begin_user_texture;

	for (;;) {
		if (!imgl::begin_window("Neural Network", 0, imgl::POLL_FOR_INPUT))
			break;
		imgl::begin_imgui();

		ImGui::Begin("Control");

		{
			auto pixels = make_unique<rgbf[]>(100);

			for (int i=0; i<100; ++i) {
				pixels[i] = imgl::to_linear(weight_col_gradient((f32)i / (100 -1)));
			}

			imgui_stream_texture(pixels.get(), iv2(100,1), iv2(0,10), true);
		}

		ImGui::DragFloat("color_visualize_range", &color_visualize_range, 0.01f);
		ImGui::DragFloat("color_visualize_power_scale", &color_visualize_power_scale, 0.01f);

		static f32 stop_cost = 0.01f;
		ImGui::DragFloat("stop_cost", &stop_cost, 0.01f, 0);

		static bool seed_with_time = 1;
		static uint seed = 0;
		ImGui::Checkbox("seed_with_time", &seed_with_time);
		ImGui::InputInt("seed", (int*)&seed);

		static init_mode random_init_mode = RIM_XAVIER;
		ImGui::Combo("random_init_mode", (int*)&random_init_mode, &init_mode_str[0], ARRLEN(init_mode_str));

		{
			static std::string labels = "training_data/train-labels.idx1-ubyte";
			static std::string images = "training_data/train-images.idx3-ubyte";

			ImGui::InputText_str("dataset labels", &labels);
			ImGui::InputText_str("dataset images", &images);

			static bool ok = true;
			static std::string error_str;

			static bool load_dataset = true;

			if (ImGui::Button("Reload Dataset") || load_dataset) {
				ok = load_training_data(labels.c_str(), images.c_str());
				if (!ok) error_str = imgl::prints("dataset load failed (\"%s\", \"%s\") failed!", labels.c_str(), images.c_str());
			}
			load_dataset = false;

			if (ok) error_str = "OK";

			ImGui::SameLine();
			ImGui::TextColored(ok ? ImVec4(0.2f,1,0.2f,1) : ImVec4(1,0.2f,0.2f,1), error_str.c_str());
		}

		iv2 input_size = training_data_images.header->img_size;
		ImGui::Value("input_size", input_size);

		{
			bool open = ImGui::TreeNode("##hidden_layers_neuron_count");

			ImGui::SameLine();

			int sz = (int)layers_neuron_count.size();
			ImGui::DragInt("layers_neuron_count[]", &sz, 0.01f);
			layers_neuron_count.resize(sz);

			if (open) {
				int i=0;
				for (auto& c : layers_neuron_count) {
					ImGui::DragInt(imgl::prints("layers_neuron_count[]##%d", i++).c_str(), &c, 0.01f);
				}
				ImGui::TreePop();
			}
		}

		static bool reset_nn = true;
		reset_nn = ImGui::Button("reset_nn") || reset_nn;
		if (reset_nn) {

			build_nn(input_size, layers_neuron_count);

			random_engine.seed( seed_with_time ? (int)(imgl::get_time() * 10000) : seed );

			init_nn_with_random_weights(random_init_mode);

			reset_epoch();

			calc_gradient();

			reset_nn = false;
		}

		{
			ImGui::InputText_str("saveload_filepath", &saveload_filepath);

			static bool ok = true;
			static std::string error_str;

			if (ImGui::Button("save")) {
				ok = save_nn();
				if (!ok) error_str = imgl::prints("save to file \"%s\" failed!", saveload_filepath.c_str());
			}

			ImGui::SameLine();
			if (ImGui::Button("load")) {
				ok = load_nn();
				if (!ok)
					error_str = imgl::prints("load from file \"%s\" failed!", saveload_filepath.c_str());
			}

			if (ok) error_str = "OK";

			ImGui::SameLine();
			ImGui::TextColored(ok ? ImVec4(0.2f,1,0.2f,1) : ImVec4(1,0.2f,0.2f,1), error_str.c_str());
		}

		ImGui::DragFloat("learning_rate", &learning_rate, 0.05f, 0);
		ImGui::DragInt("batch_size", &minibatch_size, 1, 1);

		auto step = [&] () {

			apply_gradient();
			calc_gradient();

		};

		{ // update screem at vsync frequency, but do nn train steps at other frequency

			static bool manaul_stepping = true;
			ImGui::Checkbox("manaul_stepping", &manaul_stepping);

			ImGui::SameLine();
			bool manaul_step = ImGui::Button("step");

			if (manaul_stepping) {
				if (manaul_step)
					step();
			} else {
				f64 steps_begin = imgl::get_time();

				for (int i=0;; ++i) {
					step();

					f32 steps_epased_t = (f32)(imgl::get_time() -steps_begin);
					if (steps_epased_t >= 0.014f) // don't stall rendering
						break;
				}
			}
		}

		ImGui::End();

		imgui_nn_debug_view();

		imgl::end_imgui();
		end_stream_textures();
		imgl::end_window();
	}

	return 0;
}