// felipecruz/linear_regression_hr.cpp

// linear_regression_hr.cpp
/* Author: felipe cruz felipecruz@gmail.com
*/

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <random>
#include <string>
#include <vector>

#include <assert.h>
#include <stdio.h>

using namespace std;

// Linearly rescales `val` so that `min` maps to 0 and `max` maps to 1.
// No guard is applied for max == min; the division is carried out as-is.
double normalize(double val, double min, double max) {
    const double offset = val - min;
    const double range = max - min;
    return offset / range;
}

// Source of uniformly distributed doubles drawn from the range given at
// construction. The engine is default-constructed and never seeded
// explicitly, so every instance produces the same sequence of samples.
class Uniform {
public:
    Uniform(double min, double max) : distribution(min, max) {}

    // Draws the next value from the distribution.
    double sample() { return distribution(generator); }

private:
    std::default_random_engine generator;
    std::uniform_real_distribution<double> distribution;
};


// Writes the first `size` elements of `w` to stdout as "[a, b, c]" followed
// by a newline. An empty (or non-positive) size prints "[]".
void print(double *w, int size) {
    std::cout << "[";

    for (int i = 0; i < size; i++) {
        // Separator goes before every element except the first, so each
        // element is printed exactly once.
        if (i > 0) {
            std::cout << ", ";
        }
        std::cout << w[i];
    }

    std::cout << "]" << std::endl;
}


// Linear model fitted by full-batch gradient descent on the squared error.
//
// The weight vector is either allocated (and randomly initialised) by the
// model itself, or borrowed from the caller. Borrowed weights are updated in
// place and are never freed by this class.
class LinearRegression {
    double *weights;
    int size;            // number of weights == number of inputs per row
    bool owns_weights;   // true when `weights` was allocated by this object
    bool _debug;         // when set, train()/learn() print per-epoch statistics

    void swap(LinearRegression &other) noexcept {
        std::swap(weights, other.weights);
        std::swap(size, other.size);
        std::swap(owns_weights, other.owns_weights);
        std::swap(_debug, other._debug);
    }

public:
    // Creates a model owning `size` weights, each drawn from Uniform(0, 1).
    LinearRegression(int size, bool debug = false)
        : weights(new double[size]), size(size), owns_weights(true), _debug(debug) {
        Uniform uniform(0., 1.);
        for (int i = 0; i < size; i++) {
            weights[i] = uniform.sample();
        }
    }

    // Creates a model that works directly on the caller's `_weights` array
    // (`size` elements). The array must outlive the model.
    LinearRegression(double *_weights, int size, bool debug = false)
        : weights(_weights), size(size), owns_weights(false), _debug(debug) {}

    // Copying an owning model duplicates its weights; copying a borrowing
    // model shares the caller's array, exactly like the source object.
    LinearRegression(const LinearRegression &other)
        : weights(other.weights), size(other.size),
          owns_weights(other.owns_weights), _debug(other._debug) {
        if (owns_weights) {
            weights = new double[size];
            std::copy(other.weights, other.weights + size, weights);
        }
    }

    // Moving transfers the weights and leaves `other` empty and non-owning.
    LinearRegression(LinearRegression &&other) noexcept
        : weights(other.weights), size(other.size),
          owns_weights(other.owns_weights), _debug(other._debug) {
        other.weights = nullptr;
        other.size = 0;
        other.owns_weights = false;
    }

    // Taken by value so one operator serves both copy and move assignment.
    LinearRegression &operator=(LinearRegression other) noexcept {
        swap(other);
        return *this;
    }

    ~LinearRegression() {
        if (owns_weights) {
            delete[] weights;   // allocated with new[]
        }
    }

    // Inner product of the first `size` elements of `in` and `w`.
    double dot(const double *in, const double *w) const {
        double total = 0.0;
        for (int i = 0; i < size; i++) {
            total += in[i] * w[i];
        }
        return total;
    }

    // Takes one gradient-descent step: weights -= learning_rate * gradient.
    void update(const double *gradient) {
        const double learning_rate = 0.011;
        for (int i = 0; i < size; i++) {
            weights[i] += -(learning_rate * gradient[i]);
        }
    }

    // Runs `epochs` full passes of learn() over the first `length` rows of
    // `dataset` (each row holding `size` inputs) with targets `output`.
    void train(int epochs, double **dataset, int length, double *output) {
        for (int i = 0; i < epochs; i++) {
            if (_debug) {
                std::cout << "Epoch: " << i << " ";
            }
            learn(dataset, length, output);
        }
    }

    // Prints one integer per row of `dataset`: the model output truncated
    // toward zero and limited to the range 1..8.
    void predict(double **dataset, int length) {
        for (int i = 0; i < length; i++) {
            const double raw = dot(dataset[i], weights);

            // Limit the double before converting: casting an out-of-range or
            // NaN double to int is undefined. Anything that is not >= 1
            // (including NaN) prints 1, anything >= 8 prints 8.
            int grade;
            if (!(raw >= 1.0)) {
                grade = 1;
            } else if (raw >= 8.0) {
                grade = 8;
            } else {
                grade = static_cast<int>(raw);
            }
            std::cout << grade << '\n';
        }
        std::cout.flush();
    }

    // Prints the current weights via print().
    void debug() {
        print(weights, size);
    }

    // One epoch: accumulates the mean squared-error gradient over the first
    // `length` rows using the current weights, then applies a single update.
    // In debug mode also prints the RMSE measured before the update, the
    // number of rows counted as wrong, and 100 * (right - wrong) / length.
    void learn(double **dataset, int length, double *output) {
        std::vector<double> gradient(size, 0.0);
        const double scale = -2. / length;

        double error = 0.0;
        double error_count = 0;
        double correct_count = 0;

        for (int i = 0; i < length; i++) {
            const double prediction = dot(dataset[i], weights);
            const double single_error = output[i] - prediction;

            // A row counts as right when floor(error) is -1, 0 or 1,
            // i.e. the error lies in [-1, 2).
            const double floored = std::fabs(std::floor(single_error));
            if (floored == 0 || floored == 1) {
                correct_count++;
            } else {
                error_count++;
            }

            error += single_error * single_error;
            for (int j = 0; j < size; j++) {
                gradient[j] += scale * (dataset[i][j] * single_error);
            }
        }
        update(gradient.data());

        error *= 1. / length;
        error = std::sqrt(error);

        if (_debug) {
            std::cout << "Error: " << error << " total: " << error_count
                      << " -  " << 100. * ((correct_count - error_count) / length) << std::endl;
        }
    }
};

// Input subjects. from_json_to_input() writes the grade for schema[i] into
// slot i of the feature row it builds.
std::vector<std::string> schema{
    "Accountancy",
    "Biology",
    "BusinessStudies",
    "Chemistry",
    "ComputerScience",
    "Economics",
    "English",
    "Hindi",
    "Physics",
    "PhysicalEducation",
};

// Subject name for the value being predicted (presumably; from_file() reads
// the same key as the training target).
const std::string label = "Mathematics";

// Builds a feature row from a subject -> grade map.
//
// The returned array has schema.size() + 1 elements (11 with the current
// schema): slot i holds the grade for schema[i], or 0.0 when that subject is
// absent, and the trailing slot is always 0.0.
// NOTE(review): if the trailing slot is meant as a bias input, a constant 0
// keeps the matching weight from ever contributing -- confirm the intent.
//
// The array is allocated with new[]; the caller releases it with delete[].
double* from_json_to_input(std::map<std::string, double>& grades) {
    const std::size_t subjects = schema.size();

    // Sized from the schema so a longer schema cannot write past the end;
    // the trailing () zero-fills every slot, including the last one.
    double *wgrades = new double[subjects + 1]();

    for (std::size_t i = 0; i < subjects; ++i) {
        // One lookup instead of count() followed by operator[].
        const std::map<std::string, double>::const_iterator entry = grades.find(schema[i]);
        if (entry != grades.end()) {
            wgrades[i] = entry->second;
        }
    }

    return wgrades;
}

// Minimal parser for one flat JSON object of the form {"key": number, ...}.
//
// Returns a map from each quoted key to its numeric value. A value that is
// not a number (for example a quoted string) is stored as 0.0. Nested
// objects, arrays and escaped quotes are not supported.
//
// Asserts that '{' and '}' occur equally often. Input that ends in the
// middle of a key/value pair simply stops the scan and returns what was
// parsed so far.
std::map<std::string, double> parse(std::string json) {
    std::map<std::string, double> json_obj;

    const std::size_t total_left_brackets = std::count(json.begin(), json.end(), '{');
    const std::size_t total_right_brackets = std::count(json.begin(), json.end(), '}');
    assert(total_left_brackets == total_right_brackets);

    std::size_t key_open = json.find('"');
    while (key_open != std::string::npos) {
        const std::size_t key_close = json.find('"', key_open + 1);
        if (key_close == std::string::npos) {
            break;  // unterminated key: stop instead of rescanning forever
        }
        const std::string key = json.substr(key_open + 1, key_close - key_open - 1);

        const std::size_t colon = json.find(':', key_close + 1);
        if (colon == std::string::npos) {
            break;  // key without a value
        }

        const std::size_t value_pos = json.find_first_not_of(" \t\r\n", colon + 1);
        if (value_pos == std::string::npos) {
            json_obj[key] = 0.0;
            break;
        }

        std::size_t resume = value_pos;
        if (json[value_pos] == '"') {
            // Quoted value: record 0.0 and step over it so that its text is
            // not mistaken for the next key.
            json_obj[key] = 0.0;
            const std::size_t value_close = json.find('"', value_pos + 1);
            if (value_close == std::string::npos) {
                break;
            }
            resume = value_close + 1;
        } else {
            // strtod reads the number in place and stops at the first
            // character that is not part of it (',', '}', whitespace), so no
            // digit of the last value is cut off.
            json_obj[key] = std::strtod(json.c_str() + value_pos, nullptr);
        }

        key_open = json.find('"', resume);
    }

    return json_obj;
}

// Loads training records from `filename`, one JSON object per line.
//
// The first line of the file is skipped. For the k-th following line
// (k starting at 0), dataset[k] receives a new[]-allocated feature row from
// from_json_to_input() and output[k] receives that record's "Mathematics"
// value (0.0 when the key is absent).
//
// The caller must supply `dataset` and `output` arrays large enough for
// every data line in the file; this function cannot check their capacity.
// If the file cannot be opened, a message is written to stderr and the
// arrays are left untouched.
void from_file(const char* filename, double **dataset, double *output) {
    if (filename == nullptr) {
        std::cerr << "from_file: no filename given" << std::endl;
        return;
    }

    std::ifstream in(filename);
    if (!in) {
        std::cerr << "from_file: cannot open " << filename << std::endl;
        return;
    }

    std::string line;
    if (!std::getline(in, line)) {
        return;  // empty file: not even the leading line to skip
    }

    int row = 0;
    while (std::getline(in, line)) {
        std::map<std::string, double> json = parse(line);
        dataset[row] = from_json_to_input(json);
        output[row] = json["Mathematics"];
        ++row;
    }
}

// Reads the number of records `t` from stdin, fits the regressor on
// training.json, then reads `t` JSON records (one per line) from stdin and
// prints one predicted grade per record.
int main() {
    int t = 0;
    if (!(std::cin >> t) || t < 0) {
        std::cerr << "expected a non-negative record count on stdin" << std::endl;
        return 1;
    }

    // Capacity handed to from_file(), which cannot check it itself.
    const int max_training_rows = 79465;
    std::vector<double*> train(max_training_rows, nullptr);
    std::vector<double> output(max_training_rows, 0.0);
    from_file("training.json", train.data(), output.data());

    // from_file() fills rows from index 0 upwards, so the rows actually
    // loaded are the leading non-null entries.
    int training_rows = 0;
    while (training_rows < max_training_rows && train[training_rows] != nullptr) {
        ++training_rows;
    }

    LinearRegression regressor(11, false);
    // Train on the rows that were loaded, not on `t` (the number of records
    // to predict), which could exceed the loaded rows.
    regressor.train(280, train.data(), training_rows, output.data());

    std::vector<double*> dataset(static_cast<std::size_t>(t), nullptr);
    for (int i = 0; i < t; i++) {
        std::string data;
        // Whole line, so a record containing spaces stays in one piece.
        std::getline(std::cin >> std::ws, data);
        std::map<std::string, double> json = parse(data);
        dataset[i] = from_json_to_input(json);
    }

    regressor.predict(dataset.data(), t);

    // Rows come from from_json_to_input(), i.e. new[].
    for (int i = 0; i < t; ++i) {
        delete [] dataset[i];
    }
    for (int i = 0; i < training_rows; ++i) {
        delete [] train[i];
    }

    return 0;
}
	/* Author: felipe cruz felipecruz@gmail.com
	*/

	#include <iostream>
	#include <cmath>
	#include <cstdio>
	#include <vector>
	#include <algorithm>
	#include <map>
	#include <random>

	#include <assert.h>
	#include <stdio.h>

	using namespace std;

	// Linearly rescales `val` so that `min` maps to 0 and `max` maps to 1.
	// No guard is applied for max == min; the division is carried out as-is.
	double normalize(double val, double min, double max) {
	    const double offset = val - min;
	    const double range = max - min;
	    return offset / range;
	}

	// Source of uniformly distributed doubles drawn from the range given at
	// construction. The engine is default-constructed and never seeded
	// explicitly, so every instance produces the same sequence of samples.
	class Uniform {
	public:
	    Uniform(double min, double max) : distribution(min, max) {}

	    // Draws the next value from the distribution.
	    double sample() { return distribution(generator); }

	private:
	    std::default_random_engine generator;
	    std::uniform_real_distribution<double> distribution;
	};


	// Writes the first `size` elements of `w` to stdout as "[a, b, c]" followed
	// by a newline. An empty (or non-positive) size prints "[]".
	void print(double *w, int size) {
	    std::cout << "[";

	    for (int i = 0; i < size; i++) {
	        // Separator goes before every element except the first, so each
	        // element is printed exactly once.
	        if (i > 0) {
	            std::cout << ", ";
	        }
	        std::cout << w[i];
	    }

	    std::cout << "]" << std::endl;
	}


	// Linear model fitted by full-batch gradient descent on the squared error.
	//
	// The weight vector is either allocated (and randomly initialised) by the
	// model itself, or borrowed from the caller. Borrowed weights are updated in
	// place and are never freed by this class.
	class LinearRegression {
	    double *weights;
	    int size;            // number of weights == number of inputs per row
	    bool owns_weights;   // true when `weights` was allocated by this object
	    bool _debug;         // when set, train()/learn() print per-epoch statistics

	    void swap(LinearRegression &other) noexcept {
	        std::swap(weights, other.weights);
	        std::swap(size, other.size);
	        std::swap(owns_weights, other.owns_weights);
	        std::swap(_debug, other._debug);
	    }

	public:
	    // Creates a model owning `size` weights, each drawn from Uniform(0, 1).
	    LinearRegression(int size, bool debug = false)
	        : weights(new double[size]), size(size), owns_weights(true), _debug(debug) {
	        Uniform uniform(0., 1.);
	        for (int i = 0; i < size; i++) {
	            weights[i] = uniform.sample();
	        }
	    }

	    // Creates a model that works directly on the caller's `_weights` array
	    // (`size` elements). The array must outlive the model.
	    LinearRegression(double *_weights, int size, bool debug = false)
	        : weights(_weights), size(size), owns_weights(false), _debug(debug) {}

	    // Copying an owning model duplicates its weights; copying a borrowing
	    // model shares the caller's array, exactly like the source object.
	    LinearRegression(const LinearRegression &other)
	        : weights(other.weights), size(other.size),
	          owns_weights(other.owns_weights), _debug(other._debug) {
	        if (owns_weights) {
	            weights = new double[size];
	            std::copy(other.weights, other.weights + size, weights);
	        }
	    }

	    // Moving transfers the weights and leaves `other` empty and non-owning.
	    LinearRegression(LinearRegression &&other) noexcept
	        : weights(other.weights), size(other.size),
	          owns_weights(other.owns_weights), _debug(other._debug) {
	        other.weights = nullptr;
	        other.size = 0;
	        other.owns_weights = false;
	    }

	    // Taken by value so one operator serves both copy and move assignment.
	    LinearRegression &operator=(LinearRegression other) noexcept {
	        swap(other);
	        return *this;
	    }

	    ~LinearRegression() {
	        if (owns_weights) {
	            delete[] weights;   // allocated with new[]
	        }
	    }

	    // Inner product of the first `size` elements of `in` and `w`.
	    // (Both parameters are arrays; they are indexed below.)
	    double dot(const double *in, const double *w) const {
	        double total = 0.0;
	        for (int i = 0; i < size; i++) {
	            total += in[i] * w[i];
	        }
	        return total;
	    }

	    // Takes one gradient-descent step: weights -= learning_rate * gradient.
	    void update(const double *gradient) {
	        const double learning_rate = 0.011;
	        for (int i = 0; i < size; i++) {
	            weights[i] += -(learning_rate * gradient[i]);
	        }
	    }

	    // Runs `epochs` full passes of learn() over the first `length` rows of
	    // `dataset` (each row holding `size` inputs) with targets `output`.
	    void train(int epochs, double **dataset, int length, double *output) {
	        for (int i = 0; i < epochs; i++) {
	            if (_debug) {
	                std::cout << "Epoch: " << i << " ";
	            }
	            learn(dataset, length, output);
	        }
	    }

	    // Prints one integer per row of `dataset`: the model output truncated
	    // toward zero and limited to the range 1..8.
	    void predict(double **dataset, int length) {
	        for (int i = 0; i < length; i++) {
	            const double raw = dot(dataset[i], weights);

	            // Limit the double before converting: casting an out-of-range or
	            // NaN double to int is undefined. Anything that is not >= 1
	            // (including NaN) prints 1, anything >= 8 prints 8.
	            int grade;
	            if (!(raw >= 1.0)) {
	                grade = 1;
	            } else if (raw >= 8.0) {
	                grade = 8;
	            } else {
	                grade = static_cast<int>(raw);
	            }
	            std::cout << grade << '\n';
	        }
	        std::cout.flush();
	    }

	    // Prints the current weights via print().
	    void debug() {
	        print(weights, size);
	    }

	    // One epoch: accumulates the mean squared-error gradient over the first
	    // `length` rows using the current weights, then applies a single update.
	    // In debug mode also prints the RMSE measured before the update, the
	    // number of rows counted as wrong, and 100 * (right - wrong) / length.
	    void learn(double **dataset, int length, double *output) {
	        std::vector<double> gradient(size, 0.0);
	        const double scale = -2. / length;

	        double error = 0.0;
	        double error_count = 0;
	        double correct_count = 0;

	        for (int i = 0; i < length; i++) {
	            const double prediction = dot(dataset[i], weights);
	            const double single_error = output[i] - prediction;

	            // A row counts as right when floor(error) is -1, 0 or 1,
	            // i.e. the error lies in [-1, 2).
	            const double floored = std::fabs(std::floor(single_error));
	            if (floored == 0 || floored == 1) {
	                correct_count++;
	            } else {
	                error_count++;
	            }

	            error += single_error * single_error;
	            for (int j = 0; j < size; j++) {
	                gradient[j] += scale * (dataset[i][j] * single_error);
	            }
	        }
	        update(gradient.data());

	        error *= 1. / length;
	        error = std::sqrt(error);

	        if (_debug) {
	            std::cout << "Error: " << error << " total: " << error_count
	                      << " - " << 100. * ((correct_count - error_count) / length) << std::endl;
	        }
	    }
	};

	// Input subjects. from_json_to_input() writes the grade for schema[i] into
	// slot i of the feature row it builds.
	std::vector<std::string> schema{
	    "Accountancy",
	    "Biology",
	    "BusinessStudies",
	    "Chemistry",
	    "ComputerScience",
	    "Economics",
	    "English",
	    "Hindi",
	    "Physics",
	    "PhysicalEducation",
	};

	// Subject name for the value being predicted (presumably; from_file() reads
	// the same key as the training target).
	const std::string label = "Mathematics";

	// Builds a feature row from a subject -> grade map.
	//
	// The returned array has schema.size() + 1 elements (11 with the current
	// schema): slot i holds the grade for schema[i], or 0.0 when that subject is
	// absent, and the trailing slot is always 0.0.
	// NOTE(review): if the trailing slot is meant as a bias input, a constant 0
	// keeps the matching weight from ever contributing -- confirm the intent.
	//
	// The array is allocated with new[]; the caller releases it with delete[].
	double* from_json_to_input(std::map<std::string, double>& grades) {
	    const std::size_t subjects = schema.size();

	    // Sized from the schema so a longer schema cannot write past the end;
	    // the trailing () zero-fills every slot, including the last one.
	    double *wgrades = new double[subjects + 1]();

	    for (std::size_t i = 0; i < subjects; ++i) {
	        // One lookup instead of count() followed by operator[].
	        const std::map<std::string, double>::const_iterator entry = grades.find(schema[i]);
	        if (entry != grades.end()) {
	            wgrades[i] = entry->second;
	        }
	    }

	    return wgrades;
	}

	// Minimal parser for one flat JSON object of the form {"key": number, ...}.
	//
	// Returns a map from each quoted key to its numeric value. A value that is
	// not a number (for example a quoted string) is stored as 0.0. Nested
	// objects, arrays and escaped quotes are not supported.
	//
	// Asserts that '{' and '}' occur equally often. Input that ends in the
	// middle of a key/value pair simply stops the scan and returns what was
	// parsed so far.
	std::map<std::string, double> parse(std::string json) {
	    std::map<std::string, double> json_obj;

	    const std::size_t total_left_brackets = std::count(json.begin(), json.end(), '{');
	    const std::size_t total_right_brackets = std::count(json.begin(), json.end(), '}');
	    assert(total_left_brackets == total_right_brackets);

	    std::size_t key_open = json.find('"');
	    while (key_open != std::string::npos) {
	        const std::size_t key_close = json.find('"', key_open + 1);
	        if (key_close == std::string::npos) {
	            break;  // unterminated key: stop instead of rescanning forever
	        }
	        const std::string key = json.substr(key_open + 1, key_close - key_open - 1);

	        const std::size_t colon = json.find(':', key_close + 1);
	        if (colon == std::string::npos) {
	            break;  // key without a value
	        }

	        const std::size_t value_pos = json.find_first_not_of(" \t\r\n", colon + 1);
	        if (value_pos == std::string::npos) {
	            json_obj[key] = 0.0;
	            break;
	        }

	        std::size_t resume = value_pos;
	        if (json[value_pos] == '"') {
	            // Quoted value: record 0.0 and step over it so that its text is
	            // not mistaken for the next key.
	            json_obj[key] = 0.0;
	            const std::size_t value_close = json.find('"', value_pos + 1);
	            if (value_close == std::string::npos) {
	                break;
	            }
	            resume = value_close + 1;
	        } else {
	            // strtod reads the number in place and stops at the first
	            // character that is not part of it (',', '}', whitespace), so no
	            // digit of the last value is cut off.
	            json_obj[key] = std::strtod(json.c_str() + value_pos, nullptr);
	        }

	        key_open = json.find('"', resume);
	    }

	    return json_obj;
	}

	// Loads training records from `filename`, one JSON object per line.
	//
	// The first line of the file is skipped. For the k-th following line
	// (k starting at 0), dataset[k] receives a new[]-allocated feature row from
	// from_json_to_input() and output[k] receives that record's "Mathematics"
	// value (0.0 when the key is absent).
	//
	// `dataset` is an array of row pointers and `output` an array of targets
	// (both are indexed below). The caller must supply arrays large enough for
	// every data line in the file; this function cannot check their capacity.
	// If the file cannot be opened, a message is written to stderr and the
	// arrays are left untouched.
	void from_file(const char* filename, double **dataset, double *output) {
	    if (filename == nullptr) {
	        std::cerr << "from_file: no filename given" << std::endl;
	        return;
	    }

	    std::ifstream in(filename);
	    if (!in) {
	        std::cerr << "from_file: cannot open " << filename << std::endl;
	        return;
	    }

	    std::string line;
	    if (!std::getline(in, line)) {
	        return;  // empty file: not even the leading line to skip
	    }

	    int row = 0;
	    while (std::getline(in, line)) {
	        std::map<std::string, double> json = parse(line);
	        dataset[row] = from_json_to_input(json);
	        output[row] = json["Mathematics"];
	        ++row;
	    }
	}

	// Reads the number of records `t` from stdin, fits the regressor on
	// training.json, then reads `t` JSON records (one per line) from stdin and
	// prints one predicted grade per record.
	int main() {
	    int t = 0;
	    if (!(std::cin >> t) || t < 0) {
	        std::cerr << "expected a non-negative record count on stdin" << std::endl;
	        return 1;
	    }

	    // Capacity handed to from_file(), which cannot check it itself.
	    const int max_training_rows = 79465;
	    std::vector<double*> train(max_training_rows, nullptr);
	    std::vector<double> output(max_training_rows, 0.0);
	    from_file("training.json", train.data(), output.data());

	    // from_file() fills rows from index 0 upwards, so the rows actually
	    // loaded are the leading non-null entries.
	    int training_rows = 0;
	    while (training_rows < max_training_rows && train[training_rows] != nullptr) {
	        ++training_rows;
	    }

	    LinearRegression regressor(11, false);
	    // Train on the rows that were loaded, not on `t` (the number of records
	    // to predict), which could exceed the loaded rows.
	    regressor.train(280, train.data(), training_rows, output.data());

	    std::vector<double*> dataset(static_cast<std::size_t>(t), nullptr);
	    for (int i = 0; i < t; i++) {
	        std::string data;
	        // Whole line, so a record containing spaces stays in one piece.
	        std::getline(std::cin >> std::ws, data);
	        std::map<std::string, double> json = parse(data);
	        dataset[i] = from_json_to_input(json);
	    }

	    regressor.predict(dataset.data(), t);

	    // Rows come from from_json_to_input(), i.e. new[].
	    for (int i = 0; i < t; ++i) {
	        delete [] dataset[i];
	    }
	    for (int i = 0; i < training_rows; ++i) {
	        delete [] train[i];
	    }

	    return 0;
	}