Skip to content

Instantly share code, notes, and snippets.

@felipecruz
Last active September 22, 2015 14:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save felipecruz/abf630670c137530c1d5 to your computer and use it in GitHub Desktop.
Save felipecruz/abf630670c137530c1d5 to your computer and use it in GitHub Desktop.
cpp linear regression - no regularization
/* Author: felipe cruz felipecruz@gmail.com
*/
#include <assert.h>
#include <stdio.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <random>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
/// Rescales `val` linearly so that `min` maps to 0 and `max` maps to 1.
///
/// No guard is applied: when `max == min` the division is by zero and the
/// result follows IEEE floating-point rules (inf or NaN).
double normalize(double val, double min, double max) {
    const double offset = val - min;
    const double span = max - min;
    return offset / span;
}
/// Source of uniformly distributed doubles over the range given at
/// construction.
///
/// The engine is default-constructed (never explicitly seeded), so every
/// instance yields the same sequence of samples.
class Uniform {
public:
    /// Builds a sampler for the range bounded by `min` and `max`.
    Uniform(double min, double max) : distribution(min, max) {}

    /// Draws the next value from the distribution.
    double sample() {
        return distribution(generator);
    }

private:
    std::default_random_engine generator;
    std::uniform_real_distribution<double> distribution;
};
/// Writes the first `size` elements of `w` to stdout as "[a, b, c]" followed
/// by a newline.
///
/// Each element is printed exactly once and an empty array (`size <= 0`)
/// prints "[]" without reading from `w`.
///
/// @param w    array holding at least `size` values (not read when size <= 0)
/// @param size number of elements to print
void print(double *w, int size) {
    std::cout << "[";
    for (int i = 0; i < size; i++) {
        if (i > 0) {
            std::cout << ", ";  // separator goes between elements only
        }
        std::cout << w[i];
    }
    std::cout << "]" << std::endl;
}
/// Linear model `prediction = dot(input, weights)` fitted by full-batch
/// gradient descent on the squared error (no regularization).
///
/// Every input row passed to the model must hold at least `size` doubles.
/// The weight array is either allocated by the model (and released by it) or
/// borrowed from the caller, depending on which constructor is used.
class LinearRegression {
    double *weights;  ///< `size` coefficients
    int size;         ///< number of coefficients / features per row
    bool free;        ///< true when `weights` was allocated here and must be released
    bool _debug;      ///< when true, training prints per-epoch statistics

public:
    /// Allocates `size` weights and initializes each with a sample from
    /// Uniform(0, 1).
    LinearRegression(int size, bool debug = false) : size(size) {
        weights = new double[size];
        Uniform uniform = Uniform(0., 1.);
        for (int i = 0; i < size; i++) {
            weights[i] = uniform.sample();
        }
        free = true;
        _debug = debug;
    }

    /// Wraps a caller-provided weight array without taking ownership.
    /// Training writes into `_weights` directly; the caller keeps
    /// responsibility for releasing it and must keep it alive while the model
    /// is in use.
    LinearRegression(double *_weights, int size, bool debug = false) : size(size) {
        weights = _weights;
        free = false;
        _debug = debug;
    }

    /// Copies the model. An owned weight array is duplicated so that the two
    /// objects never release the same allocation; a borrowed array stays
    /// shared, exactly as it was in the source object.
    LinearRegression(const LinearRegression &other)
        : weights(other.weights), size(other.size), free(other.free), _debug(other._debug) {
        if (free) {
            weights = new double[size];
            std::copy(other.weights, other.weights + size, weights);
        }
    }

    /// Assignment counterpart of the copy constructor.
    LinearRegression &operator=(const LinearRegression &other) {
        if (this != &other) {
            double *replacement = other.weights;
            if (other.free) {
                // Allocate before releasing anything so a failed allocation
                // leaves this object untouched.
                replacement = new double[other.size];
                std::copy(other.weights, other.weights + other.size, replacement);
            }
            if (free) {
                delete[] weights;
            }
            weights = replacement;
            size = other.size;
            free = other.free;
            _debug = other._debug;
        }
        return *this;
    }

    /// Dot product of the first `size` elements of `in` and `w`.
    double dot(double *in, double *w) {
        double total = 0.0;
        for (int i = 0; i < size; i++) {
            total += in[i] * w[i];
        }
        return total;
    }

    /// Applies one gradient-descent step: `weights -= learning_rate * gradient`.
    /// @param gradient array of `size` partial derivatives
    void update(double *gradient) {
        double learning_rate = 0.011;
        for (int i = 0; i < size; i++) {
            weights[i] += -(learning_rate * gradient[i]);
        }
    }

    /// Runs `epochs` full passes of learn() over the first `length` rows.
    /// @param dataset `length` rows of `size` features each
    /// @param output  `length` target values, one per row
    void train(int epochs, double **dataset, int length, double *output) {
        for (int i = 0; i < epochs; i++) {
            if (_debug) {
                std::cout << "Epoch: " << i << " ";
            }
            learn(dataset, length, output);
        }
    }

    /// Prints one integer per row of `dataset` to stdout: the model output
    /// truncated toward zero and clamped to the range [1, 8].
    void predict(double **dataset, int length) {
        for (int i = 0; i < length; i++) {
            const double raw = dot(dataset[i], weights);
            int pred;
            // Clamp in floating point before converting: casting NaN or a
            // value outside the range of int is undefined behaviour.
            if (!(raw >= 1.0)) {
                pred = 1;  // below the scale, or NaN
            } else if (raw >= 8.0) {
                pred = 8;
            } else {
                pred = static_cast<int>(raw);
            }
            std::cout << pred << '\n';
        }
    }

    /// Prints the current weights to stdout.
    void debug() {
        print(weights, size);
    }

    /// Performs one epoch: accumulates the gradient of the mean squared error
    /// over all `length` rows, applies a single update() step and, in debug
    /// mode, prints the root-mean-square error of the pre-update weights.
    void learn(double **dataset, int length, double *output) {
        double error = 0.0;
        std::vector<double> gradient(size, 0.0);
        double error_count = 0;
        double correct_count = 0;
        for (int i = 0; i < length; i++) {
            double prediction = dot(dataset[i], weights);
            double single_error = output[i] - prediction;
            // A row counts as "correct" when floor(error) is -1, 0 or 1,
            // i.e. the error lies in [-1, 2). Used for the debug line only.
            const double floored = std::abs(std::floor(single_error));
            if (floored == 0 || floored == 1) {
                correct_count++;
            } else {
                error_count++;
            }
            error += single_error * single_error;
            for (int j = 0; j < size; j++) {
                gradient[j] += (-2. / length) * (dataset[i][j] * single_error);
            }
        }
        update(gradient.data());
        error *= 1. / length;
        error = std::sqrt(error);
        if (_debug) {
            std::cout << "Error: " << error << " total: " << error_count
                      << " - " << 100. * ((correct_count - error_count) / length) << std::endl;
        }
    }

    /// Releases the weight array when it was allocated by this object.
    ~LinearRegression() {
        if (free) {
            delete[] weights;  // allocated with new[]
        }
    }
};
/// Subject names used as input features. The position of a name in this list
/// is the index of its grade in the vector built by from_json_to_input().
std::vector<std::string> schema = {
    "Accountancy",
    "Biology",
    "BusinessStudies",
    "Chemistry",
    "ComputerScience",
    "Economics",
    "English",
    "Hindi",
    "Physics",
    "PhysicalEducation",
};

/// Name of the subject whose grade is the regression target.
const std::string label = "Mathematics";
/// Converts a parsed record into a fixed-width feature vector.
///
/// Slot `i` receives the grade stored under `schema[i]`, or 0 when the record
/// has no entry for that subject. The last slot (index 10) is always 0.
///
/// @param grades subject name -> grade, as produced by parse(); not modified
/// @return a `new[]`-allocated array of 11 doubles; the caller must release
///         it with `delete[]`
double* from_json_to_input(std::map<std::string, double>& grades) {
    const std::size_t kInputWidth = 11;
    // Value-initialization zeroes every slot, including the trailing one.
    double *wgrades = new double[kInputWidth]();
    // The second bound keeps writes inside the buffer (and leaves the last
    // slot untouched) even if schema ever grows past ten entries.
    for (std::size_t i = 0; i < schema.size() && i < kInputWidth - 1; ++i) {
        std::map<std::string, double>::const_iterator entry = grades.find(schema[i]);
        if (entry != grades.end()) {
            wgrades[i] = entry->second;
        }
    }
    return wgrades;
}
/// Minimal parser for one flat JSON object whose values are numbers, e.g.
/// `{"Physics":8,"Mathematics":6}`.
///
/// Each key is the text between a pair of double quotes; its value is the
/// text after the following ':' up to the next double quote (the start of the
/// next key) or, for the last pair, up to the closing '}'. The value text is
/// converted with atof, which ignores a trailing comma or whitespace and
/// yields 0 for non-numeric text. Quoted string values and nested objects are
/// not supported.
///
/// Parsing stops at the first key that is unterminated or has no ':' after
/// it; the pairs read so far are returned.
///
/// @param json text of a single object
/// @return map from key to numeric value
std::map<std::string, double> parse(std::string json) {
    std::map<std::string, double> json_obj;

    const std::ptrdiff_t total_left_brackets = std::count(json.begin(), json.end(), '{');
    const std::ptrdiff_t total_right_brackets = std::count(json.begin(), json.end(), '}');
    assert(total_left_brackets == total_right_brackets);

    std::size_t key_open = json.find('"');
    while (key_open != std::string::npos) {
        const std::size_t key_close = json.find('"', key_open + 1);
        if (key_close == std::string::npos) {
            break;  // unterminated key: continuing would wrap npos and may never end
        }
        const std::string key = json.substr(key_open + 1, key_close - key_open - 1);

        const std::size_t colon = json.find(':', key_close + 1);
        if (colon == std::string::npos) {
            break;  // key without a value
        }

        // The opening quote of the next key doubles as the end of this value.
        const std::size_t next_key = json.find('"', colon + 1);
        std::size_t value_end = next_key;
        if (value_end == std::string::npos) {
            value_end = json.find('}', colon + 1);
        }
        // Take everything between ':' and the terminator (or the rest of the
        // string when there is none) so the final digit is never dropped.
        const std::size_t value_len =
            (value_end == std::string::npos) ? std::string::npos : value_end - colon - 1;
        const std::string value = json.substr(colon + 1, value_len);

        json_obj[key] = std::atof(value.c_str());
        key_open = next_key;
    }
    return json_obj;
}
void from_file(const char* filename, double **dataset, double *output) {
int l = 0;
int nread = 0;
char *buff = new char[1024];
size_t len;
FILE *fp = fopen(filename, "r");
while ((nread = getline(&buff, &len, fp)) != -1) {
if (l == 0) { l++; continue; }
map<string, double> json = parse(string(buff));
dataset[l-1] = from_json_to_input(json);
output[l-1] = json["Mathematics"];
l++;
}
delete buff;
fclose(fp);
}
/// Entry point.
///
/// Reads a record count `t` from stdin, loads "training.json", fits an
/// 11-weight model for 280 epochs, then reads `t` whitespace-delimited JSON
/// records from stdin and prints one predicted grade per record.
///
/// @return 0 on success, 1 when the record count cannot be read
int main() {
    // Capacity of the training buffers; from_file() does not bounds-check, so
    // the training file must not contain more records than this.
    const int kTrainingCapacity = 79465;

    int t = 0;
    if (!(std::cin >> t) || t < 0) {
        std::cerr << "expected a non-negative record count on stdin" << std::endl;
        return 1;
    }

    // Rows start out null so the number of rows actually loaded can be counted.
    std::vector<double*> train(kTrainingCapacity, nullptr);
    std::vector<double> output(kTrainingCapacity, 0.0);
    from_file("training.json", train.data(), output.data());

    int loaded = 0;
    while (loaded < kTrainingCapacity && train[loaded] != nullptr) {
        ++loaded;
    }

    LinearRegression regressor(11, false);
    // NOTE(review): only the first `t` training rows are used, where `t` is
    // the number of records to predict. That looks accidental (all loaded
    // rows are available), but it is kept because changing it alters the
    // predictions. The bound is clamped so unloaded rows are never touched.
    const int train_rows = std::min(t, loaded);
    regressor.train(280, train.data(), train_rows, output.data());

    std::vector<double*> dataset(t, nullptr);
    for (int i = 0; i < t; i++) {
        std::string data;
        std::cin >> data;
        std::map<std::string, double> json = parse(data);
        dataset[i] = from_json_to_input(json);
    }
    regressor.predict(dataset.data(), t);

    // Feature vectors come from new[] in from_json_to_input().
    for (int i = 0; i < t; ++i) {
        delete [] dataset[i];
    }
    for (int i = 0; i < loaded; ++i) {
        delete [] train[i];
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment