Last active
December 24, 2015 05:38
-
-
Save ybenjo/6751374 to your computer and use it in GitHub Desktop.
Multi-Class Logistic Regression Using SGD.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <boost/random.hpp>
// Equality functor for the std::pair<int, int> keys of the weight and
// gradient hash maps.
// NOTE: the original derived from std::binary_function, which was
// deprecated in C++11 and removed in C++17; no base class is needed.
struct myeq {
  bool operator() (const std::pair<int, int> & x, const std::pair<int, int> & y) const{
    return x.first == y.first && x.second == y.second;
  }
};
// Hash functor for std::pair<int, int> keys, mixing the two component
// hashes in the style of boost::hash_combine (0x9e3779b9 is the golden
// ratio constant).
// NOTE: the original derived from std::unary_function, which was
// deprecated in C++11 and removed in C++17; no base class is needed.
struct myhash {
private:
  const std::hash<int> h_int;
public:
  myhash() : h_int() {}
  size_t operator()(const std::pair<int, int> & p) const{
    size_t seed = h_int(p.first);
    return h_int(p.second) + 0x9e3779b9 + (seed<<6) + (seed>>2);
  }
};
template<class D, class G = boost::mt19937> | |
class DICE { | |
G gen_; | |
D dst_; | |
boost::variate_generator<G, D> rand_; | |
public: | |
DICE() : gen_(static_cast<unsigned long>(1)), rand_(gen_, dst_) {} | |
template<typename T1> | |
DICE(T1 a1) : gen_(static_cast<unsigned long>(1)), dst_(a1), rand_(gen_, dst_) {} | |
template<typename T1, typename T2> | |
DICE(T1 a1, T2 a2) : gen_(static_cast<unsigned long>(1)), dst_(a1, a2), rand_(gen_, dst_) {} | |
typename D::result_type operator()() { return rand_(); } | |
}; | |
// Split s on any character contained in c.
// Contract (matches the original): an empty input yields {""} and a
// trailing delimiter yields a trailing empty field.
// s: input line; c: set of delimiter characters.
std::vector<std::string> split_string(const std::string& s, const std::string& c){
  std::vector<std::string> ret;
  // size_type indices: find_first_of() returns std::string::size_type, and
  // the original's int counters truncated npos to -1 (a signed/unsigned
  // bug that only worked by accident of integer conversions).
  for(std::string::size_type i = 0, n = 0; i <= s.length(); i = n + 1){
    n = s.find_first_of(c, i);
    if(n == std::string::npos) n = s.length();
    ret.push_back(s.substr(i, n - i));
  }
  return ret;
}
class MultiLogReg{ | |
public: | |
MultiLogReg(int iter_lim){ | |
srand(0); | |
_lim = 0.000000000000001; | |
_eta = 1.0; | |
_iter = 1; | |
_iter_lim = iter_lim; | |
} | |
void set_feature_id(const std::string& feature, int feature_id){ | |
_feature_id[feature] = feature_id; | |
_id_feature[feature_id] = feature; | |
} | |
int get_feature_id(const std::string& feature){ | |
int feature_id = 0; | |
if(_feature_id.find(feature) == _feature_id.end()){ | |
feature_id = _feature_id.size(); | |
set_feature_id(feature, feature_id); | |
}else{ | |
feature_id = _feature_id[feature]; | |
} | |
return feature_id; | |
} | |
void set_label_id(const std::string& label, int label_id){ | |
_label_id[label] = label_id; | |
_id_label[label_id] = label; | |
} | |
int get_label_id(const std::string& label){ | |
int label_id = 0; | |
if(_label_id.find(label) == _label_id.end()){ | |
label_id = _label_id.size(); | |
set_label_id(label, label_id); | |
}else{ | |
label_id = _label_id[label]; | |
} | |
return label_id; | |
} | |
void read_document(char* filename){ | |
std::ifstream ifs; | |
ifs.open(filename, std::ios::in); | |
std::string line; | |
while(getline(ifs, line)){ | |
// file format | |
// label \t w_1:count \t w_2:count ... | |
std::vector<std::string> elem = split_string(line, "\t"); | |
std::string label = elem.at(0); | |
int label_id = get_label_id(label); | |
_labels.push_back(label_id); | |
std::vector<std::pair<int, double> > doc(elem.size() - 1); | |
for(int i = 1; i < elem.size(); ++i){ | |
std::string e = elem.at(i); | |
std::vector<std::string> ary = split_string(e, ":"); | |
std::string feature = ary.at(0); | |
int feature_id = get_feature_id(feature); | |
double val = atof((ary.at(1)).c_str()); | |
doc.at(i - 1) = std::make_pair(feature_id, val); | |
} | |
_documents.push_back(doc); | |
} | |
ifs.close(); | |
} | |
void set_const(){ | |
_N = _documents.size(); | |
_C = _label_id.size(); | |
_K = _feature_id.size(); | |
DICE<boost::uniform_int<> > rand_t(0, (_N - 1)); | |
_rand_t = rand_t; | |
} | |
void initialize_w(){ | |
for(int c = 0; c < _C; ++c){ | |
for(int k = 0; k < _K; ++k){ | |
double r = rand() / RAND_MAX; | |
_w[std::make_pair(c, k)] = r; | |
} | |
} | |
} | |
double logsumexp(double x, double y, bool init_flag){ | |
if(init_flag){ | |
return y; | |
} | |
if(x == y){ | |
return x + log(2); | |
} | |
double vmin = std::min(x, y); | |
double vmax = std::max(x, y); | |
if(vmax > vmin + 50){ | |
return vmax; | |
}else{ | |
return vmax + log(exp(vmin - vmax) + 1.0); | |
} | |
} | |
double prod(const std::vector<std::pair<int, double> >& doc, int label_id){ | |
double score = 0.0; | |
// each feature - value pair | |
std::vector<std::pair<int, double> >::const_iterator it; | |
for(it = doc.begin(); it != doc.end(); ++it){ | |
std::pair<int, double> key = *it; | |
int feature_id = key.first; | |
double val = key.second; | |
score += val * _w[std::make_pair(label_id, feature_id)]; | |
} | |
return score; | |
} | |
std::unordered_map<int, double> predict_document(const std::vector<std::pair<int, double> >& doc){ | |
std::unordered_map<int, double> scores; | |
double sum = 0.0; | |
for(int label_id = 0; label_id < _C; ++label_id){ | |
double val = prod(doc, label_id); | |
scores[label_id] = val; | |
sum = logsumexp(sum, val, label_id == 0); | |
} | |
// normalize | |
for(int label_id = 0; label_id < _C; ++label_id){ | |
scores[label_id] = exp(scores[label_id] - sum); | |
} | |
return scores; | |
} | |
// update gradient E | |
void gradient_E(int doc_id){ | |
_e.clear(); | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_id); | |
std::vector<std::pair<int, double> >::iterator it; | |
std::unordered_map<int, double> scores = predict_document(doc); | |
int label = _labels.at(doc_id); | |
for(int label_id = 0; label_id < _C; ++label_id){ | |
double t = label == label_id ? 1.0 : 0.0; | |
double y = scores[label_id]; | |
double diff = (t - y); | |
for(it = doc.begin(); it != doc.end(); ++it){ | |
std::pair<int, double> key = *it; | |
int feature_id = key.first; | |
double val = key.second; | |
_e[std::make_pair(label_id, feature_id)] += diff * val; | |
} | |
} | |
} | |
void update_w(){ | |
for(int i = 0; i < 10; ++i){ | |
int doc_id = _rand_t(); | |
gradient_E(doc_id); | |
} | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq>::iterator it; | |
for(it = _e.begin(); it != _e.end(); ++it){ | |
std::pair<int, int> key = (*it).first; | |
double val = (*it).second; | |
double old_w = _w[key]; | |
// Use L2-reguraliztion | |
_w[key] += _eta * (val - _lambda * old_w); | |
} | |
} | |
void set_eta(int iter){ | |
_eta = 1.0 / iter; | |
} | |
// converge? | |
bool converge(){ | |
std::vector<double> e; | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq>::iterator it; | |
for(it = _e.begin(); it != _e.end(); ++it){ | |
e.push_back((*it).second); | |
} | |
if(e.size() == 0) return false; | |
double abs_max_e = fabs(*max_element(e.begin(), e.end())); | |
return abs_max_e <= _lim; | |
} | |
void train(double lambda){ | |
_lambda = lambda; | |
set_const(); | |
initialize_w(); | |
_iter = 1; | |
while((!converge()) && (_iter < _iter_lim)){ | |
set_eta(_iter); | |
update_w(); | |
++_iter; | |
} | |
} | |
// too slow | |
double cross_entropy_error(){ | |
double ret = 0.0; | |
for(int doc_id = 0; doc_id < _N; ++doc_id){ | |
int label = _labels.at(doc_id); | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_id); | |
std::unordered_map<int, double> scores = predict_document(doc); | |
for(int label_id = 0; label_id < _C; ++label_id){ | |
double y = scores[label_id]; | |
if(label == label_id){ | |
ret += log(y); | |
} | |
} | |
} | |
return -ret; | |
} | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> predict(){ | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> ret; | |
for(int doc_id = 0; doc_id < _N; ++doc_id){ | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_id); | |
std::unordered_map<int, double> scores = predict_document(doc); | |
for(int label_id = 0; label_id < _C; ++label_id){ | |
double val = scores[label_id]; | |
ret[std::make_pair(doc_id, label_id)] = val; | |
} | |
} | |
return ret; | |
} | |
double get_training_accuracy(){ | |
double accuracy = 0.0; | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> ret = predict(); | |
for(int doc_id = 0; doc_id < _N; ++doc_id){ | |
double max_score = -1; | |
int max_label_id = -1; | |
for(int label_id = 0; label_id < _C; ++label_id){ | |
double score = ret[std::make_pair(doc_id, label_id)]; | |
if(score > max_score){ | |
max_score = score; | |
max_label_id = label_id; | |
} | |
} | |
int ans_label_id = _labels.at(doc_id); | |
if(ans_label_id == max_label_id) ++accuracy; | |
} | |
return accuracy / _N; | |
} | |
// output model | |
void output_model(char* filename){ | |
std::ofstream ofs; | |
ofs.open(filename); | |
// other information | |
ofs << "# iter:" << _iter<< std::endl; | |
ofs << "# training accuracy:" << get_training_accuracy() << std::endl; | |
std::unordered_map<std::string, int>::iterator it; | |
// output feature <=> id | |
for(it = _feature_id.begin(); it != _feature_id.end(); ++it){ | |
std::string feature = (*it).first; | |
int id = (*it).second; | |
ofs << "feature\t" << feature << "\t" << id << std::endl; | |
} | |
// output label <=> id | |
for(it = _label_id.begin(); it != _label_id.end(); ++it){ | |
std::string label = (*it).first; | |
int id = (*it).second; | |
ofs << "label\t" << label << "\t" << id << std::endl; | |
} | |
// output label, feature_id, weight_value | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq>::iterator i; | |
for(i = _w.begin(); i != _w.end(); ++i){ | |
int label_id = ((*i).first).first; | |
int feature_id = ((*i).first).second; | |
double val = (*i).second; | |
ofs << "w\t" << label_id << "\t" << feature_id<< "\t" << std::setprecision(20) << val << std::endl; | |
} | |
ofs.close(); | |
} | |
void read_model(char* filename){ | |
std::ifstream ifs; | |
ifs.open(filename, std::ios::in); | |
std::string line; | |
while(getline(ifs, line)){ | |
std::vector<std::string> elem = split_string(line, "\t"); | |
std::string type = elem.at(0); | |
if(type == "feature"){ | |
// format: feature \t feature_name \t id | |
std::string feature_name = elem.at(1); | |
int feature_id = atoi((elem.at(2)).c_str()); | |
set_feature_id(feature_name, feature_id); | |
}else if(type == "label"){ | |
// format: label \t label_name \t id | |
std::string label_name = elem.at(1); | |
int label_id = atoi((elem.at(2)).c_str()); | |
set_label_id(label_name, label_id); | |
}else if(type == "w"){ | |
// format: w \t label_id \t feature_id \t val | |
int label_id = atoi((elem.at(1)).c_str()); | |
int feature_id = atoi((elem.at(2)).c_str()); | |
double val = atof((elem.at(3)).c_str()); | |
_w[std::make_pair(label_id, feature_id)] = val; | |
} | |
set_const(); | |
} | |
ifs.close(); | |
} | |
void output_results(char* filename){ | |
// output results | |
std::ofstream ofs; | |
ofs.open(filename); | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> scores = predict(); | |
for(int doc_id = 0; doc_id < _N; ++doc_id){ | |
for(int label_id = 0; label_id < _C; ++label_id){ | |
double val = scores[std::make_pair(doc_id, label_id)]; | |
ofs << doc_id << "\t" << _id_label[label_id] << "\t" << val << std::endl; | |
} | |
} | |
ofs.close(); | |
} | |
private: | |
// weights | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> _w; | |
// documents | |
std::vector<std::vector<std::pair<int, double> > > _documents; | |
// label | |
std::vector<int> _labels; | |
// feature dictionary | |
std::unordered_map<std::string, int> _feature_id; | |
std::unordered_map<int, std::string> _id_feature; | |
// labels | |
std::unordered_map<std::string, int> _label_id; | |
std::unordered_map<int, std::string> _id_label; | |
// cross entropy error | |
std::vector<double> _cee; | |
// converge | |
double _lim; | |
// parameters | |
double _eta; | |
// size of classes | |
int _C; | |
// size of documents | |
int _N; | |
// size of features | |
int _K; | |
// regularization | |
double _lambda; | |
// gradient(E) | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> _e; | |
// iteration of update | |
int _iter; | |
int _iter_lim; | |
DICE<boost::uniform_int<> > _rand_t; | |
}; | |
// usage:
//   ./a.out train   doc_file model_file lambda      [iter_lim]
//   ./a.out predict doc_file model_file result_file [iter_lim]
int main(int argc, char** argv){
  // BUG FIX: the original dereferenced argv[1..4] before any argc check,
  // which is undefined behavior when arguments are missing.  Both modes
  // read argv[4] (lambda / result_file), so 5 arguments are required.
  if(argc < 5){
    std::cout << "usage:" << std::endl;
    std::cout << "./a.out train doc_file model_file lambda [iter_lim]" << std::endl;
    std::cout << "./a.out predict doc_file model_file result_file [iter_lim]" << std::endl;
    return 1;
  }
  std::string type = argv[1];
  char *doc_file = argv[2];
  char *model_file = argv[3];
  int iter_lim = 5000;
  if(argc > 5){
    iter_lim = atoi(argv[5]);
  }
  MultiLogReg t(iter_lim);
  t.read_document(doc_file);
  if(type == "train"){
    double lambda = atof(argv[4]);
    t.train(lambda);
    t.output_model(model_file);
  }else if(type == "predict"){
    t.read_model(model_file);
    char *output_file = argv[4];
    t.output_results(output_file);
  }else{
    std::cout << "usage:" << std::endl;
    std::cout << "./a.out train doc_file model_file lambda [iter_lim]" << std::endl;
    std::cout << "./a.out predict doc_file model_file result_file [iter_lim]" << std::endl;
    return 1;
  }
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment