Skip to content

Instantly share code, notes, and snippets.

@ybenjo
Last active December 24, 2015 05:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ybenjo/6751374 to your computer and use it in GitHub Desktop.
Save ybenjo/6751374 to your computer and use it in GitHub Desktop.
Multi-Class Logistic Regression Using SGD.
#include <iostream>
#include <fstream>
#include <sstream>
#include <unordered_map>
#include <vector>
#include <string>
#include <algorithm>
#include <boost/random.hpp>
#include <iomanip>
struct myeq : std::binary_function<std::pair<int, int>, std::pair<int, int>, bool>{
bool operator() (const std::pair<int, int> & x, const std::pair<int, int> & y) const{
return x.first == y.first && x.second == y.second;
}
};
struct myhash : std::unary_function<std::pair<int, int>, size_t>{
private:
const std::hash<int> h_int;
public:
myhash() : h_int() {}
size_t operator()(const std::pair<int, int> & p) const{
size_t seed = h_int(p.first);
return h_int(p.second) + 0x9e3779b9 + (seed<<6) + (seed>>2);
}
};
template<class D, class G = boost::mt19937>
class DICE {
G gen_;
D dst_;
boost::variate_generator<G, D> rand_;
public:
DICE() : gen_(static_cast<unsigned long>(1)), rand_(gen_, dst_) {}
template<typename T1>
DICE(T1 a1) : gen_(static_cast<unsigned long>(1)), dst_(a1), rand_(gen_, dst_) {}
template<typename T1, typename T2>
DICE(T1 a1, T2 a2) : gen_(static_cast<unsigned long>(1)), dst_(a1, a2), rand_(gen_, dst_) {}
typename D::result_type operator()() { return rand_(); }
};
std::vector<std::string> split_string(std::string s, std::string c){
std::vector<std::string> ret;
for(int i = 0, n = 0; i <= s.length(); i = n + 1){
n = s.find_first_of(c, i);
if(n == std::string::npos) n = s.length();
std::string tmp = s.substr(i, n-i);
ret.push_back(tmp);
}
return ret;
}
class MultiLogReg{
public:
MultiLogReg(int iter_lim){
srand(0);
_lim = 0.000000000000001;
_eta = 1.0;
_iter = 1;
_iter_lim = iter_lim;
}
void set_feature_id(const std::string& feature, int feature_id){
_feature_id[feature] = feature_id;
_id_feature[feature_id] = feature;
}
int get_feature_id(const std::string& feature){
int feature_id = 0;
if(_feature_id.find(feature) == _feature_id.end()){
feature_id = _feature_id.size();
set_feature_id(feature, feature_id);
}else{
feature_id = _feature_id[feature];
}
return feature_id;
}
void set_label_id(const std::string& label, int label_id){
_label_id[label] = label_id;
_id_label[label_id] = label;
}
int get_label_id(const std::string& label){
int label_id = 0;
if(_label_id.find(label) == _label_id.end()){
label_id = _label_id.size();
set_label_id(label, label_id);
}else{
label_id = _label_id[label];
}
return label_id;
}
void read_document(char* filename){
std::ifstream ifs;
ifs.open(filename, std::ios::in);
std::string line;
while(getline(ifs, line)){
// file format
// label \t w_1:count \t w_2:count ...
std::vector<std::string> elem = split_string(line, "\t");
std::string label = elem.at(0);
int label_id = get_label_id(label);
_labels.push_back(label_id);
std::vector<std::pair<int, double> > doc(elem.size() - 1);
for(int i = 1; i < elem.size(); ++i){
std::string e = elem.at(i);
std::vector<std::string> ary = split_string(e, ":");
std::string feature = ary.at(0);
int feature_id = get_feature_id(feature);
double val = atof((ary.at(1)).c_str());
doc.at(i - 1) = std::make_pair(feature_id, val);
}
_documents.push_back(doc);
}
ifs.close();
}
void set_const(){
_N = _documents.size();
_C = _label_id.size();
_K = _feature_id.size();
DICE<boost::uniform_int<> > rand_t(0, (_N - 1));
_rand_t = rand_t;
}
void initialize_w(){
for(int c = 0; c < _C; ++c){
for(int k = 0; k < _K; ++k){
double r = rand() / RAND_MAX;
_w[std::make_pair(c, k)] = r;
}
}
}
double logsumexp(double x, double y, bool init_flag){
if(init_flag){
return y;
}
if(x == y){
return x + log(2);
}
double vmin = std::min(x, y);
double vmax = std::max(x, y);
if(vmax > vmin + 50){
return vmax;
}else{
return vmax + log(exp(vmin - vmax) + 1.0);
}
}
double prod(const std::vector<std::pair<int, double> >& doc, int label_id){
double score = 0.0;
// each feature - value pair
std::vector<std::pair<int, double> >::const_iterator it;
for(it = doc.begin(); it != doc.end(); ++it){
std::pair<int, double> key = *it;
int feature_id = key.first;
double val = key.second;
score += val * _w[std::make_pair(label_id, feature_id)];
}
return score;
}
std::unordered_map<int, double> predict_document(const std::vector<std::pair<int, double> >& doc){
std::unordered_map<int, double> scores;
double sum = 0.0;
for(int label_id = 0; label_id < _C; ++label_id){
double val = prod(doc, label_id);
scores[label_id] = val;
sum = logsumexp(sum, val, label_id == 0);
}
// normalize
for(int label_id = 0; label_id < _C; ++label_id){
scores[label_id] = exp(scores[label_id] - sum);
}
return scores;
}
// update gradient E
void gradient_E(int doc_id){
_e.clear();
std::vector<std::pair<int, double> > doc = _documents.at(doc_id);
std::vector<std::pair<int, double> >::iterator it;
std::unordered_map<int, double> scores = predict_document(doc);
int label = _labels.at(doc_id);
for(int label_id = 0; label_id < _C; ++label_id){
double t = label == label_id ? 1.0 : 0.0;
double y = scores[label_id];
double diff = (t - y);
for(it = doc.begin(); it != doc.end(); ++it){
std::pair<int, double> key = *it;
int feature_id = key.first;
double val = key.second;
_e[std::make_pair(label_id, feature_id)] += diff * val;
}
}
}
void update_w(){
for(int i = 0; i < 10; ++i){
int doc_id = _rand_t();
gradient_E(doc_id);
}
std::unordered_map<std::pair<int, int>, double, myhash, myeq>::iterator it;
for(it = _e.begin(); it != _e.end(); ++it){
std::pair<int, int> key = (*it).first;
double val = (*it).second;
double old_w = _w[key];
// Use L2-reguraliztion
_w[key] += _eta * (val - _lambda * old_w);
}
}
void set_eta(int iter){
_eta = 1.0 / iter;
}
// converge?
bool converge(){
std::vector<double> e;
std::unordered_map<std::pair<int, int>, double, myhash, myeq>::iterator it;
for(it = _e.begin(); it != _e.end(); ++it){
e.push_back((*it).second);
}
if(e.size() == 0) return false;
double abs_max_e = fabs(*max_element(e.begin(), e.end()));
return abs_max_e <= _lim;
}
void train(double lambda){
_lambda = lambda;
set_const();
initialize_w();
_iter = 1;
while((!converge()) && (_iter < _iter_lim)){
set_eta(_iter);
update_w();
++_iter;
}
}
// too slow
double cross_entropy_error(){
double ret = 0.0;
for(int doc_id = 0; doc_id < _N; ++doc_id){
int label = _labels.at(doc_id);
std::vector<std::pair<int, double> > doc = _documents.at(doc_id);
std::unordered_map<int, double> scores = predict_document(doc);
for(int label_id = 0; label_id < _C; ++label_id){
double y = scores[label_id];
if(label == label_id){
ret += log(y);
}
}
}
return -ret;
}
std::unordered_map<std::pair<int, int>, double, myhash, myeq> predict(){
std::unordered_map<std::pair<int, int>, double, myhash, myeq> ret;
for(int doc_id = 0; doc_id < _N; ++doc_id){
std::vector<std::pair<int, double> > doc = _documents.at(doc_id);
std::unordered_map<int, double> scores = predict_document(doc);
for(int label_id = 0; label_id < _C; ++label_id){
double val = scores[label_id];
ret[std::make_pair(doc_id, label_id)] = val;
}
}
return ret;
}
double get_training_accuracy(){
double accuracy = 0.0;
std::unordered_map<std::pair<int, int>, double, myhash, myeq> ret = predict();
for(int doc_id = 0; doc_id < _N; ++doc_id){
double max_score = -1;
int max_label_id = -1;
for(int label_id = 0; label_id < _C; ++label_id){
double score = ret[std::make_pair(doc_id, label_id)];
if(score > max_score){
max_score = score;
max_label_id = label_id;
}
}
int ans_label_id = _labels.at(doc_id);
if(ans_label_id == max_label_id) ++accuracy;
}
return accuracy / _N;
}
// output model
void output_model(char* filename){
std::ofstream ofs;
ofs.open(filename);
// other information
ofs << "# iter:" << _iter<< std::endl;
ofs << "# training accuracy:" << get_training_accuracy() << std::endl;
std::unordered_map<std::string, int>::iterator it;
// output feature <=> id
for(it = _feature_id.begin(); it != _feature_id.end(); ++it){
std::string feature = (*it).first;
int id = (*it).second;
ofs << "feature\t" << feature << "\t" << id << std::endl;
}
// output label <=> id
for(it = _label_id.begin(); it != _label_id.end(); ++it){
std::string label = (*it).first;
int id = (*it).second;
ofs << "label\t" << label << "\t" << id << std::endl;
}
// output label, feature_id, weight_value
std::unordered_map<std::pair<int, int>, double, myhash, myeq>::iterator i;
for(i = _w.begin(); i != _w.end(); ++i){
int label_id = ((*i).first).first;
int feature_id = ((*i).first).second;
double val = (*i).second;
ofs << "w\t" << label_id << "\t" << feature_id<< "\t" << std::setprecision(20) << val << std::endl;
}
ofs.close();
}
void read_model(char* filename){
std::ifstream ifs;
ifs.open(filename, std::ios::in);
std::string line;
while(getline(ifs, line)){
std::vector<std::string> elem = split_string(line, "\t");
std::string type = elem.at(0);
if(type == "feature"){
// format: feature \t feature_name \t id
std::string feature_name = elem.at(1);
int feature_id = atoi((elem.at(2)).c_str());
set_feature_id(feature_name, feature_id);
}else if(type == "label"){
// format: label \t label_name \t id
std::string label_name = elem.at(1);
int label_id = atoi((elem.at(2)).c_str());
set_label_id(label_name, label_id);
}else if(type == "w"){
// format: w \t label_id \t feature_id \t val
int label_id = atoi((elem.at(1)).c_str());
int feature_id = atoi((elem.at(2)).c_str());
double val = atof((elem.at(3)).c_str());
_w[std::make_pair(label_id, feature_id)] = val;
}
set_const();
}
ifs.close();
}
void output_results(char* filename){
// output results
std::ofstream ofs;
ofs.open(filename);
std::unordered_map<std::pair<int, int>, double, myhash, myeq> scores = predict();
for(int doc_id = 0; doc_id < _N; ++doc_id){
for(int label_id = 0; label_id < _C; ++label_id){
double val = scores[std::make_pair(doc_id, label_id)];
ofs << doc_id << "\t" << _id_label[label_id] << "\t" << val << std::endl;
}
}
ofs.close();
}
private:
// weights
std::unordered_map<std::pair<int, int>, double, myhash, myeq> _w;
// documents
std::vector<std::vector<std::pair<int, double> > > _documents;
// label
std::vector<int> _labels;
// feature dictionary
std::unordered_map<std::string, int> _feature_id;
std::unordered_map<int, std::string> _id_feature;
// labels
std::unordered_map<std::string, int> _label_id;
std::unordered_map<int, std::string> _id_label;
// cross entropy error
std::vector<double> _cee;
// converge
double _lim;
// parameters
double _eta;
// size of classes
int _C;
// size of documents
int _N;
// size of features
int _K;
// regularization
double _lambda;
// gradient(E)
std::unordered_map<std::pair<int, int>, double, myhash, myeq> _e;
// iteration of update
int _iter;
int _iter_lim;
DICE<boost::uniform_int<> > _rand_t;
};
int main(int argc, char** argv){
std::string type = argv[1];
char *doc_file = argv[2];
char *model_file = argv[3];
int iter_lim = 5000;
if(argc > 5){
iter_lim = atoi(argv[5]);
}
MultiLogReg t(iter_lim);
t.read_document(doc_file);
if(type == "train"){
double lambda = atof(argv[4]);
t.train(lambda);
t.output_model(model_file);
}else if(type == "predict"){
t.read_model(model_file);
char *output_file = argv[4];
t.output_results(output_file);
}else{
std::cout << "usage:" << std::endl;
std::cout << "./a.out train doc_file model_file" << std::endl;
std::cout << "./a.out predict doc_file model_file result_file" << std::endl;
exit(1);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment