Created
September 14, 2013 15:10
-
-
Save ybenjo/6562777 to your computer and use it in GitHub Desktop.
multiclass logistic regression (bug)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
// A <class_id, feature_id> weight key.
using key = std::pair<int, int>;
// Equality functor for <class_id, feature_id> keys in unordered_map.
// std::binary_function was deprecated in C++11 and removed in C++17,
// so the struct no longer derives from it (the base added nothing needed).
struct myeq{
  bool operator()(const std::pair<int, int>& x, const std::pair<int, int>& y) const{
    return x.first == y.first && x.second == y.second;
  }
};
// Hash functor for <class_id, feature_id> keys: combines the two int
// hashes boost::hash_combine-style with the golden-ratio constant.
// std::unary_function was deprecated in C++11 and removed in C++17,
// so the struct no longer derives from it (the base added nothing needed).
struct myhash{
private:
  const std::hash<int> h_int;
public:
  myhash() : h_int() {}
  size_t operator()(const std::pair<int, int>& p) const{
    size_t seed = h_int(p.first);
    return h_int(p.second) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  }
};
// Split `s` on any character contained in `delims`.
// Empty tokens are preserved (e.g. "a::b" on ":" -> {"a", "", "b"}) and an
// empty input yields a single empty token, matching the original behavior
// that read_document relies on.
// Fixes: parameters are taken by const reference (no copies), and the scan
// position uses std::string::size_type instead of int, which previously
// held npos only via an accidental -1 conversion.
std::vector<std::string> split_string(const std::string& s, const std::string& delims){
  std::vector<std::string> tokens;
  std::string::size_type start = 0;
  while(start <= s.length()){
    std::string::size_type end = s.find_first_of(delims, start);
    if(end == std::string::npos) end = s.length();
    tokens.push_back(s.substr(start, end - start));
    start = end + 1;
  }
  return tokens;
}
class MultiLogReg{ | |
public: | |
MultiLogReg(){ | |
srand(0); | |
_lim = 0.001; | |
} | |
MultiLogReg(double eta = 0.0001){ | |
_eta = eta; | |
} | |
void set_feature_id(const std::string& feature, int feature_id){ | |
_feature_id[feature] = feature_id; | |
_id_feature[feature_id] = feature; | |
} | |
int get_feature_id(const std::string& feature){ | |
int feature_id = 0; | |
if(_feature_id.find(feature) == _feature_id.end()){ | |
feature_id = _feature_id.size(); | |
set_feature_id(feature, feature_id); | |
}else{ | |
_feature_id[feature] = feature_id; | |
} | |
return feature_id; | |
} | |
void set_label_id(const std::string& label, int label_id){ | |
_label_id[label] = label_id; | |
_id_label[label_id] = label; | |
} | |
int get_label_id(const std::string& label){ | |
int label_id = 0; | |
if(_label_id.find(label) == _label_id.end()){ | |
label_id = _label_id.size(); | |
set_label_id(label, label_id); | |
}else{ | |
label_id = _label_id[label]; | |
} | |
return label_id; | |
} | |
void read_document(char* filename){ | |
std::ifstream ifs; | |
ifs.open(filename, std::ios::in); | |
std::string line; | |
while(getline(ifs, line)){ | |
// file format | |
// label \t w_1:count \t w_2:count ... | |
std::vector<std::string> elem = split_string(line, "\t"); | |
std::string label = elem.at(0); | |
int label_id = get_label_id(label); | |
_labels.push_back(label_id); | |
std::vector<std::pair<int, double> > doc(elem.size() - 1); | |
for(int i = 1; i < elem.size(); ++i){ | |
std::string e = elem.at(i); | |
std::vector<std::string> ary = split_string(line, ":"); | |
std::string feature = ary.at(0); | |
int feature_id = get_feature_id(feature); | |
double count = atof((ary.at(1)).c_str()); | |
doc.at(i - 1) = std::make_pair(feature_id, count); | |
} | |
_documents.push_back(doc); | |
} | |
ifs.close(); | |
_N = _documents.size(); | |
_C = _label_id.size(); | |
_K = _feature_id.size(); | |
} | |
void initialize_w(){ | |
for(int c = 0; c < _C; ++c){ | |
for(int k = 0; k < _K; ++k){ | |
double r = rand() / RAND_MAX; | |
_w[std::make_pair(c, k)] = r; | |
} | |
} | |
} | |
// read w | |
void set_w(char* filename){ | |
} | |
double logsumexp(double x, double y, bool init_flag){ | |
if(init_flag){ | |
return y; | |
} | |
if(x == y){ | |
return x + log(2); | |
} | |
double vmin = std::min(x, y); | |
double vmax = std::max(x, y); | |
if(vmax > vmin + 50){ | |
return vmax; | |
}else{ | |
return vmax + log(exp(vmin - vmax) + 1.0); | |
} | |
} | |
double prod(const std::vector<std::pair<int, double> >& doc, int class_id){ | |
double score = 0.0; | |
// each feature - value pair | |
std::vector<std::pair<int, double> >::const_iterator it; | |
for(it = doc.begin(); it != doc.end(); ++it){ | |
std::pair<int, double> key = *it; | |
int feature_id = key.first; | |
double val = key.second; | |
score += val * _w[std::make_pair(class_id, feature_id)]; | |
} | |
return score; | |
} | |
std::unordered_map<int, double> predict_document(const std::vector<std::pair<int, double> >& doc){ | |
std::unordered_map<int, double> scores; | |
double sum = 0.0; | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
double val = prod(doc, class_id); | |
scores[class_id] = val; | |
sum = logsumexp(sum, val, class_id == 0); | |
} | |
// normalize | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
scores[class_id] = exp(scores[class_id] - sum); | |
} | |
return scores; | |
} | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> gradient_E(){ | |
// key: <class_id, feature_id> | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> grad; | |
std::vector<std::pair<int, double> >::iterator it; | |
for(int doc_pos = 0; doc_pos < _N; ++doc_pos){ | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_pos); | |
std::unordered_map<int, double> scores = predict_document(doc); | |
int label = _labels.at(doc_pos); | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
double t = label == class_id ? 1.0 : 0.0; | |
double y = scores[class_id]; | |
double diff = (y - t); | |
for(it = doc.begin(); it != doc.end(); ++it){ | |
std::pair<int, double> key = *it; | |
int feature_id = key.first; | |
double val = key.second; | |
grad[std::make_pair(class_id, feature_id)] += (diff * val); | |
} | |
} | |
} | |
return grad; | |
} | |
void update_w(){ | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> grad = gradient_E(); | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
for(int feature_id = 0; feature_id < _K; ++feature_id){ | |
std::pair<int, int> key = std::make_pair(class_id, feature_id); | |
// w_new = w_old - eta * grad | |
_w[key] = _w[key] - _eta * grad[key]; | |
} | |
} | |
} | |
// converge? | |
bool converge(){ | |
double val = cross_entropy_error(); | |
_cee.push_back(val); | |
if(_cee.size() < 2) return false; | |
return fabs(_cee.at(_cee.size() - 1) - _cee.at(_cee.size() - 2)) < _lim; | |
} | |
void train(){ | |
initialize_w(); | |
while(!converge()){ | |
update_w(); | |
predict(); | |
} | |
} | |
double cross_entropy_error(){ | |
double ret; | |
for(int doc_id = 0; doc_id < _N; ++doc_id){ | |
int label = _labels.at(doc_id); | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_id); | |
std::unordered_map<int, double> scores = predict_document(doc); | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
double t = label == class_id ? 1.0 : 0.0; | |
double y = scores[class_id]; | |
ret += t * log(y); | |
} | |
} | |
return -ret; | |
} | |
void predict(){ | |
double correct = 0.0; | |
for(int doc_id = 0; doc_id < _N; ++doc_id){ | |
int answer = _labels.at(doc_id); | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_id); | |
std::unordered_map<int, double> scores = predict_document(doc); | |
int predict_id = -1; | |
double max_score = -1.0; | |
std::unordered_map<int, double>::iterator it; | |
for(it = scores.begin(); it != scores.end(); ++it){ | |
int id = (*it).first; | |
double val = (*it).second; | |
if(max_score < val){ | |
predict_id = id; | |
max_score = val; | |
} | |
} | |
if(answer == predict_id) ++correct; | |
} | |
std::cout <<"accuracy:" << correct / _N << std::endl; | |
} | |
// output model | |
void output_model(char* filename){ | |
// output model | |
std::ostringstream oss; | |
oss << filename << ".model"; | |
std::ofstream ofs; | |
ofs.open((oss.str()).c_str()); | |
ofs.close(); | |
} | |
void output_results(char* filename){ | |
// output results | |
std::ostringstream oss; | |
oss << filename << ".results"; | |
std::ofstream ofs; | |
ofs.open((oss.str()).c_str()); | |
ofs.close(); | |
} | |
private: | |
// weights | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> _w; | |
// documents | |
std::vector<std::vector<std::pair<int, double> > > _documents; | |
// label | |
std::vector<int> _labels; | |
// feature dictionary | |
std::unordered_map<std::string, int> _feature_id; | |
std::unordered_map<int, std::string> _id_feature; | |
// labels | |
std::unordered_map<std::string, int> _label_id; | |
std::unordered_map<int, std::string> _id_label; | |
// cross entropy error | |
std::vector<double> _cee; | |
double _lim; | |
// parameters | |
double _eta; | |
// size of classes | |
int _C; | |
// size of documents | |
int _N; | |
// size of features | |
int _K; | |
}; | |
// Entry point: train a multiclass logistic regression on the given file.
// BUGFIX: argv[1] was dereferenced without checking argc, which is
// undefined behavior when the program is run with no arguments.
int main(int argc, char** argv){
  if(argc < 2){
    std::cerr << "usage: " << argv[0] << " <training_file>" << std::endl;
    return 1;
  }
  MultiLogReg t(0.1);
  t.read_document(argv[1]);
  t.train();
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment