Skip to content

Instantly share code, notes, and snippets.

@ybenjo
Created September 14, 2013 15:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ybenjo/6562777 to your computer and use it in GitHub Desktop.
Save ybenjo/6562777 to your computer and use it in GitHub Desktop.
multiclass logistic regression (bug)
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
typedef std::pair<int, int> key;
struct myeq : std::binary_function<std::pair<int, int>, std::pair<int, int>, bool>{
bool operator() (const std::pair<int, int> & x, const std::pair<int, int> & y) const{
return x.first == y.first && x.second == y.second;
}
};
struct myhash : std::unary_function<std::pair<int, int>, size_t>{
private:
const std::hash<int> h_int;
public:
myhash() : h_int() {}
size_t operator()(const std::pair<int, int> & p) const{
size_t seed = h_int(p.first);
return h_int(p.second) + 0x9e3779b9 + (seed<<6) + (seed>>2);
}
};
std::vector<std::string> split_string(std::string s, std::string c){
std::vector<std::string> ret;
for(int i = 0, n = 0; i <= s.length(); i = n + 1){
n = s.find_first_of(c, i);
if(n == std::string::npos) n = s.length();
std::string tmp = s.substr(i, n-i);
ret.push_back(tmp);
}
return ret;
}
class MultiLogReg{
public:
MultiLogReg(){
srand(0);
_lim = 0.001;
}
MultiLogReg(double eta = 0.0001){
_eta = eta;
}
void set_feature_id(const std::string& feature, int feature_id){
_feature_id[feature] = feature_id;
_id_feature[feature_id] = feature;
}
int get_feature_id(const std::string& feature){
int feature_id = 0;
if(_feature_id.find(feature) == _feature_id.end()){
feature_id = _feature_id.size();
set_feature_id(feature, feature_id);
}else{
_feature_id[feature] = feature_id;
}
return feature_id;
}
void set_label_id(const std::string& label, int label_id){
_label_id[label] = label_id;
_id_label[label_id] = label;
}
int get_label_id(const std::string& label){
int label_id = 0;
if(_label_id.find(label) == _label_id.end()){
label_id = _label_id.size();
set_label_id(label, label_id);
}else{
label_id = _label_id[label];
}
return label_id;
}
void read_document(char* filename){
std::ifstream ifs;
ifs.open(filename, std::ios::in);
std::string line;
while(getline(ifs, line)){
// file format
// label \t w_1:count \t w_2:count ...
std::vector<std::string> elem = split_string(line, "\t");
std::string label = elem.at(0);
int label_id = get_label_id(label);
_labels.push_back(label_id);
std::vector<std::pair<int, double> > doc(elem.size() - 1);
for(int i = 1; i < elem.size(); ++i){
std::string e = elem.at(i);
std::vector<std::string> ary = split_string(line, ":");
std::string feature = ary.at(0);
int feature_id = get_feature_id(feature);
double count = atof((ary.at(1)).c_str());
doc.at(i - 1) = std::make_pair(feature_id, count);
}
_documents.push_back(doc);
}
ifs.close();
_N = _documents.size();
_C = _label_id.size();
_K = _feature_id.size();
}
void initialize_w(){
for(int c = 0; c < _C; ++c){
for(int k = 0; k < _K; ++k){
double r = rand() / RAND_MAX;
_w[std::make_pair(c, k)] = r;
}
}
}
// read w
void set_w(char* filename){
}
double logsumexp(double x, double y, bool init_flag){
if(init_flag){
return y;
}
if(x == y){
return x + log(2);
}
double vmin = std::min(x, y);
double vmax = std::max(x, y);
if(vmax > vmin + 50){
return vmax;
}else{
return vmax + log(exp(vmin - vmax) + 1.0);
}
}
double prod(const std::vector<std::pair<int, double> >& doc, int class_id){
double score = 0.0;
// each feature - value pair
std::vector<std::pair<int, double> >::const_iterator it;
for(it = doc.begin(); it != doc.end(); ++it){
std::pair<int, double> key = *it;
int feature_id = key.first;
double val = key.second;
score += val * _w[std::make_pair(class_id, feature_id)];
}
return score;
}
std::unordered_map<int, double> predict_document(const std::vector<std::pair<int, double> >& doc){
std::unordered_map<int, double> scores;
double sum = 0.0;
for(int class_id = 0; class_id < _C; ++class_id){
double val = prod(doc, class_id);
scores[class_id] = val;
sum = logsumexp(sum, val, class_id == 0);
}
// normalize
for(int class_id = 0; class_id < _C; ++class_id){
scores[class_id] = exp(scores[class_id] - sum);
}
return scores;
}
std::unordered_map<std::pair<int, int>, double, myhash, myeq> gradient_E(){
// key: <class_id, feature_id>
std::unordered_map<std::pair<int, int>, double, myhash, myeq> grad;
std::vector<std::pair<int, double> >::iterator it;
for(int doc_pos = 0; doc_pos < _N; ++doc_pos){
std::vector<std::pair<int, double> > doc = _documents.at(doc_pos);
std::unordered_map<int, double> scores = predict_document(doc);
int label = _labels.at(doc_pos);
for(int class_id = 0; class_id < _C; ++class_id){
double t = label == class_id ? 1.0 : 0.0;
double y = scores[class_id];
double diff = (y - t);
for(it = doc.begin(); it != doc.end(); ++it){
std::pair<int, double> key = *it;
int feature_id = key.first;
double val = key.second;
grad[std::make_pair(class_id, feature_id)] += (diff * val);
}
}
}
return grad;
}
void update_w(){
std::unordered_map<std::pair<int, int>, double, myhash, myeq> grad = gradient_E();
for(int class_id = 0; class_id < _C; ++class_id){
for(int feature_id = 0; feature_id < _K; ++feature_id){
std::pair<int, int> key = std::make_pair(class_id, feature_id);
// w_new = w_old - eta * grad
_w[key] = _w[key] - _eta * grad[key];
}
}
}
// converge?
bool converge(){
double val = cross_entropy_error();
_cee.push_back(val);
if(_cee.size() < 2) return false;
return fabs(_cee.at(_cee.size() - 1) - _cee.at(_cee.size() - 2)) < _lim;
}
void train(){
initialize_w();
while(!converge()){
update_w();
predict();
}
}
double cross_entropy_error(){
double ret;
for(int doc_id = 0; doc_id < _N; ++doc_id){
int label = _labels.at(doc_id);
std::vector<std::pair<int, double> > doc = _documents.at(doc_id);
std::unordered_map<int, double> scores = predict_document(doc);
for(int class_id = 0; class_id < _C; ++class_id){
double t = label == class_id ? 1.0 : 0.0;
double y = scores[class_id];
ret += t * log(y);
}
}
return -ret;
}
void predict(){
double correct = 0.0;
for(int doc_id = 0; doc_id < _N; ++doc_id){
int answer = _labels.at(doc_id);
std::vector<std::pair<int, double> > doc = _documents.at(doc_id);
std::unordered_map<int, double> scores = predict_document(doc);
int predict_id = -1;
double max_score = -1.0;
std::unordered_map<int, double>::iterator it;
for(it = scores.begin(); it != scores.end(); ++it){
int id = (*it).first;
double val = (*it).second;
if(max_score < val){
predict_id = id;
max_score = val;
}
}
if(answer == predict_id) ++correct;
}
std::cout <<"accuracy:" << correct / _N << std::endl;
}
// output model
void output_model(char* filename){
// output model
std::ostringstream oss;
oss << filename << ".model";
std::ofstream ofs;
ofs.open((oss.str()).c_str());
ofs.close();
}
void output_results(char* filename){
// output results
std::ostringstream oss;
oss << filename << ".results";
std::ofstream ofs;
ofs.open((oss.str()).c_str());
ofs.close();
}
private:
// weights
std::unordered_map<std::pair<int, int>, double, myhash, myeq> _w;
// documents
std::vector<std::vector<std::pair<int, double> > > _documents;
// label
std::vector<int> _labels;
// feature dictionary
std::unordered_map<std::string, int> _feature_id;
std::unordered_map<int, std::string> _id_feature;
// labels
std::unordered_map<std::string, int> _label_id;
std::unordered_map<int, std::string> _id_label;
// cross entropy error
std::vector<double> _cee;
double _lim;
// parameters
double _eta;
// size of classes
int _C;
// size of documents
int _N;
// size of features
int _K;
};
int main(int argc, char** argv){
char *filename = argv[1];
MultiLogReg t(0.1);
t.read_document(filename);
t.train();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment