Created
September 14, 2013 15:10
-
-
Save ybenjo/6562777 to your computer and use it in GitHub Desktop.
multiclass logistic regression (bug)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
// A <class_id, feature_id> weight key.
using key = std::pair<int, int>;
// Equality functor for <class_id, feature_id> keys in unordered_map.
// std::binary_function was deprecated in C++11 and removed in C++17,
// so the struct no longer derives from it (the base added nothing needed).
struct myeq{
  bool operator()(const std::pair<int, int>& x, const std::pair<int, int>& y) const{
    return x.first == y.first && x.second == y.second;
  }
};
// Hash functor for <class_id, feature_id> keys: combines the two int
// hashes boost::hash_combine-style with the golden-ratio constant.
// std::unary_function was deprecated in C++11 and removed in C++17,
// so the struct no longer derives from it (the base added nothing needed).
struct myhash{
private:
  const std::hash<int> h_int;
public:
  myhash() : h_int() {}
  size_t operator()(const std::pair<int, int>& p) const{
    size_t seed = h_int(p.first);
    return h_int(p.second) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
  }
};
// Split `s` on any character contained in `delims`.
// Empty tokens are preserved (e.g. "a::b" on ":" -> {"a", "", "b"}) and an
// empty input yields a single empty token, matching the original behavior
// that read_document relies on.
// Fixes: parameters are taken by const reference (no copies), and the scan
// position uses std::string::size_type instead of int, which previously
// held npos only via an accidental -1 conversion.
std::vector<std::string> split_string(const std::string& s, const std::string& delims){
  std::vector<std::string> tokens;
  std::string::size_type start = 0;
  while(start <= s.length()){
    std::string::size_type end = s.find_first_of(delims, start);
    if(end == std::string::npos) end = s.length();
    tokens.push_back(s.substr(start, end - start));
    start = end + 1;
  }
  return tokens;
}
class MultiLogReg{ | |
public: | |
MultiLogReg(){ | |
srand(0); | |
_lim = 0.001; | |
} | |
MultiLogReg(double eta = 0.0001){ | |
_eta = eta; | |
} | |
void set_feature_id(const std::string& feature, int feature_id){ | |
_feature_id[feature] = feature_id; | |
_id_feature[feature_id] = feature; | |
} | |
int get_feature_id(const std::string& feature){ | |
int feature_id = 0; | |
if(_feature_id.find(feature) == _feature_id.end()){ | |
feature_id = _feature_id.size(); | |
set_feature_id(feature, feature_id); | |
}else{ | |
_feature_id[feature] = feature_id; | |
} | |
return feature_id; | |
} | |
void set_label_id(const std::string& label, int label_id){ | |
_label_id[label] = label_id; | |
_id_label[label_id] = label; | |
} | |
int get_label_id(const std::string& label){ | |
int label_id = 0; | |
if(_label_id.find(label) == _label_id.end()){ | |
label_id = _label_id.size(); | |
set_label_id(label, label_id); | |
}else{ | |
label_id = _label_id[label]; | |
} | |
return label_id; | |
} | |
void read_document(char* filename){ | |
std::ifstream ifs; | |
ifs.open(filename, std::ios::in); | |
std::string line; | |
while(getline(ifs, line)){ | |
// file format | |
// label \t w_1:count \t w_2:count ... | |
std::vector<std::string> elem = split_string(line, "\t"); | |
std::string label = elem.at(0); | |
int label_id = get_label_id(label); | |
_labels.push_back(label_id); | |
std::vector<std::pair<int, double> > doc(elem.size() - 1); | |
for(int i = 1; i < elem.size(); ++i){ | |
std::string e = elem.at(i); | |
std::vector<std::string> ary = split_string(line, ":"); | |
std::string feature = ary.at(0); | |
int feature_id = get_feature_id(feature); | |
double count = atof((ary.at(1)).c_str()); | |
doc.at(i - 1) = std::make_pair(feature_id, count); | |
} | |
_documents.push_back(doc); | |
} | |
ifs.close(); | |
_N = _documents.size(); | |
_C = _label_id.size(); | |
_K = _feature_id.size(); | |
} | |
void initialize_w(){ | |
for(int c = 0; c < _C; ++c){ | |
for(int k = 0; k < _K; ++k){ | |
double r = rand() / RAND_MAX; | |
_w[std::make_pair(c, k)] = r; | |
} | |
} | |
} | |
// read w | |
void set_w(char* filename){ | |
} | |
double logsumexp(double x, double y, bool init_flag){ | |
if(init_flag){ | |
return y; | |
} | |
if(x == y){ | |
return x + log(2); | |
} | |
double vmin = std::min(x, y); | |
double vmax = std::max(x, y); | |
if(vmax > vmin + 50){ | |
return vmax; | |
}else{ | |
return vmax + log(exp(vmin - vmax) + 1.0); | |
} | |
} | |
double prod(const std::vector<std::pair<int, double> >& doc, int class_id){ | |
double score = 0.0; | |
// each feature - value pair | |
std::vector<std::pair<int, double> >::const_iterator it; | |
for(it = doc.begin(); it != doc.end(); ++it){ | |
std::pair<int, double> key = *it; | |
int feature_id = key.first; | |
double val = key.second; | |
score += val * _w[std::make_pair(class_id, feature_id)]; | |
} | |
return score; | |
} | |
std::unordered_map<int, double> predict_document(const std::vector<std::pair<int, double> >& doc){ | |
std::unordered_map<int, double> scores; | |
double sum = 0.0; | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
double val = prod(doc, class_id); | |
scores[class_id] = val; | |
sum = logsumexp(sum, val, class_id == 0); | |
} | |
// normalize | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
scores[class_id] = exp(scores[class_id] - sum); | |
} | |
return scores; | |
} | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> gradient_E(){ | |
// key: <class_id, feature_id> | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> grad; | |
std::vector<std::pair<int, double> >::iterator it; | |
for(int doc_pos = 0; doc_pos < _N; ++doc_pos){ | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_pos); | |
std::unordered_map<int, double> scores = predict_document(doc); | |
int label = _labels.at(doc_pos); | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
double t = label == class_id ? 1.0 : 0.0; | |
double y = scores[class_id]; | |
double diff = (y - t); | |
for(it = doc.begin(); it != doc.end(); ++it){ | |
std::pair<int, double> key = *it; | |
int feature_id = key.first; | |
double val = key.second; | |
grad[std::make_pair(class_id, feature_id)] += (diff * val); | |
} | |
} | |
} | |
return grad; | |
} | |
void update_w(){ | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> grad = gradient_E(); | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
for(int feature_id = 0; feature_id < _K; ++feature_id){ | |
std::pair<int, int> key = std::make_pair(class_id, feature_id); | |
// w_new = w_old - eta * grad | |
_w[key] = _w[key] - _eta * grad[key]; | |
} | |
} | |
} | |
// converge? | |
bool converge(){ | |
double val = cross_entropy_error(); | |
_cee.push_back(val); | |
if(_cee.size() < 2) return false; | |
return fabs(_cee.at(_cee.size() - 1) - _cee.at(_cee.size() - 2)) < _lim; | |
} | |
void train(){ | |
initialize_w(); | |
while(!converge()){ | |
update_w(); | |
predict(); | |
} | |
} | |
double cross_entropy_error(){ | |
double ret; | |
for(int doc_id = 0; doc_id < _N; ++doc_id){ | |
int label = _labels.at(doc_id); | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_id); | |
std::unordered_map<int, double> scores = predict_document(doc); | |
for(int class_id = 0; class_id < _C; ++class_id){ | |
double t = label == class_id ? 1.0 : 0.0; | |
double y = scores[class_id]; | |
ret += t * log(y); | |
} | |
} | |
return -ret; | |
} | |
void predict(){ | |
double correct = 0.0; | |
for(int doc_id = 0; doc_id < _N; ++doc_id){ | |
int answer = _labels.at(doc_id); | |
std::vector<std::pair<int, double> > doc = _documents.at(doc_id); | |
std::unordered_map<int, double> scores = predict_document(doc); | |
int predict_id = -1; | |
double max_score = -1.0; | |
std::unordered_map<int, double>::iterator it; | |
for(it = scores.begin(); it != scores.end(); ++it){ | |
int id = (*it).first; | |
double val = (*it).second; | |
if(max_score < val){ | |
predict_id = id; | |
max_score = val; | |
} | |
} | |
if(answer == predict_id) ++correct; | |
} | |
std::cout <<"accuracy:" << correct / _N << std::endl; | |
} | |
// output model | |
void output_model(char* filename){ | |
// output model | |
std::ostringstream oss; | |
oss << filename << ".model"; | |
std::ofstream ofs; | |
ofs.open((oss.str()).c_str()); | |
ofs.close(); | |
} | |
void output_results(char* filename){ | |
// output results | |
std::ostringstream oss; | |
oss << filename << ".results"; | |
std::ofstream ofs; | |
ofs.open((oss.str()).c_str()); | |
ofs.close(); | |
} | |
private: | |
// weights | |
std::unordered_map<std::pair<int, int>, double, myhash, myeq> _w; | |
// documents | |
std::vector<std::vector<std::pair<int, double> > > _documents; | |
// label | |
std::vector<int> _labels; | |
// feature dictionary | |
std::unordered_map<std::string, int> _feature_id; | |
std::unordered_map<int, std::string> _id_feature; | |
// labels | |
std::unordered_map<std::string, int> _label_id; | |
std::unordered_map<int, std::string> _id_label; | |
// cross entropy error | |
std::vector<double> _cee; | |
double _lim; | |
// parameters | |
double _eta; | |
// size of classes | |
int _C; | |
// size of documents | |
int _N; | |
// size of features | |
int _K; | |
}; | |
// Entry point: train a multiclass logistic regression on the given file.
// BUGFIX: argv[1] was dereferenced without checking argc, which is
// undefined behavior when the program is run with no arguments.
int main(int argc, char** argv){
  if(argc < 2){
    std::cerr << "usage: " << argv[0] << " <training_file>" << std::endl;
    return 1;
  }
  MultiLogReg t(0.1);
  t.read_document(argv[1]);
  t.train();
  return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment