Created
October 6, 2019 08:40
-
-
Save lp6m/29449f7690a76d331c41ab9248b183a9 to your computer and use it in GitHub Desktop.
bdeu_score.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*example dataset | |
4 3 | |
A B C D | |
A B | |
B C | |
D C | |
10 | |
1 1 0 1 | |
0 2 1 1 | |
2 1 0 0 | |
1 1 0 2 | |
1 1 0 1 | |
0 0 1 2 | |
0 2 1 1 | |
1 0 1 1 | |
1 1 2 1 | |
1 2 0 1 | |
*/ | |
#include <bits/stdc++.h> | |
using namespace std; | |
#define FOR(i,a,b) for(int i=(int)(a);i<(int)(b);i++) | |
#define REP(i,n) for(int i=0;i<(int)(n);i++) | |
void showVI(vector<int> data){ | |
cout << "["; | |
REP(i, data.size()){ | |
cout << data[i]; | |
if(i != data.size() - 1) cout << ", "; | |
} | |
cout << "]"; | |
} | |
void showVVI(vector<vector<int>> data){ | |
cout << "["; | |
REP(i, data.size()){ | |
cout << "["; | |
REP(j, data[i].size()){ | |
cout << data[i][j]; | |
if(j != data[i].size() - 1) cout << ", "; | |
} | |
cout << "]"; | |
if(i != data.size() - 1) cout << ", "; | |
} | |
cout << "]" << endl; | |
} | |
class Node{ | |
public: | |
string name; | |
vector<Node> parents; | |
Node(); | |
Node(string); | |
void add_parents(Node); | |
}; | |
Node::Node(){} | |
Node::Node(string m_name): name(m_name){ | |
} | |
void Node::add_parents(Node node){ | |
this->parents.push_back(node); | |
} | |
class BayesianNetwork{ | |
public: | |
vector<string> column_names; | |
BayesianNetwork(); | |
BayesianNetwork(vector<pair<string, string>>); | |
void add_edge(string, string); | |
void set_graph(vector<pair<string, string>> arrowList); | |
void show_nodes(); | |
vector<string> get_node_names(); | |
vector<Node> get_nodes(); | |
vector<Node> get_parent_nodes(string); | |
vector<Node> get_parent_nodes(Node); | |
void save_as_dotscript(string); | |
private: | |
map<string, Node> node_dic; | |
void check_graph_structure(); | |
string to_dot_script(); | |
}; | |
BayesianNetwork::BayesianNetwork(){ | |
} | |
BayesianNetwork::BayesianNetwork(vector<pair<string, string>> arrowList){ | |
set_graph(arrowList); | |
} | |
void BayesianNetwork::set_graph(vector<pair<string, string>> edgeList){ | |
for(auto edge :edgeList){ | |
this->add_edge(edge.first, edge.second); | |
} | |
} | |
void BayesianNetwork::add_edge(string from_name, string to_name){ | |
if(this->node_dic.count(from_name) == 0) this->node_dic[from_name] = Node(from_name); | |
if(this->node_dic.count(to_name) == 0) this->node_dic[to_name] = Node(to_name); | |
this->node_dic[to_name].add_parents(this->node_dic[from_name]); | |
} | |
void BayesianNetwork::check_graph_structure(){ | |
//TODO | |
} | |
void BayesianNetwork::show_nodes(){ | |
for(auto itr = this->node_dic.begin(); itr != this->node_dic.end(); itr++){ | |
cout << itr->first << ", "; | |
} | |
cout << endl; | |
} | |
vector<string> BayesianNetwork::get_node_names(){ | |
vector<string> names; | |
for(auto itr = this->node_dic.begin(); itr != this->node_dic.end(); itr++){ | |
names.push_back(itr->first); | |
} | |
return names; | |
} | |
vector<Node> BayesianNetwork::get_nodes(){ | |
vector<Node> nodes; | |
for(auto itr = this->node_dic.begin(); itr != this->node_dic.end(); itr++){ | |
nodes.push_back(itr->second); | |
} | |
return nodes; | |
} | |
vector<Node> BayesianNetwork::get_parent_nodes(Node child_node){ | |
return child_node.parents; | |
} | |
vector<Node> BayesianNetwork::get_parent_nodes(string child_name){ | |
if(this->node_dic.count(child_name) == 0) throw "Exception: network does not have specified child node: " + child_name; | |
Node node = this->node_dic[child_name]; | |
return BayesianNetwork::get_parent_nodes(node); | |
} | |
string BayesianNetwork::to_dot_script(){ | |
//convert network structure to DOT script for graphviz | |
string s = "digraph{\n"; | |
for(auto itr = this->node_dic.begin(); itr != this->node_dic.end(); itr++){ | |
Node node = itr->second; | |
for(Node parent: node.parents){ | |
s += " " + parent.name + " -> " + node.name + "\n"; | |
} | |
} | |
s += "}\n"; | |
return s; | |
} | |
void BayesianNetwork::save_as_dotscript(string filename){ | |
ofstream outputfile(filename); | |
outputfile << this->to_dot_script(); | |
outputfile.close(); | |
} | |
class Column{ | |
public: | |
Column(string); | |
string name; | |
int get_kind_num(); | |
void update_kind_num(int); | |
vector<int> get_kinds(); | |
private: | |
set<int> value_kind_set; //number of kind of data | |
}; | |
Column::Column(string m_name): name(m_name){ | |
} | |
vector<int> Column::get_kinds(){ | |
return vector<int>(this->value_kind_set.begin(), this->value_kind_set.end()); | |
} | |
int Column::get_kind_num(){ | |
return this->value_kind_set.size(); | |
} | |
void Column::update_kind_num(int val){ | |
this->value_kind_set.insert(val); | |
} | |
class DataSet{ | |
public: | |
vector<vector<int>> data; | |
void set_data(vector<string>, vector<vector<int>>); | |
void set_data_fromfile(string); | |
int get_column_index_from_name(string); | |
vector<Column> columns; | |
private: | |
void add_data(vector<int>); | |
void set_column(vector<string>); | |
map<string, int> column_index_of_name; | |
}; | |
void DataSet::set_column(vector<string> columns){ | |
int i = 0; | |
for(string name: columns){ | |
this->columns.push_back(Column(name)); | |
this->column_index_of_name[name] = i++; | |
} | |
} | |
int DataSet::get_column_index_from_name(string name){ | |
return this->column_index_of_name[name]; | |
} | |
void DataSet::add_data(vector<int> oneline){ | |
data.push_back(oneline); | |
REP(i, oneline.size()){ | |
int val = oneline[i]; | |
this->columns[i].update_kind_num(val); | |
} | |
} | |
void DataSet::set_data(vector<string> column_names, vector<vector<int>> data){ | |
this->set_column(column_names); | |
for(vector<int> oneline : data){ | |
this->add_data(oneline); | |
} | |
} | |
void DataSet::set_data_fromfile(string filename){ | |
//TODO | |
} | |
class BdeuScore{ | |
public: | |
BdeuScore(BayesianNetwork, DataSet); | |
double get_score(); | |
private: | |
BayesianNetwork model; | |
DataSet dataset; | |
double get_local_score(Node variable, vector<Node> parents); | |
struct StateCount{ | |
int parents_candidates_num; | |
int child_candidates_num; | |
map<vector<int>, pair<int, map<int, int>>> countmap; | |
StateCount(int pnum, int cnum, map<vector<int>, pair<int, map<int, int>>> countmap){ | |
this->parents_candidates_num = pnum; | |
this->child_candidates_num = cnum; | |
this->countmap = countmap; | |
} | |
}; | |
BdeuScore::StateCount get_state_counts(Node variable, vector<Node> parents); | |
}; | |
BdeuScore::BdeuScore(BayesianNetwork _model, DataSet _dataset): model(_model), dataset(_dataset){ | |
} | |
double BdeuScore::get_score(){ | |
double score = 0; | |
auto nodes = this->model.get_nodes(); | |
for(Node node: nodes){ | |
score += this->get_local_score(node, node.parents); | |
} | |
return score; | |
} | |
double BdeuScore::get_local_score(Node variable, vector<Node> parents){ | |
const double ess = 10.0;//equivalent_sample_size default value refers to pgmpy implementation | |
double local_score = 0; | |
StateCount statecount = this->get_state_counts(variable, parents); | |
double nik_prime = (double)ess/(double)statecount.parents_candidates_num; | |
double nijk_prime = nik_prime/(double)statecount.child_candidates_num; | |
//does not iterate all combinations. only iterate in existing combinations | |
for(auto pitr = statecount.countmap.begin(); pitr != statecount.countmap.end(); pitr++){ | |
double nik = (double)(pitr->second.first); | |
auto childmap = pitr->second.second; | |
if(nik > 0) local_score += lgamma(nik_prime) - lgamma(nik + nik_prime); | |
for(auto citr = childmap.begin(); citr != childmap.end(); citr++){ | |
double nijk = (double)(citr->second); | |
if(nijk > 0) local_score += lgamma(nijk_prime + nijk) - lgamma(nijk_prime); | |
} | |
} | |
return local_score; | |
} | |
BdeuScore::StateCount BdeuScore::get_state_counts(Node variable, vector<Node> parents){ | |
DataSet dataset = this->dataset; | |
if(parents.size() > 0){ | |
int parents_candidates_num = 1; | |
vector<int> parent_indices; | |
for(Node parent_node: parents){ | |
string parent_name = parent_node.name; | |
int column_index = dataset.get_column_index_from_name(parent_name); | |
parent_indices.push_back(column_index); | |
parents_candidates_num *= dataset.columns[column_index].get_kind_num(); | |
} | |
int child_column_index = dataset.get_column_index_from_name(variable.name); | |
int child_candidates_num = dataset.columns[child_column_index].get_kind_num(); | |
map<vector<int>, pair<int, map<int, int>>> family_states_countmap; | |
for(vector<int> oneline: dataset.data){ | |
vector<int> ar; | |
for(int index: parent_indices){ | |
ar.push_back(oneline[index]); | |
} | |
if(family_states_countmap.count(ar) == 0){ | |
map<int,int> m; | |
family_states_countmap[ar] = make_pair(0, m); | |
} | |
family_states_countmap[ar].first++; | |
int child_val = oneline[child_column_index]; | |
if(family_states_countmap[ar].second.count(child_val) == 0){ | |
family_states_countmap[ar].second[child_val] = 0; | |
} | |
family_states_countmap[ar].second[child_val]++; | |
} | |
return BdeuScore::StateCount(parents_candidates_num, child_candidates_num, family_states_countmap); | |
}else{ | |
//leaf = no parent | |
int child_column_index = dataset.get_column_index_from_name(variable.name); | |
int child_candidates_num = dataset.columns[child_column_index].get_kind_num(); | |
map<int,int> child_countmap; | |
for(vector<int> oneline: dataset.data){ | |
int val = oneline[child_column_index]; | |
if(child_countmap.count(val) == 0) child_countmap[val] = 0; | |
child_countmap[val]++; | |
} | |
vector<int> dummy_parent = vector<int>(1); | |
map<vector<int>, pair<int, map<int, int>>> states_countmap; | |
states_countmap[dummy_parent] = make_pair(dataset.data.size(), child_countmap); | |
return BdeuScore::StateCount(1, child_candidates_num, states_countmap); | |
} | |
} | |
int main(){ | |
BayesianNetwork bn; | |
vector<string> node_ames; | |
int node_num, edge_num; | |
cin >> node_num >> edge_num; | |
vector<string> column_names; | |
REP(i, node_num){ | |
string column_name; | |
cin >> column_name; | |
column_names.push_back(column_name); | |
} | |
REP(i, edge_num){ | |
string from, to; | |
cin >> from >> to; | |
bn.add_edge(from, to); | |
} | |
bn.show_nodes(); | |
bn.save_as_dotscript("graph.dot"); | |
int data_num; | |
cin >> data_num; | |
vector<vector<int>> dataarr; | |
REP(i, data_num){ | |
vector<int> oneline; | |
REP(j, column_names.size()){ | |
int val; | |
cin >> val; | |
oneline.push_back(val); | |
} | |
dataarr.push_back(oneline); | |
} | |
DataSet dataset; | |
dataset.set_data(column_names, dataarr); | |
REP(i, column_names.size()){ | |
cout << dataset.columns[i].name << " : "; | |
cout << dataset.columns[i].get_kind_num() << endl; | |
} | |
BdeuScore bdeuscore(bn, dataset); | |
double score = bdeuscore.get_score(); | |
cout << score << endl; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment