Skip to content

Instantly share code, notes, and snippets.

@dvsseed
Created January 26, 2022 13:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dvsseed/a4dbadada3a26212ad4ad052e9c83f36 to your computer and use it in GitHub Desktop.
Save dvsseed/a4dbadada3a26212ad4ad052e9c83f36 to your computer and use it in GitHub Desktop.
Normalizes the ResNet Flatten-layer values and calculates their conditional entropy.
#include <bits/stdc++.h>
#include <sys/time.h>
#include <iostream>
#include <ctime>
#include <fstream>
#include <vector>
#include <sstream>
#include <map>
#include <algorithm>
#include <cmath>
#include <functional>
#include <string>
#include <array>
#include <iomanip>
// #include "log.h"
using namespace std;
using std::cout;
using std::cerr;
using std::endl;
using std::string;
using std::ifstream;
using std::ostringstream;
using std::istringstream;
using std::array;
using std::find;
using std::distance;
using std::vector;
// Writing a Log file
// Formats the current local time for the logging helpers.
//   s == "now"  -> "%Y-%m-%d %X" (date plus the locale's time representation)
//   s == "date" -> "%Y-%m-%d"
//   any other s -> empty string
// The buffer is zero-initialised so that an unrecognised selector (or a failed
// time conversion) yields "" instead of reading uninitialised memory.
inline std::string getCurrentDateTime( std::string s ){
    char buf[80] = {0};
    const std::time_t now = std::time(nullptr);
    const std::tm* local = std::localtime(&now);
    if (local == nullptr) {
        return std::string();
    }
    const std::tm tstruct = *local;  // copy: localtime() returns shared static storage
    if (s == "now") {
        std::strftime(buf, sizeof(buf), "%Y-%m-%d %X", &tstruct);
    } else if (s == "date") {
        std::strftime(buf, sizeof(buf), "%Y-%m-%d", &tstruct);
    }
    return std::string(buf);
}
// Appends one line "<timestamp>\t<logMsg>\n" to ./log/log_<date>.txt, where
// <date> and <timestamp> come from getCurrentDateTime("date") / ("now").
// The ./log directory is not created here; when the file cannot be opened the
// message is reported on stderr instead of being silently discarded.
inline void Logger( std::string logMsg ){
    const std::string filePath = "./log/log_" + getCurrentDateTime("date") + ".txt";
    const std::string now = getCurrentDateTime("now");
    std::ofstream ofs( filePath.c_str(), std::ios_base::out | std::ios_base::app );
    if (!ofs) {
        std::cerr << "Logger: could not open '" << filePath << "' for appending" << std::endl;
        return;
    }
    ofs << now << '\t' << logMsg << '\n';
    // ofs is flushed and closed by its destructor.
}
// Usage: Logger("This is log message"); Writes a file (or appends existing file)
// Get current date/time, format is YYYYMMDDHHMMSS
// Returns the current local time as a compact stamp built with the
// strftime pattern "%Y%m%d%H%M%S" (used to make output file names unique).
// See http://en.cppreference.com/w/cpp/chrono/c/strftime for the format codes.
const std::string currentDateTime() {
    const std::time_t timestamp = std::time(0);
    const std::tm local_parts = *std::localtime(&timestamp);
    char formatted[80];
    std::strftime(formatted, sizeof(formatted), "%Y%m%d%H%M%S", &local_parts);
    return std::string(formatted);
}
// Write to csv [feature, labels]
// Writes a two-column CSV file: the header line `title`, then one
// "<x>,<label>" line per index.
//   path        - output file; opened with the default mode, so an existing file is overwritten
//   title       - header line written verbatim
//   vecOfXs     - first column values
//   vecOfLabels - second column values
// Only the first min(vecOfXs.size(), vecOfLabels.size()) pairs are written, so a
// shorter label vector can no longer be indexed out of bounds. If the file
// cannot be opened a message is printed to stderr and nothing is written.
void writeToCSV(const std::string& path, const std::string& title,
                std::vector<std::string>& vecOfXs, std::vector<std::string>& vecOfLabels) {
    std::ofstream outFile(path);
    if (!outFile) {
        std::cerr << "Could not open the file - '" << path << "'" << std::endl;
        return;
    }
    const char delimiter = ',';
    outFile << title << '\n'; // header
    const std::size_t rows = std::min(vecOfXs.size(), vecOfLabels.size());
    for (std::size_t i = 0; i < rows; ++i) {
        outFile << vecOfXs[i] << delimiter << vecOfLabels[i] << '\n';
    }
    // outFile is flushed and closed by its destructor.
}
// Write to csv [no, entropy]
// Writes a CSV file consisting of the header line `title` followed by one
// "<row number>,<value>" line per element of vecOfXs (row numbers start at 0).
// The file is opened with the default output mode, i.e. an existing file at
// `path` is overwritten rather than appended to.
void writeToCSV1(const std::string& path, const std::string& title, std::vector<std::string>& vecOfXs) {
    const char separator = ',';
    std::ofstream sink(path);
    sink << title << std::endl; // header
    std::size_t row_no = 0;
    for (const std::string& value : vecOfXs) {
        sink << row_no << separator << value << std::endl;
        ++row_no;
    }
    sink.close();
}
// Print contents of an array
// Streams every element of a built-in array to std::cout, each one followed
// by a single space. No trailing newline is written.
template<typename T, size_t n>
void print_array(T const(& arr)[n]) {
    for (const T& item : arr) {
        std::cout << item << ' ';
    }
}
// Print the contents of vector
// Streams every element of an iterable container to std::cout, each one
// followed by `delimeter` (including the last), then ends the line.
template<typename T>
void print_vector(T & vecOfElements, std::string delimeter = ", ") {
    for (const auto& entry : vecOfElements) {
        std::cout << entry << delimeter;
    }
    std::cout << std::endl;
}
// Reads the entire file at `path` and returns its contents as one string.
// If the file cannot be opened, an error is printed to stderr and the
// process terminates with EXIT_FAILURE.
std::string readFileIntoString(const std::string& path) {
    std::ifstream source(path);
    if (!source.is_open()) {
        std::cerr << "Could not open the file - '" << path << "'" << std::endl;
        std::exit(EXIT_FAILURE);
    }
    // Copy every byte of the underlying stream buffer into the result.
    return std::string(std::istreambuf_iterator<char>(source),
                       std::istreambuf_iterator<char>());
}
// Min-max normalises a column of numeric strings.
//   features - decimal strings, each parsed with std::stof
//   returns  - one string per input, std::to_string((v - min) / (max - min))
// Special cases:
//   * empty input returns an empty vector (previously features[0] was read
//     unconditionally);
//   * when all values are equal (max == min), or a result is NaN, "0.0" is
//     emitted. This is decided numerically instead of comparing the formatted
//     text against "-nan", whose spelling depends on the platform.
// Throws std::invalid_argument / std::out_of_range (from std::stof) when an
// entry is not a parsable float.
std::vector<std::string> normalized(const std::vector<std::string>& features) {
    std::vector<std::string> features_out;
    if (features.empty()) {
        return features_out;
    }

    // Parse every value once and track the extremes while doing so.
    std::vector<float> values;
    values.reserve(features.size());
    for (const std::string& text : features) {
        values.push_back(std::stof(text));
    }
    float smallest_element = values.front();
    float largest_element = values.front();
    for (const float value : values) {
        if (value < smallest_element) {
            smallest_element = value;
        }
        if (value > largest_element) {
            largest_element = value;
        }
    }

    const float range = largest_element - smallest_element;
    features_out.reserve(values.size());
    for (const float value : values) {
        if (range == 0.0f) {  // constant column: avoid 0/0
            features_out.push_back("0.0");
            continue;
        }
        const float scaled = (value - smallest_element) / range;
        if (std::isnan(scaled)) {
            features_out.push_back("0.0");
        } else {
            features_out.push_back(std::to_string(scaled));
        }
    }
    return features_out;
}
/*
* Generic function to find duplicate elements in a vector.
* It adds the duplicated elements and their occurrence counts to the given map countMap.
*/
// Counts how often each element of vecOfElements occurs and leaves in countMap
// only the entries whose count is not 1.
//   vecOfElements - values to count (not modified)
//   countMap      - in/out: counts are added to any entries already present;
//                   afterwards every entry with a count of exactly 1 is erased
// The counting uses the map's own key type, so the template works for any T
// usable as a std::map key (the previous version built a
// std::pair<std::string, int> and therefore only compiled for string-like T).
template <typename T>
void findDuplicates(std::vector<T> & vecOfElements, std::map<T, int> & countMap) {
    // operator[] value-initialises a missing count to 0 before the increment.
    for (const auto & elem : vecOfElements) {
        ++countMap[elem];
    }
    // Remove the elements from the map which have a frequency count of 1
    for (auto it = countMap.begin(); it != countMap.end();) {
        if (it->second == 1) {
            it = countMap.erase(it);
        } else {
            ++it;
        }
    }
}
// Counts how often each element of vecOfElements occurs and stores every
// count in countMap. Unlike findDuplicates, entries that occur only once are
// deliberately kept.
//   vecOfElements - values to count (not modified)
//   countMap      - in/out: counts are added to any entries already present
// Works for any T usable as a std::map key (the previous version built a
// std::pair<std::string, int> and therefore only compiled for string-like T).
template <typename T>
void findDuplicates_1(std::vector<T> & vecOfElements, std::map<T, int> & countMap) {
    // operator[] value-initialises a missing count to 0 before the increment.
    for (const auto & elem : vecOfElements) {
        ++countMap[elem];
    }
    // Intentionally no pruning step: single-occurrence elements stay in the map.
}
// Return index of the element using std::find()
// Finds the first occurrence of `keyword` in `arr` at or after position `elem`.
//   arr     - vector to search (taken by const reference, so it is no longer
//             copied on every call)
//   keyword - value to look for
//   elem    - index at which the search starts
//   returns - index of the match relative to the start of `arr`, or -1 when
//             there is no match or `elem` lies outside [0, arr.size()]
// The iterator type follows T (the previous version declared a
// std::vector<std::string>::iterator and only compiled for T = std::string).
template <typename T>
int searchResult(const std::vector<T>& arr, const T& keyword, int elem) {
    if (elem < 0 || static_cast<std::size_t>(elem) > arr.size()) {
        return -1;  // begin() + elem would be out of range
    }
    const auto it = std::find(arr.begin() + elem, arr.end(), keyword);
    if (it == arr.end()) {
        return -1;
    }
    return static_cast<int>(it - arr.begin());
}
// Entry point.
//
// Reads a CSV file whose first line is a header and whose data rows are laid
// out as [index, sub-index, feature columns..., label]. For every feature
// column (column 2 up to, but not including, the last column) it:
//   1. min-max normalises the column with normalized(),
//   2. groups rows that share the same normalised value (only values that
//      occur more than once are considered),
//   3. accumulates the conditional entropy H(Y|X) = sum p(x) * H(Y|X=x),
//      using the natural logarithm,
// and finally writes one "<n>,<entropy>" line per feature column to
// ./flatten_entropy_<timestamp>.csv. The elapsed wall-clock time is printed
// and passed to Logger().
//
// Usage: program [input.csv]
//   When no argument is given the original hard-coded file name is used.
int main(int argc, char* argv[])
{
    // Start the wall-clock timer (seconds + microseconds since the Epoch).
    struct timeval start, end;
    gettimeofday(&start, nullptr);
    // unsync the I/O of C and C++.
    std::ios_base::sync_with_stdio(false);

    const std::string filename =
        (argc > 1) ? argv[1] : "flatten_labels_20220125214627_GC.csv";
    const char delimiter = ',';

    // readFileIntoString() terminates the process if the file cannot be opened.
    std::istringstream sstream(readFileIntoString(filename));

    // --- header: only used to count the columns ---
    std::string record;
    std::getline(sstream, record);
    int column_count = 0;
    {
        std::istringstream header(record);
        std::string field;
        while (std::getline(header, field, delimiter)) {
            ++column_count;
        }
    }
    const int start_column = 2;               // first feature column
    const int end_column = column_count - 1;  // label column; feature loop stops before it

    // --- data rows ---
    std::vector<std::vector<std::string>> rows;
    std::size_t skipped_rows = 0;
    while (std::getline(sstream, record)) {
        std::istringstream line(record);
        std::vector<std::string> items;
        std::string field;
        while (std::getline(line, field, delimiter)) {
            items.push_back(field);
        }
        // A row without a label column would be indexed out of bounds below.
        if (static_cast<int>(items.size()) <= end_column) {
            ++skipped_rows;
            continue;
        }
        rows.push_back(std::move(items));
    }
    if (skipped_rows > 0) {
        std::cerr << "Skipped " << skipped_rows << " row(s) with fewer than "
                  << column_count << " columns" << std::endl;
    }
    if (rows.empty()) {
        std::cerr << "No usable data rows in '" << filename << "'" << std::endl;
        return EXIT_FAILURE;
    }

    // --- labels: last column with the first ".0" removed (e.g. "3.0" -> "3") ---
    std::vector<std::string> labels;
    if (end_column > start_column) {
        labels.reserve(rows.size());
        for (const auto& row : rows) {
            std::string label = row[end_column];
            const std::string::size_type pos = label.find(".0");
            if (pos != std::string::npos) {  // labels without ".0" are kept as they are
                label.erase(pos, 2);
            }
            labels.push_back(std::move(label));
        }
    }
    const float labels_size = labels.size();  // total number of labelled rows

    std::vector<std::string> result_entropy;  // one conditional entropy per feature column
    for (int x_column = start_column; x_column < end_column; x_column++) {
        // Extract and normalise the current feature column.
        std::vector<std::string> column_values;
        column_values.reserve(rows.size());
        for (const auto& row : rows) {
            column_values.push_back(row[x_column]);
        }
        std::vector<std::string> scaled = normalized(column_values);

        // Normalised values that occur more than once, with their counts.
        std::map<std::string, int> duplicateElements;
        findDuplicates(scaled, duplicateElements);

        // Conditional entropy -> [features, labels]
        // H(Y|X) = sum over x of p(x) * H(Y|X=x)
        float h_y_x0 = 0.0f;
        for (const auto& elem : duplicateElements) {
            if (elem.second <= 1) {
                continue;
            }
            // Collect the label of every row holding this feature value.
            std::vector<std::string> vecOfLabels;
            int index_value = 0;
            for (int i = 0; i < elem.second; i++) {
                const int result_value = searchResult(scaled, elem.first, index_value);
                if (result_value < 0) {
                    break;  // not expected: the count says another occurrence exists
                }
                vecOfLabels.push_back(labels[result_value]);
                index_value = result_value + 1;  // continue after this match
            }
            // Frequency of each label inside the group (single labels included).
            std::map<std::string, int> duplicateLabels;
            findDuplicates_1(vecOfLabels, duplicateLabels);

            // p(x) = group size / number of rows
            const float p_x0 = elem.second / labels_size;
            // H(Y|X=x) = -sum over y of p(y|x) * log(p(y|x))
            float h_y_x0s = 0.0f;
            for (const auto& elem_lbl : duplicateLabels) {
                const float p_y_x0 = static_cast<float>(elem_lbl.second)
                                     / static_cast<float>(elem.second);
                h_y_x0s = h_y_x0s - (p_y_x0 * std::log(p_y_x0));  // log(base e)
            }
            h_y_x0 = h_y_x0 + p_x0 * h_y_x0s;
            // NOTE(review): behaviour kept from the original ("this value is wrong"):
            // the running total is reset to 0 as soon as it reaches 1.0. With the
            // natural log and more than two classes a value >= 1 looks legitimate -
            // confirm whether this reset is really intended.
            if (h_y_x0 >= 1.0) {
                h_y_x0 = 0.0f;
            }
        }

        std::cout << "H(Y|X0) = " << h_y_x0 << std::endl;
        result_entropy.push_back(std::to_string(h_y_x0));  // the final result (conditional entropy)

        std::cout << "Finish fearures' column: " << x_column << ", Percentage: " << std::setprecision(2)
                  << std::fixed << (x_column / static_cast<float>(end_column) * 100.0) << " %" << std::endl;
    }

    // write to csv
    writeToCSV1("./flatten_entropy_" + currentDateTime() + ".csv", "x,entropy", result_entropy);

    // stop timer.
    gettimeofday(&end, nullptr);
    // Calculating total time taken by the program (in seconds).
    double time_taken = (end.tv_sec - start.tv_sec) * 1e6;
    time_taken = (time_taken + (end.tv_usec - start.tv_usec)) * 1e-6;
    std::cout << "Time taken by program is: " << std::fixed
              << std::setprecision(6) << time_taken << " seconds." << std::endl;
    Logger(filename + ": time taken by program is: " + std::to_string(time_taken) + " seconds.");
    return EXIT_SUCCESS;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment