dvsseed/calc_flatten_entropy.cpp

## calc_flatten_entropy.cpp
#include <bits/stdc++.h>
#include <sys/time.h>
#include <iostream>
#include <ctime>
#include <fstream>
#include <vector>
#include <sstream>
#include <map>
#include <algorithm>
#include <cmath>
#include <functional>
#include <string>
#include <array>
#include <iomanip>
// #include "log.h"


using namespace std;

using std::cout;
using std::cerr;
using std::endl;
using std::string;
using std::ifstream;
using std::ostringstream;
using std::istringstream;
using std::array;
using std::find;
using std::distance;
using std::vector;


// Writing a Log file
/**
 * Formats the current local time for use in log lines and log file names.
 *
 * @param s  Selects the format:
 *           "now"  -> strftime pattern "%Y-%m-%d %X" (date plus locale time),
 *           "date" -> strftime pattern "%Y-%m-%d".
 * @return The formatted timestamp. An empty string is returned when `s` is any
 *         other value or when the local time cannot be obtained, instead of
 *         reading an uninitialized buffer.
 */
inline std::string getCurrentDateTime( std::string s ){
    const char* format = nullptr;
    if (s == "now") {
        format = "%Y-%m-%d %X";
    } else if (s == "date") {
        format = "%Y-%m-%d";
    } else {
        return std::string();  // unknown selector: nothing sensible to format
    }

    const std::time_t now = std::time(nullptr);
    const std::tm* local = std::localtime(&now);
    if (local == nullptr) {
        return std::string();
    }

    char buf[80] = {0};  // zero-filled so a failed strftime still yields ""
    std::strftime(buf, sizeof(buf), format, local);
    return std::string(buf);
}


/**
 * Appends one line "<timestamp>\t<logMsg>\n" to ./log/log_<YYYY-MM-DD>.txt.
 *
 * The file is opened in append mode, so an existing log for the same day is
 * extended. The ./log directory is not created here; when the file cannot be
 * opened the message is reported on stderr instead of being lost silently.
 *
 * Usage: Logger("This is log message");
 *
 * @param logMsg  Text to record (written verbatim).
 */
inline void Logger( std::string logMsg ){
    const std::string filePath = "./log/log_" + getCurrentDateTime("date") + ".txt";
    const std::string now = getCurrentDateTime("now");

    std::ofstream ofs( filePath.c_str(), std::ios_base::out | std::ios_base::app );
    if (!ofs) {
        std::cerr << "Logger: could not open '" << filePath
                  << "'; message not written: " << logMsg << '\n';
        return;
    }
    ofs << now << '\t' << logMsg << '\n';
    // ofs is flushed and closed by its destructor.
}


// Returns the current local date/time as a compact stamp in the form
// YYYYMMDDHHMMSS (strftime pattern "%Y%m%d%H%M%S").
// Format codes: http://en.cppreference.com/w/cpp/chrono/c/strftime
const std::string currentDateTime() {
    char stamp[80];
    const std::time_t seconds_now = std::time(0);
    const std::tm local_time = *std::localtime(&seconds_now);

    std::strftime(stamp, sizeof(stamp), "%Y%m%d%H%M%S", &local_time);
    return std::string(stamp);
}


// Write to csv [feature, labels]
/**
 * Writes a two-column CSV file: `title` as the header line, then one
 * "<feature>,<label>" line per row.
 *
 * Only the rows present in BOTH vectors are written, so a shorter label vector
 * no longer causes an out-of-bounds read. If the file cannot be opened an error
 * is printed to stderr and nothing is written.
 *
 * @param path         Output file; created or truncated.
 * @param title        Header line, written verbatim.
 * @param vecOfXs      First-column values.
 * @param vecOfLabels  Second-column values.
 */
void writeToCSV(const std::string& path, const std::string& title,
    std::vector<std::string>& vecOfXs, std::vector<std::string>& vecOfLabels) {
    const char delimiter = ',';

    std::ofstream outFile(path);
    if (!outFile) {
        std::cerr << "Could not open the file - '" << path << "'" << std::endl;
        return;
    }

    outFile << title << '\n';  // header
    const std::size_t rows = std::min(vecOfXs.size(), vecOfLabels.size());
    for (std::size_t i = 0; i < rows; ++i) {
        // '\n' instead of std::endl: no flush per row, same file contents.
        outFile << vecOfXs[i] << delimiter << vecOfLabels[i] << '\n';
    }
    // outFile is flushed and closed by its destructor.
}


// Write to csv [no, entropy]
// Opens `path` with the default output mode, writes `title` as the header
// line, then one "<zero-based position>,<value>" line for each element of
// vecOfXs, in order.
void writeToCSV1(const std::string& path, const std::string& title, std::vector<std::string>& vecOfXs) {
    const char separator = ',';
    std::ofstream csv(path);

    csv << title << std::endl;  // header

    int position = 0;
    for (const std::string& value : vecOfXs) {
        csv << position << separator << value << std::endl;
        ++position;
    }
    csv.close();
}


// Print contents of an array
// Streams every element of a built-in array to std::cout, each one followed
// by a single space. No trailing newline is written.
template<typename T, std::size_t n>
void print_array(T const(& arr)[n]) {
    for (T const& item : arr) {
        std::cout << item << ' ';
    }
}


// Print the contents of vector
/**
 * Streams every element of an iterable container to std::cout, each one
 * followed by `delimeter`, then ends the line (std::endl, which also flushes).
 *
 * @param vecOfElements  Any range usable in a range-for whose elements can be
 *                       streamed with operator<<.
 * @param delimeter      Text written after every element, including the last.
 */
template<typename T>
void print_vector(T & vecOfElements, std::string delimeter = ", ") {
    // Bind by const reference: the by-value loop variable copied each element
    // (e.g. a whole std::string) just to print it.
    for (const auto& elem : vecOfElements)
        std::cout << elem << delimeter;
    std::cout << std::endl;
}


// Reads the whole file at `path` and returns its contents as one string.
// When the file cannot be opened, an error is printed to stderr and the
// process terminates with EXIT_FAILURE.
std::string readFileIntoString(const std::string& path) {
    std::ifstream source(path);
    if (source.is_open()) {
        std::ostringstream collected;
        collected << source.rdbuf();  // copy the entire stream buffer in one go
        return collected.str();
    }

    std::cerr << "Could not open the file - '" << path << "'" << std::endl;
    exit(EXIT_FAILURE);
}


/**
 * Min-max scales a column of numeric strings to [0, 1].
 *
 * Each entry is parsed with std::stof, scaled as (x - min) / (max - min) in
 * single precision, and converted back with std::to_string (six decimals).
 *
 * @param features  Numeric strings; may be empty.
 * @return One output string per input, in the same order. Empty input yields
 *         an empty vector. Any value whose scaled result is NaN (notably every
 *         value of a constant column, where max - min is 0) is emitted as
 *         "0.0".
 * @throws std::invalid_argument / std::out_of_range from std::stof when an
 *         entry is not a parsable float.
 */
std::vector<std::string> normalized(std::vector<std::string> features) {
    std::vector<std::string> features_out;
    if (features.empty()) {
        return features_out;  // nothing to scale; avoids indexing features[0]
    }

    // Parse once instead of once per pass.
    std::vector<float> values;
    values.reserve(features.size());
    for (const std::string& text : features) {
        values.push_back(std::stof(text));
    }

    float smallest_element = values.front();
    float largest_element = values.front();
    for (const float value : values) {
        if (value < smallest_element) {
            smallest_element = value;
        }
        if (value > largest_element) {
            largest_element = value;
        }
    }

    const float range = largest_element - smallest_element;
    features_out.reserve(values.size());
    for (const float value : values) {
        const float scaled = (value - smallest_element) / range;
        // 0/0 gives NaN; its printed form ("nan" vs "-nan") depends on the
        // platform, so test the number rather than the text.
        if (std::isnan(scaled)) {
            features_out.push_back("0.0");
        } else {
            features_out.push_back(std::to_string(scaled));
        }
    }
    return features_out;
}


/*
 * Generic function to find duplicate elements in a vector.
 *
 * Adds the occurrences of every element of vecOfElements to countMap (counts
 * already present in countMap are incremented, not reset), then removes every
 * entry whose count is exactly 1, so only duplicated elements remain.
 *
 * Works for any key type T usable in std::map; the earlier version built a
 * std::pair<std::string, int> and therefore only compiled for strings.
 */
template <typename T>
void findDuplicates(std::vector<T> & vecOfElements, std::map<T, int> & countMap) {
    // operator[] value-initialises a missing count to 0 before the increment.
    for (const auto & elem : vecOfElements) {
        ++countMap[elem];
    }
    // Remove the elements from the map which have a frequency count of 1
    for (auto it = countMap.begin(); it != countMap.end();) {
        if (it->second == 1)
            it = countMap.erase(it);
        else
            ++it;
    }
}


/*
 * Frequency count: adds the occurrences of every element of vecOfElements to
 * countMap. Unlike findDuplicates, entries with a count of 1 are kept.
 * Counts already present in countMap are incremented, not reset.
 *
 * Works for any key type T usable in std::map; the earlier version built a
 * std::pair<std::string, int> and therefore only compiled for strings.
 */
template <typename T>
void findDuplicates_1(std::vector<T> & vecOfElements, std::map<T, int> & countMap) {
    // operator[] value-initialises a missing count to 0 before the increment.
    for (const auto & elem : vecOfElements) {
        ++countMap[elem];
    }
}


/**
 * Returns the index of the first element equal to `keyword`, searching from
 * position `elem` to the end of `arr` (std::find).
 *
 * @param arr      Vector to search; taken by const reference so repeated
 *                 lookups no longer copy the whole vector.
 * @param keyword  Value to look for.
 * @param elem     Zero-based position to start from.
 * @return The index relative to the start of `arr`, or -1 when no match
 *         exists or `elem` lies outside [0, arr.size()].
 */
template <typename T>
int searchResult(const std::vector<T>& arr, T keyword, int elem) {
    // An out-of-range start would form an invalid iterator (undefined behaviour).
    if (elem < 0 || static_cast<std::size_t>(elem) > arr.size())
        return -1;

    const auto it = std::find(arr.begin() + elem, arr.end(), keyword);
    if (it == arr.end())
        return -1;
    return static_cast<int>(it - arr.begin());
}


namespace {

// Splits one CSV record on `delimiter`. Quoting/escaping is not interpreted
// and, following std::getline semantics, a trailing empty field is not
// produced.
std::vector<std::string> splitCsvRecord(const std::string& record, char delimiter) {
    std::vector<std::string> fields;
    std::istringstream line(record);
    std::string field;
    while (std::getline(line, field, delimiter)) {
        fields.push_back(field);
    }
    return fields;
}

// Drops a trailing ".0" so that a label written as a float ("3.0") becomes
// "3". Labels without that suffix are returned unchanged (the previous
// find/replace threw std::out_of_range for them).
std::string stripTrailingDotZero(std::string label) {
    const std::string suffix(".0");
    if (label.size() >= suffix.size() &&
        label.compare(label.size() - suffix.size(), suffix.size(), suffix) == 0) {
        label.erase(label.size() - suffix.size());
    }
    return label;
}

// Conditional-entropy estimate H(Y|X) for one feature column.
//
// `values` are the (normalized) feature strings and `labels` the class labels
// of the same rows. Only feature values shared by at least two rows
// contribute:
//   H(Y|X) = sum over repeated x of  p(x) * H(Y | X = x)
//   p(x)         = rows with that value / total rows
//   H(Y | X = x) = -sum over labels y of p(y|x) * log(p(y|x))   (natural log)
// All arithmetic is single precision and the sums are accumulated in sorted
// key order (std::map), matching the order used before this was a helper.
float conditionalEntropy(const std::vector<std::string>& values,
                         const std::vector<std::string>& labels) {
    // value -> (label -> number of rows), built in a single pass.
    std::map<std::string, std::map<std::string, int>> labelCountsByValue;
    const std::size_t rows = std::min(values.size(), labels.size());
    for (std::size_t row = 0; row < rows; ++row) {
        ++labelCountsByValue[values[row]][labels[row]];
    }

    const float totalRows = static_cast<float>(labels.size());
    float entropy = 0.0f;
    for (const auto& valueEntry : labelCountsByValue) {
        int occurrences = 0;
        for (const auto& labelEntry : valueEntry.second) {
            occurrences += labelEntry.second;
        }
        if (occurrences < 2) {
            continue;  // values seen once are ignored
        }

        const float pValue = occurrences / totalRows;
        float entropyGivenValue = 0.0f;
        for (const auto& labelEntry : valueEntry.second) {
            const float pLabel = static_cast<float>(labelEntry.second)
                               / static_cast<float>(occurrences);
            entropyGivenValue = entropyGivenValue - (pLabel * std::log(pLabel));  // log(base e)
        }

        entropy = entropy + pValue * entropyGivenValue;

        // Behaviour kept as-is: a running total >= 1.0 is treated as invalid
        // and restarted from zero.
        // NOTE(review): natural-log entropy over three or more labels can
        // exceed 1 legitimately - confirm this reset is intended.
        if (entropy >= 1.0) {
            entropy = 0.0f;
        }
    }
    return entropy;
}

}  // namespace


/**
 * Reads the input CSV, computes the conditional entropy H(Y|X) of the label
 * column (last header column) given each feature column (column 2 up to, but
 * not including, the label column), and writes the results to
 * ./flatten_entropy_<YYYYMMDDHHMMSS>.csv as "x,entropy". The total run time is
 * printed and appended to the daily log via Logger.
 *
 * Exits with EXIT_FAILURE (message on stderr) when the header has no feature
 * column, a data row has fewer fields than the header, the file has no data
 * rows, or a feature value is not numeric.
 */
int main()
{
    // Wall-clock timing for the whole run (seconds + microseconds).
    struct timeval started, finished;
    gettimeofday(&started, nullptr);

    // unsync the I/O of C and C++.
    std::ios_base::sync_with_stdio(false);

    const std::string filename("flatten_labels_20220125214627_GC.csv");  // <<<=== must modify here
    const char delimiter = ',';
    const int start_column = 2;  // first feature column; columns 0 and 1 are not features

    // The temporary returned by readFileIntoString is released as soon as the
    // stream has taken its copy.
    std::istringstream sstream(readFileIntoString(filename));
    std::string record;

    // Header: only used to learn the number of columns.
    std::getline(sstream, record);
    if (!record.empty() && record.back() == '\r') {
        record.pop_back();  // tolerate CRLF line endings
    }
    const int header_columns = static_cast<int>(splitCsvRecord(record, delimiter).size());
    const int end_column = header_columns - 1;  // label column
    if (end_column <= start_column) {
        std::cerr << "Header of '" << filename << "' has " << header_columns
                  << " column(s); expected at least " << (start_column + 2)
                  << " (two leading columns, one feature, one label)." << std::endl;
        return EXIT_FAILURE;
    }

    // Data rows. Labels are taken from the last header column of every row.
    std::vector<std::vector<std::string>> rows;
    std::vector<std::string> labels;
    std::size_t line_number = 1;  // the header was line 1
    while (std::getline(sstream, record)) {
        ++line_number;
        if (!record.empty() && record.back() == '\r') {
            record.pop_back();
        }
        if (record.empty()) {
            continue;  // ignore blank lines
        }

        std::vector<std::string> fields = splitCsvRecord(record, delimiter);
        if (static_cast<int>(fields.size()) < header_columns) {
            std::cerr << "Malformed row at line " << line_number << " of '" << filename
                      << "': expected at least " << header_columns << " fields, found "
                      << fields.size() << "." << std::endl;
            return EXIT_FAILURE;
        }
        labels.push_back(stripTrailingDotZero(fields[end_column]));  // "3.0" -> "3"
        rows.push_back(std::move(fields));
    }
    if (rows.empty()) {
        std::cerr << "No data rows found in '" << filename << "'." << std::endl;
        return EXIT_FAILURE;
    }

    // One conditional-entropy value per feature column.
    std::vector<std::string> result_entropy;
    result_entropy.reserve(static_cast<std::size_t>(end_column - start_column));
    for (int x_column = start_column; x_column < end_column; x_column++) {
        std::vector<std::string> column;
        column.reserve(rows.size());
        for (const auto& row : rows) {
            column.push_back(row[x_column]);
        }

        std::vector<std::string> scaled;
        try {
            scaled = normalized(std::move(column));  // min-max scaled, as strings
        } catch (const std::exception& error) {
            std::cerr << "Column " << x_column << " of '" << filename
                      << "' could not be normalized: " << error.what() << std::endl;
            return EXIT_FAILURE;
        }

        const float h_y_x0 = conditionalEntropy(scaled, labels);

        // Explicit format on every line: stream flags are sticky, and the
        // two-decimal setting below used to truncate all later entropy lines.
        std::cout << "H(Y|X0) = " << std::fixed << std::setprecision(6) << h_y_x0 << std::endl;
        result_entropy.push_back(std::to_string(h_y_x0));  // the final result (conditional entropy)

        std::cout << "Finish fearures' column: " << x_column << ", Percentage: " << std::setprecision(2)
                  << std::fixed << (x_column / static_cast<float>(end_column) * 100.0) << " %" << std::endl;
    }

    // write to csv
    writeToCSV1("./flatten_entropy_" + currentDateTime() + ".csv", "x,entropy", result_entropy);

    // stop timer.
    gettimeofday(&finished, nullptr);

    // Total time taken by the program, in seconds.
    double time_taken = (finished.tv_sec - started.tv_sec) * 1e6;
    time_taken = (time_taken + (finished.tv_usec - started.tv_usec)) * 1e-6;

    std::cout << "Time taken by program is: " << std::fixed
              << std::setprecision(6) << time_taken << " seconds." << std::endl;
    Logger(filename + ": time taken by program is: " + std::to_string(time_taken) + " seconds.");

    return EXIT_SUCCESS;
}