Richardn2002/ranksum.cpp

## ranksum.cpp
#include <algorithm>
#include <cmath>
#include <vector>

using std::erfc;
using std::sort;
using std::sqrt;
using std::vector;

double calc(const vector<double> &sample1, const vector<double> &sample2) {
    vector<double> former(sample1);
    vector<double> latter(sample2);
    sort(former.begin(), former.end());
    sort(latter.begin(), latter.end());

    // iterator to the current entry in former
    auto it_former = former.begin();
    // iterator to the current entry in latter
    auto it_latter = latter.begin();

    // continue increasing no matter of ties
    unsigned int cnt = 0;
    double ranksum = 0.0;

    while (it_former != former.end()) {
        double entry_former = *it_former;

        // the previous value, to determine ties
        double entry_prev = NAN;
        // the length of current tie
        double tie_cnt = 0;

        double entry_latter;
        while (it_latter != latter.end() && (entry_latter = *it_latter) <= entry_former) {
            if (entry_latter == entry_prev) {
                ++tie_cnt;
            } else {
                // different from prev, resetting
                tie_cnt = 1;
                entry_prev = entry_latter;
            }
            ++cnt;
            ++it_latter;
        }

        if (entry_prev != entry_former) {
            // tie does not continue into former
            tie_cnt = 1;
        } else {
            ++tie_cnt;
        }
        // how many entries in the current tie are from former
        double tie_cnt_former = 1;
        // consume current entry
        ++cnt;

        // deplete any possible ties
        while ((++it_former) != former.end() && *it_former == entry_former) {
            ++tie_cnt;
            ++tie_cnt_former;
            ++cnt;
        }

        // ranksum += the average of all integers within [cnt - tie_cnt + 1, cnt] * tie_cnt_former
        ranksum += (cnt - tie_cnt + 1 + cnt) * tie_cnt_former / 2.0;
    }

    unsigned int n1 = sample1.size();
    unsigned int n2 = sample2.size();

    // the ranksum of the latter sample is approximated as a norm dist,
    // with parameters (mean and variance) as following:
    double mean = (n1 * (n1 + n2 + 1)) / 2.0;
    double var = (n1 * n2 * (n1 + n2 + 1)) / 12.0;
    double z = (ranksum - mean - 0.5 + (ranksum < mean)) / sqrt(var);

    double p = 1.0 - 0.5 * erfc(-z * sqrt(0.5));

    return p;
}

#include <iostream>

int main() {
    // example samples from my "Probabilistic Methods in Engineering" lecture slides, cr. Prof. Horst Hohberger,
    // University of Michigan - Shanghai Jiao Tong University Joint Institute
    vector<double> small = {18.5, 12.25, 3, 15, 19.75, 11.25, 11.75, 19.25, 12.25, 19.75, 16.25, 13, 19.25, 1.75};
    vector<double> large = {5.5,  5.5,  12.75, 18.75, 19.25, 11.25, 11.5, 11.5,  12.25, 14.25,
                            9.25, 14.5, 13.25, 8.25,  16.75, 10.5,  6,    15.25, 6.5,   12.5,
                            10.5, 8.75, 11.5,  17,    2.75,  13.25, 19,   16.5,  11.5,  1.75};

    std::cout << calc(small, large) << std::endl;

    return 0;
}
	#include <algorithm>
	#include <cmath>
	#include <vector>

	using std::erfc;
	using std::sort;
	using std::sqrt;
	using std::vector;

	double calc(const vector<double> &sample1, const vector<double> &sample2) {
	vector<double> former(sample1);
	vector<double> latter(sample2);
	sort(former.begin(), former.end());
	sort(latter.begin(), latter.end());

	// iterator to the current entry in former
	auto it_former = former.begin();
	// iterator to the current entry in latter
	auto it_latter = latter.begin();

	// continue increasing no matter of ties
	unsigned int cnt = 0;
	double ranksum = 0.0;

	while (it_former != former.end()) {
	double entry_former = *it_former;

	// the previous value, to determine ties
	double entry_prev = NAN;
	// the length of current tie
	double tie_cnt = 0;

	double entry_latter;
	while (it_latter != latter.end() && (entry_latter = *it_latter) <= entry_former) {
	if (entry_latter == entry_prev) {
	++tie_cnt;
	} else {
	// different from prev, resetting
	tie_cnt = 1;
	entry_prev = entry_latter;
	}
	++cnt;
	++it_latter;
	}

	if (entry_prev != entry_former) {
	// tie does not continue into former
	tie_cnt = 1;
	} else {
	++tie_cnt;
	}
	// how many entries in the current tie are from former
	double tie_cnt_former = 1;
	// consume current entry
	++cnt;

	// deplete any possible ties
	while ((++it_former) != former.end() && *it_former == entry_former) {
	++tie_cnt;
	++tie_cnt_former;
	++cnt;
	}

	// ranksum += the average of all integers within [cnt - tie_cnt + 1, cnt] * tie_cnt_former
	ranksum += (cnt - tie_cnt + 1 + cnt) * tie_cnt_former / 2.0;
	}

	unsigned int n1 = sample1.size();
	unsigned int n2 = sample2.size();

	// the ranksum of the latter sample is approximated as a norm dist,
	// with parameters (mean and variance) as following:
	double mean = (n1 * (n1 + n2 + 1)) / 2.0;
	double var = (n1 * n2 * (n1 + n2 + 1)) / 12.0;
	double z = (ranksum - mean - 0.5 + (ranksum < mean)) / sqrt(var);

	double p = 1.0 - 0.5 * erfc(-z * sqrt(0.5));

	return p;
	}

	#include <iostream>

	int main() {
	// example samples from my "Probabilistic Methods in Engineering" lecture slides, cr. Prof. Horst Hohberger,
	// University of Michigan - Shanghai Jiao Tong University Joint Institute
	vector<double> small = {18.5, 12.25, 3, 15, 19.75, 11.25, 11.75, 19.25, 12.25, 19.75, 16.25, 13, 19.25, 1.75};
	vector<double> large = {5.5, 5.5, 12.75, 18.75, 19.25, 11.25, 11.5, 11.5, 12.25, 14.25,
	9.25, 14.5, 13.25, 8.25, 16.75, 10.5, 6, 15.25, 6.5, 12.5,
	10.5, 8.75, 11.5, 17, 2.75, 13.25, 19, 16.5, 11.5, 1.75};

	std::cout << calc(small, large) << std::endl;

	return 0;
	}