Skip to content

Instantly share code, notes, and snippets.

@Richardn2002
Last active August 1, 2023 20:30
Show Gist options
  • Save Richardn2002/6f941d7ecb3d249fdd106e2c6e6aef01 to your computer and use it in GitHub Desktop.
Save Richardn2002/6f941d7ecb3d249fdd106e2c6e6aef01 to your computer and use it in GitHub Desktop.
Wilcoxon Ranksum Test in C++
#include <algorithm>
#include <cmath>
#include <vector>
using std::erfc;
using std::sort;
using std::sqrt;
using std::vector;
double calc(const vector<double> &sample1, const vector<double> &sample2) {
vector<double> former(sample1);
vector<double> latter(sample2);
sort(former.begin(), former.end());
sort(latter.begin(), latter.end());
// iterator to the current entry in former
auto it_former = former.begin();
// iterator to the current entry in latter
auto it_latter = latter.begin();
// continue increasing no matter of ties
unsigned int cnt = 0;
double ranksum = 0.0;
while (it_former != former.end()) {
double entry_former = *it_former;
// the previous value, to determine ties
double entry_prev = NAN;
// the length of current tie
double tie_cnt = 0;
double entry_latter;
while (it_latter != latter.end() && (entry_latter = *it_latter) <= entry_former) {
if (entry_latter == entry_prev) {
++tie_cnt;
} else {
// different from prev, resetting
tie_cnt = 1;
entry_prev = entry_latter;
}
++cnt;
++it_latter;
}
if (entry_prev != entry_former) {
// tie does not continue into former
tie_cnt = 1;
} else {
++tie_cnt;
}
// how many entries in the current tie are from former
double tie_cnt_former = 1;
// consume current entry
++cnt;
// deplete any possible ties
while ((++it_former) != former.end() && *it_former == entry_former) {
++tie_cnt;
++tie_cnt_former;
++cnt;
}
// ranksum += the average of all integers within [cnt - tie_cnt + 1, cnt] * tie_cnt_former
ranksum += (cnt - tie_cnt + 1 + cnt) * tie_cnt_former / 2.0;
}
unsigned int n1 = sample1.size();
unsigned int n2 = sample2.size();
// the ranksum of the latter sample is approximated as a norm dist,
// with parameters (mean and variance) as following:
double mean = (n1 * (n1 + n2 + 1)) / 2.0;
double var = (n1 * n2 * (n1 + n2 + 1)) / 12.0;
double z = (ranksum - mean - 0.5 + (ranksum < mean)) / sqrt(var);
double p = 1.0 - 0.5 * erfc(-z * sqrt(0.5));
return p;
}
#include <iostream>
int main() {
// example samples from my "Probabilistic Methods in Engineering" lecture slides, cr. Prof. Horst Hohberger,
// University of Michigan - Shanghai Jiao Tong University Joint Institute
vector<double> small = {18.5, 12.25, 3, 15, 19.75, 11.25, 11.75, 19.25, 12.25, 19.75, 16.25, 13, 19.25, 1.75};
vector<double> large = {5.5, 5.5, 12.75, 18.75, 19.25, 11.25, 11.5, 11.5, 12.25, 14.25,
9.25, 14.5, 13.25, 8.25, 16.75, 10.5, 6, 15.25, 6.5, 12.5,
10.5, 8.75, 11.5, 17, 2.75, 13.25, 19, 16.5, 11.5, 1.75};
std::cout << calc(small, large) << std::endl;
return 0;
}
@Richardn2002
Copy link
Author

The output from the code is

Calculating the p-value of concluding that the former sample has a larger median than the second sample, with Wilcoxon ranksum test, with tie resolution method being averaging ranks.

If you want a two-sided test, change line 78 to

double p = 1.0 - std::erf(z / std::sqrt(2.0));

@Richardn2002
Copy link
Author

Forgot to apply continuity correction. Corrected.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment