adrianolszewski/quantreg_mww.r

## quantreg_mww.r
# Let's make some data to play with

set.seed(1234)
v1 <- rexp(500)
v2 <- rnorm(500) + log(2)
v3 <- -rgamma(500, 2.5, 3)
v4 <- runif(500, -2,4)

# Look at the data
layout(matrix(c(1:4), nrow = 2))
hist(v1, xlim=c(-3, 5))
abline(v = median(v1), col="red", lwd=2)
hist(v2, xlim=c(-3, 5));
abline(v = median(v2), col="red", lwd=2)
hist(v3, xlim=c(-3, 5));
abline(v = median(v3), col="red", lwd=2)
hist(v4, xlim=c(-3, 5));
abline(v = median(v4), col="red", lwd=2)

# I will start from the quantile regression. Actually it doesn't matter where we will start.

# For a 2-level categorical predictor variables it's equivalent to comapre 2 medians (test for their differences).
# Here I use bootstrap to calculate the standard error.
# It's probably the safest option at this stage, and relaxes me from justifying the other options (believe me or not, it would be a complex discussion...)

stacked <- stack(data.frame(v1, v2))
summary(rq(values~ind, data=stacked), se="boot", R=9999)

# Call: rq(formula = values ~ ind, data = stacked)
#
# tau: [1] 0.5
#
# Coefficients:
#             Value    Std. Error t value  Pr(>|t|)
# (Intercept)  0.68712  0.04500   15.26806  0.00000
# indv2       -0.03418  0.06435   -0.53120  0.59540   <--- you can use this test (Value coef. is the difference)
# Warning message:
# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

# EM-means (estimated marginal means) is another name of the well-known in experimental research LS-means (least-square means)
# It's a model-based predicted (estimated) mean. If you know the definition of regression (NO, not the Machine Learning one...)
# then you know that regresion gives you a some function of the data conditional to the predictor.
# For the linear regression it's E(Y|X=x), for the GLM it is link(E(Y|X=x)), and here it's the median(Y|X=x).
# And since the predictor exclusively consists of categorical variables, they form sub-groups in which the (conditional)
# medians are calculated. Ignore the labelling - emmeans here just stand for predicted medians (check numerically if you don't believe me).

(emmeans_model <- emmeans::emmeans(rq(values~ind, data=stacked), ~ind))
# ind emmean     SE  df lower.CL upper.CL
#  v1   0.687 0.0441 998    0.601    0.774
#  v2   0.653 0.0505 998    0.554    0.752

# Confidence level used: 0.95
# Warning message:
# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

# Compare the obtainedm edians. Effectively it's nothing but a between-group comparison of medians.

pairs(emmeans_model)
#  contrast estimate     SE  df t.ratio p.value
# v1 - v2    0.0342 0.0671 998   0.510  0.6105    <- (minor difference from the above Wald's test for the model coeff.; testing over predictions)

# For 2+ groups we can test the coefficients jointly. If we wored with the linear regression model, this would result 1:1 in ANOVA.
stacked <- stack(data.frame(v1, v2,  v3, v4))

emmeans_model <- emmeans::emmeans(rq(values~ind, data=stacked), ~ind))
#  ind emmean     SE   df lower.CL upper.CL
#  v1   0.689 0.0440 1996    0.603    0.775
#  v2   0.653 0.0480 1996    0.559    0.747
#  v3  -0.725 0.0254 1996   -0.775   -0.675
#  v4   1.038 0.1470 1996    0.750    1.326

# Confidence level used: 0.95
# Warning message:
# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

# ANOVA-like table (joint Wald's test); For n-way design this would be the Type-3 analysis
(qrj_test <- emmeans::joint_tests(rq(values~ind, data=stacked)))
#  model term df1  df2 F.ratio p.value
#  ind          3 1996 407.068  <.0001

# Warning message:
# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

# BTW, we can perform various comparisons via vontrasts; here adjusted for multiplicity
# mvt = exact parametric adjustment employing multi-variate t distribution + estimates of the coefficients and variance-covariance(s)
# (numbers may vary a little due to Monte-Carlo integration over the multivariate t distribution; fix the seed or ignore it).

update(emmeans::contrast(emmeans_model, list(v1_v2 = c(1, -1, 0, 0), v2_v4 = c(0, 1,  0, -1)), adjust = "mvt"), infer = c(TRUE, TRUE))
#  contrast estimate     SE   df lower.CL upper.CL t.ratio p.value
#  v1_v2      0.0363 0.0651 1996   -0.109   0.1816   0.558  0.8168
#  v2_v4     -0.3852 0.1546 1996   -0.730  -0.0399  -2.492  0.0252

# Confidence level used: 0.95
# Conf-level adjustment: mvt method for 2 estimates
# P value adjustment: mvt method for 2 tests

******************************************************************************************
# Now the funny part. Let's compare v1 and v2 via quantile regression and Mann-Whitney (Wilcoxon) test
# Notice the difference. MW(W) is about pseudo-median of change. QR is about change in medians

# Difference in medians
median(v1) - median(v2)
# [1] 0.0367213

# The quantile regression says:
update(emmeans::contrast(emmeans_model, list(v1_v2 = c(1, -1, 0, 0)), infer = c(TRUE, TRUE)))
# contrast estimate     SE   df lower.CL upper.CL t.ratio p.value
# v1_v2      0.0363 0.0651 1996  -0.0913    0.164   0.558  0.5766

# Confidence level used: 0.95

# No statistically significant difference (p=0.577)
# Let's confirm it via Brown-Mood test of medians (do NOT confuse it with R's mood.test, which is for DISPERSIONS (variances more/less)!)

stacked <- stack(data.frame(v1, v2))
RVAideMemoire::mood.medtest(values~ind, stacked, exact=FALSE)

#	Mood's median test
#
# data:  values by ind
# X-squared = 0.484, df = 1, p-value = 0.4866

# The test of medians returned p=0.487, quite close to the quantile regression.
# No surprise - they test almost the same (just through different formulation)

# Now let's calculate the median difference
median(v1-v2)
#[1] 0.237302

# Wow! This is quite different from the difference in medians.
# What works for means, i.e. mean(difference) = difference(means) doesn't work (in general) for medians.

# Is this maybe the Hodges-Lehmann (location) estimator of the pseudo-median?
median(outer(v1, v2, "-")) # pseudomedian for the 2-sample case (https://aakinshin.net/posts/r-hodges-lehmann-problems/)
#[1] 0.2063202

# Nope. They're close, but not exactly same (if they were same, there wouldn't be a different name :D)

# Notice it's the same as the MWW returns
> broom::tidy(wilcox.test(v1, v2, conf.int = TRUE))
# A tibble: 1 × 7
#   estimate statistic  p.value conf.low conf.high method                                            alternative
#      <dbl>     <dbl>    <dbl>    <dbl>     <dbl> <chr>                                             <chr>
# 1    0.206    141300 0.000358   0.0917     0.324 Wilcoxon rank sum test with continuity correction two.sided

# ******************************************************************************************
# The Mann-Whitney (Wilcoxon) found statistically significant difference at p<0.001 (!)
# We can clearly see that Mann-Whitney is definitely NOT a test of difference in medians!
# It tests H0: stochastic equivalence vs. H1: stochastic dominance (superiority).
# Look at the histogram of v1 and v2 and you should see the point.
# Notice the medians are almost the same, but the "dominance" is visible on the skewed one.
# ******************************************************************************************

# Confirm Mann-Whitney with its better version, the Brunner-Munzel test (more robust to between-group dispersions)

tidy(brunnermunzel::brunnermunzel.permutation.test(v1, v2))
# A tibble: 1 × 7
#  estimate statistic  p.value parameter conf.low.lower conf.high.upper method
#     <dbl>     <dbl>    <dbl>     <dbl>          <dbl>           <dbl> <chr>
# 1    0.435     -3.57 0.000379      918.          0.399           0.471 Brunner-Munzel Test
# Warning message:
# In brunnermunzel.permutation.test.default(v1, v2) :
#   Sample number is too large. Using 'brunnermunzel.test'

# If you run the test without broom::tidy, you'll get more information. Read it, it's very important:

#  Brunner-Munzel Test
#
# data:  x and y
# Brunner-Munzel Test Statistic = -3.5673, df = 918.41, p-value = 0.0003793
# 95 percent confidence interval:
#  0.3989302 0.4706698
# sample estimates:
# P(X<Y)+.5*P(X=Y)
#          0.4348

# As you can see, the H0 about statistical equivalence (probability that for a pair of randomly chosen observations {v1, v2}
# from samples V1 and V2, the value of v1 <= value of v2 is < 0.5 and its confidence interval excludes 0.5.

# Similar analysis of the RTE (Relative Treatment Effect, related to the stochastic dominance) using the ATS (ANOVA-Type Statistic)
# and WTS (Wald-Type Statistic) confirms this finding

GFD::GFD(values~ind, stacked)
# Call:
# values ~ ind

# Wald-Type Statistic (WTS):
# Test statistic             df        p-value   p-value WTPS
#   2.941036e+01   1.000000e+00   5.856387e-08   0.000000e+00
#
# ANOVA-Type Statistic (ATS):
# Test statistic            df1            df2        p-value
#   2.941036e+01   1.000000e+00   9.917439e+02   7.357885e-08

# ******************************************************************************************
# Now I'm going to show even a more extreme scenario!
# Same kind of right-skewness (log-normal), with numericall exact medians, differing only by dispersions
# On the list in my comment below find a paper saying that variance matters too.
# Notice the result of the MWW and quantile regression (and Brown-Mood test)

set.seed(1234)
v5 <- rlnorm(500, 0, 1)

# We will scale the data to increase dispersion but keep the medians
variance_factor <- 6
median_v5 <- median(v5)
scaled_dev <- (v5 - median_v5) * sqrt(variance_factor)
v6 <- (median_v5 + scaled_dev)

# Are the variances diffent?
var(v5)
# [1] 5.718385
var(v6)
# [1] 34.31031

# What the various (common) tests say about it?
tidy(var.test(v5, v6))
# Multiple parameters; naming those columns num.df, den.df
# A tibble: 1 × 9
#  estimate num.df den.df statistic  p.value conf.low conf.high method                          alternative
#     <dbl>  <int>  <int>     <dbl>    <dbl>    <dbl>     <dbl> <chr>                           <chr>
#1    0.167    499    499     0.167 2.27e-79    0.140     0.199 F test to compare two variances two.sided

tidy(ansari.test(v5, v6))
# A tibble: 1 × 4
#  statistic p.value method              alternative
#      <dbl>   <dbl> <chr>               <chr>
# 1    157454       0 Ansari-Bradley test two.sided

tidy(mood.test(v5, v6))
# A tibble: 1 × 4
#  statistic  p.value method                        alternative
#      <dbl>    <dbl> <chr>                         <chr>
# 1     -14.9 5.81e-50 Mood two-sample test of scale two.sided

tidy(fligner.test(list(v5, v6)))
# A tibble: 1 × 4
#  statistic  p.value parameter method
#      <dbl>    <dbl>     <dbl> <chr>
# 1      129. 8.26e-30         1 Fligner-Killeen test of homogeneity of variances

# Convinced enough about different dispersions?
# Now let's check the medians. Both diffence(medians) and median(difference)

median(v5) - median(v6)
# [1] 0

median(v5 - v6)
# [1] 0

# So, if the Mann-Whitney (-Wilcoxon) tests medians, the p-value should be practically equal to 1. Shouldn't it?
tidy(wilcox.test(v5, v6, conf.int = TRUE))
# A tibble: 1 × 7
#  estimate statistic p.value conf.low conf.high method                                            alternative
#     <dbl>     <dbl>   <dbl>    <dbl>     <dbl> <chr>                                             <chr>
# 1    0.254    135056  0.0277   0.0293     0.455 Wilcoxon rank sum test with continuity correction two.sided

# OUCH!

# Let's use the Brunner-Munzel test accounting for unequal dispersions:
tidy(brunnermunzel::brunnermunzel.test(v5, v6))
# A tibble: 1 × 7
#  estimate statistic p.value parameter conf.low.lower conf.high.upper method
#     <dbl>     <dbl>   <dbl>     <dbl>          <dbl>           <dbl> <chr>
# 1    0.460     -2.08  0.0384      640.          0.422           0.498 Brunner-Munzel Test

# Better (as it should be), but still we have a problem, Houston!

# And what does the quantile regrsssion say?

> # Quantile regression - notice how the p-value is close to 1
stacked <- stack(data.frame(v5, v6))
summary(rq(values~ind, data=stacked), se="boot", R=9999)

# Call: rq(formula = values ~ ind, data = stacked)
#
# tau: [1] 0.5
#
# Coefficients:
#             Value    Std. Error t value  Pr(>|t|)
# (Intercept)  0.97407  0.05720   17.02877  0.00000
# indv6        0.01882  0.14777    0.12734  0.89870
# Warning message:
# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

# p-value = 0.899, well...

# ... but maybe the Brown-Mood test of medians...?

tidy(RVAideMemoire::mood.medtest(values~ind, stacked, exact=FALSE))
# A tibble: 1 × 4
#  statistic p.value parameter method
#      <dbl>   <dbl>     <int> <chr>
# 1         0       1         1 Mood's median test

# nope. The medians were equal and the tests OF MEDIANS can see it even at this sample size (N=500 per group).

# ******************************************************************************************
# OK, but it could be just a type-1 error. If it's controlled by the test at some low rate, like 5%, it's OK!

# Is it?

# I will evaluate 200 examples with same medians and different variances.
# I will be really honest and excluded the cases, where both medians differed by more than my machine epsilon for double precision, OK?
# It's a really SMALL number!
options(scipen = 16)
.Machine$double.eps
# [1] 0.0000000000000002220446

# Ready?

set.seed(1234)
N <- 200
failed_comparisons <- 0

sapply(seq(N), function(i) {

   x1 <- rlnorm(500, 0, 1)
   variance_factor <- 6
   median_x1 <- median(x1)
   scaled_dev <- (x1 - median_x1) * sqrt(variance_factor)
   x2 <- (median_x1 + scaled_dev)

   diff_med <- median(x1) - median(x2)

   if(diff_med <= .Machine$double.eps) {
     wilcox.test(x1, x2)$p.value
   } else {
     print(paste("Ouch! Medians differ by:", diff_med, "which is more than my machine epsilon"))
     failed_comparisons <<- failed_comparisons + 1
   }
}) -> sim_p

# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"

sprintf("Fraction of rejections for same medians: %d / %d (%.2f%%)",
         sum(sim_p <= 0.05),
         N-failed_comparisons,
         100*(sum(sim_p <= 0.05) / (N-failed_comparisons)))

# [1] "Fraction of rejections for same medians: 64 / 193 (33.16%)"

# OUCH! This is FAR beyond "a well controlled type-1 error rate"
# Remember, we did this at honestly practically identical medians
	# Let's make some data to play with

	set.seed(1234)
	v1 <- rexp(500)
	v2 <- rnorm(500) + log(2)
	v3 <- -rgamma(500, 2.5, 3)
	v4 <- runif(500, -2,4)

	# Look at the data
	layout(matrix(c(1:4), nrow = 2))
	hist(v1, xlim=c(-3, 5))
	abline(v = median(v1), col="red", lwd=2)
	hist(v2, xlim=c(-3, 5));
	abline(v = median(v2), col="red", lwd=2)
	hist(v3, xlim=c(-3, 5));
	abline(v = median(v3), col="red", lwd=2)
	hist(v4, xlim=c(-3, 5));
	abline(v = median(v4), col="red", lwd=2)

	# I will start from the quantile regression. Actually it doesn't matter where we will start.

	# For a 2-level categorical predictor variables it's equivalent to comapre 2 medians (test for their differences).
	# Here I use bootstrap to calculate the standard error.
	# It's probably the safest option at this stage, and relaxes me from justifying the other options (believe me or not, it would be a complex discussion...)

	stacked <- stack(data.frame(v1, v2))
	summary(rq(values~ind, data=stacked), se="boot", R=9999)

	# Call: rq(formula = values ~ ind, data = stacked)
	#
	# tau: [1] 0.5
	#
	# Coefficients:
	# Value Std. Error t value Pr(>\|t\|)
	# (Intercept) 0.68712 0.04500 15.26806 0.00000
	# indv2 -0.03418 0.06435 -0.53120 0.59540 <--- you can use this test (Value coef. is the difference)
	# Warning message:
	# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

	# EM-means (estimated marginal means) is another name of the well-known in experimental research LS-means (least-square means)
	# It's a model-based predicted (estimated) mean. If you know the definition of regression (NO, not the Machine Learning one...)
	# then you know that regresion gives you a some function of the data conditional to the predictor.
	# For the linear regression it's E(Y\|X=x), for the GLM it is link(E(Y\|X=x)), and here it's the median(Y\|X=x).
	# And since the predictor exclusively consists of categorical variables, they form sub-groups in which the (conditional)
	# medians are calculated. Ignore the labelling - emmeans here just stand for predicted medians (check numerically if you don't believe me).

	(emmeans_model <- emmeans::emmeans(rq(values~ind, data=stacked), ~ind))
	# ind emmean SE df lower.CL upper.CL
	# v1 0.687 0.0441 998 0.601 0.774
	# v2 0.653 0.0505 998 0.554 0.752

	# Confidence level used: 0.95
	# Warning message:
	# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

	# Compare the obtainedm edians. Effectively it's nothing but a between-group comparison of medians.

	pairs(emmeans_model)
	# contrast estimate SE df t.ratio p.value
	# v1 - v2 0.0342 0.0671 998 0.510 0.6105 <- (minor difference from the above Wald's test for the model coeff.; testing over predictions)

	# For 2+ groups we can test the coefficients jointly. If we wored with the linear regression model, this would result 1:1 in ANOVA.
	stacked <- stack(data.frame(v1, v2, v3, v4))

	emmeans_model <- emmeans::emmeans(rq(values~ind, data=stacked), ~ind))
	# ind emmean SE df lower.CL upper.CL
	# v1 0.689 0.0440 1996 0.603 0.775
	# v2 0.653 0.0480 1996 0.559 0.747
	# v3 -0.725 0.0254 1996 -0.775 -0.675
	# v4 1.038 0.1470 1996 0.750 1.326

	# Confidence level used: 0.95
	# Warning message:
	# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

	# ANOVA-like table (joint Wald's test); For n-way design this would be the Type-3 analysis
	(qrj_test <- emmeans::joint_tests(rq(values~ind, data=stacked)))
	# model term df1 df2 F.ratio p.value
	# ind 3 1996 407.068 <.0001

	# Warning message:
	# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

	# BTW, we can perform various comparisons via vontrasts; here adjusted for multiplicity
	# mvt = exact parametric adjustment employing multi-variate t distribution + estimates of the coefficients and variance-covariance(s)
	# (numbers may vary a little due to Monte-Carlo integration over the multivariate t distribution; fix the seed or ignore it).

	update(emmeans::contrast(emmeans_model, list(v1_v2 = c(1, -1, 0, 0), v2_v4 = c(0, 1, 0, -1)), adjust = "mvt"), infer = c(TRUE, TRUE))
	# contrast estimate SE df lower.CL upper.CL t.ratio p.value
	# v1_v2 0.0363 0.0651 1996 -0.109 0.1816 0.558 0.8168
	# v2_v4 -0.3852 0.1546 1996 -0.730 -0.0399 -2.492 0.0252

	# Confidence level used: 0.95
	# Conf-level adjustment: mvt method for 2 estimates
	# P value adjustment: mvt method for 2 tests

	******************************************************************************************
	# Now the funny part. Let's compare v1 and v2 via quantile regression and Mann-Whitney (Wilcoxon) test
	# Notice the difference. MW(W) is about pseudo-median of change. QR is about change in medians

	# Difference in medians
	median(v1) - median(v2)
	# [1] 0.0367213

	# The quantile regression says:
	update(emmeans::contrast(emmeans_model, list(v1_v2 = c(1, -1, 0, 0)), infer = c(TRUE, TRUE)))
	# contrast estimate SE df lower.CL upper.CL t.ratio p.value
	# v1_v2 0.0363 0.0651 1996 -0.0913 0.164 0.558 0.5766

	# Confidence level used: 0.95

	# No statistically significant difference (p=0.577)
	# Let's confirm it via Brown-Mood test of medians (do NOT confuse it with R's mood.test, which is for DISPERSIONS (variances more/less)!)

	stacked <- stack(data.frame(v1, v2))
	RVAideMemoire::mood.medtest(values~ind, stacked, exact=FALSE)

	# Mood's median test
	#
	# data: values by ind
	# X-squared = 0.484, df = 1, p-value = 0.4866

	# The test of medians returned p=0.487, quite close to the quantile regression.
	# No surprise - they test almost the same (just through different formulation)

	# Now let's calculate the median difference
	median(v1-v2)
	#[1] 0.237302

	# Wow! This is quite different from the difference in medians.
	# What works for means, i.e. mean(difference) = difference(means) doesn't work (in general) for medians.

	# Is this maybe the Hodges-Lehmann (location) estimator of the pseudo-median?
	median(outer(v1, v2, "-")) # pseudomedian for the 2-sample case (https://aakinshin.net/posts/r-hodges-lehmann-problems/)
	#[1] 0.2063202

	# Nope. They're close, but not exactly same (if they were same, there wouldn't be a different name :D)

	# Notice it's the same as the MWW returns
	> broom::tidy(wilcox.test(v1, v2, conf.int = TRUE))
	# A tibble: 1 × 7
	# estimate statistic p.value conf.low conf.high method alternative
	# <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
	# 1 0.206 141300 0.000358 0.0917 0.324 Wilcoxon rank sum test with continuity correction two.sided

	# ******************************************************************************************
	# The Mann-Whitney (Wilcoxon) found statistically significant difference at p<0.001 (!)
	# We can clearly see that Mann-Whitney is definitely NOT a test of difference in medians!
	# It tests H0: stochastic equivalence vs. H1: stochastic dominance (superiority).
	# Look at the histogram of v1 and v2 and you should see the point.
	# Notice the medians are almost the same, but the "dominance" is visible on the skewed one.
	# ******************************************************************************************

	# Confirm Mann-Whitney with its better version, the Brunner-Munzel test (more robust to between-group dispersions)

	tidy(brunnermunzel::brunnermunzel.permutation.test(v1, v2))
	# A tibble: 1 × 7
	# estimate statistic p.value parameter conf.low.lower conf.high.upper method
	# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
	# 1 0.435 -3.57 0.000379 918. 0.399 0.471 Brunner-Munzel Test
	# Warning message:
	# In brunnermunzel.permutation.test.default(v1, v2) :
	# Sample number is too large. Using 'brunnermunzel.test'

	# If you run the test without broom::tidy, you'll get more information. Read it, it's very important:

	# Brunner-Munzel Test
	#
	# data: x and y
	# Brunner-Munzel Test Statistic = -3.5673, df = 918.41, p-value = 0.0003793
	# 95 percent confidence interval:
	# 0.3989302 0.4706698
	# sample estimates:
	# P(X<Y)+.5*P(X=Y)
	# 0.4348

	# As you can see, the H0 about statistical equivalence (probability that for a pair of randomly chosen observations {v1, v2}
	# from samples V1 and V2, the value of v1 <= value of v2 is < 0.5 and its confidence interval excludes 0.5.

	# Similar analysis of the RTE (Relative Treatment Effect, related to the stochastic dominance) using the ATS (ANOVA-Type Statistic)
	# and WTS (Wald-Type Statistic) confirms this finding

	GFD::GFD(values~ind, stacked)
	# Call:
	# values ~ ind

	# Wald-Type Statistic (WTS):
	# Test statistic df p-value p-value WTPS
	# 2.941036e+01 1.000000e+00 5.856387e-08 0.000000e+00
	#
	# ANOVA-Type Statistic (ATS):
	# Test statistic df1 df2 p-value
	# 2.941036e+01 1.000000e+00 9.917439e+02 7.357885e-08

	# ******************************************************************************************
	# Now I'm going to show even a more extreme scenario!
	# Same kind of right-skewness (log-normal), with numericall exact medians, differing only by dispersions
	# On the list in my comment below find a paper saying that variance matters too.
	# Notice the result of the MWW and quantile regression (and Brown-Mood test)

	set.seed(1234)
	v5 <- rlnorm(500, 0, 1)

	# We will scale the data to increase dispersion but keep the medians
	variance_factor <- 6
	median_v5 <- median(v5)
	scaled_dev <- (v5 - median_v5) * sqrt(variance_factor)
	v6 <- (median_v5 + scaled_dev)

	# Are the variances diffent?
	var(v5)
	# [1] 5.718385
	var(v6)
	# [1] 34.31031

	# What the various (common) tests say about it?
	tidy(var.test(v5, v6))
	# Multiple parameters; naming those columns num.df, den.df
	# A tibble: 1 × 9
	# estimate num.df den.df statistic p.value conf.low conf.high method alternative
	# <dbl> <int> <int> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
	#1 0.167 499 499 0.167 2.27e-79 0.140 0.199 F test to compare two variances two.sided

	tidy(ansari.test(v5, v6))
	# A tibble: 1 × 4
	# statistic p.value method alternative
	# <dbl> <dbl> <chr> <chr>
	# 1 157454 0 Ansari-Bradley test two.sided

	tidy(mood.test(v5, v6))
	# A tibble: 1 × 4
	# statistic p.value method alternative
	# <dbl> <dbl> <chr> <chr>
	# 1 -14.9 5.81e-50 Mood two-sample test of scale two.sided

	tidy(fligner.test(list(v5, v6)))
	# A tibble: 1 × 4
	# statistic p.value parameter method
	# <dbl> <dbl> <dbl> <chr>
	# 1 129. 8.26e-30 1 Fligner-Killeen test of homogeneity of variances

	# Convinced enough about different dispersions?
	# Now let's check the medians. Both diffence(medians) and median(difference)

	median(v5) - median(v6)
	# [1] 0

	median(v5 - v6)
	# [1] 0

	# So, if the Mann-Whitney (-Wilcoxon) tests medians, the p-value should be practically equal to 1. Shouldn't it?
	tidy(wilcox.test(v5, v6, conf.int = TRUE))
	# A tibble: 1 × 7
	# estimate statistic p.value conf.low conf.high method alternative
	# <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
	# 1 0.254 135056 0.0277 0.0293 0.455 Wilcoxon rank sum test with continuity correction two.sided

	# OUCH!

	# Let's use the Brunner-Munzel test accounting for unequal dispersions:
	tidy(brunnermunzel::brunnermunzel.test(v5, v6))
	# A tibble: 1 × 7
	# estimate statistic p.value parameter conf.low.lower conf.high.upper method
	# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
	# 1 0.460 -2.08 0.0384 640. 0.422 0.498 Brunner-Munzel Test

	# Better (as it should be), but still we have a problem, Houston!

	# And what does the quantile regrsssion say?

	> # Quantile regression - notice how the p-value is close to 1
	stacked <- stack(data.frame(v5, v6))
	summary(rq(values~ind, data=stacked), se="boot", R=9999)

	# Call: rq(formula = values ~ ind, data = stacked)
	#
	# tau: [1] 0.5
	#
	# Coefficients:
	# Value Std. Error t value Pr(>\|t\|)
	# (Intercept) 0.97407 0.05720 17.02877 0.00000
	# indv6 0.01882 0.14777 0.12734 0.89870
	# Warning message:
	# In rq.fit.br(x, y, tau = tau, ...) : Solution may be nonunique

	# p-value = 0.899, well...

	# ... but maybe the Brown-Mood test of medians...?

	tidy(RVAideMemoire::mood.medtest(values~ind, stacked, exact=FALSE))
	# A tibble: 1 × 4
	# statistic p.value parameter method
	# <dbl> <dbl> <int> <chr>
	# 1 0 1 1 Mood's median test

	# nope. The medians were equal and the tests OF MEDIANS can see it even at this sample size (N=500 per group).

	# ******************************************************************************************
	# OK, but it could be just a type-1 error. If it's controlled by the test at some low rate, like 5%, it's OK!

	# Is it?

	# I will evaluate 200 examples with same medians and different variances.
	# I will be really honest and excluded the cases, where both medians differed by more than my machine epsilon for double precision, OK?
	# It's a really SMALL number!
	options(scipen = 16)
	.Machine$double.eps
	# [1] 0.0000000000000002220446

	# Ready?

	set.seed(1234)
	N <- 200
	failed_comparisons <- 0

	sapply(seq(N), function(i) {

	x1 <- rlnorm(500, 0, 1)
	variance_factor <- 6
	median_x1 <- median(x1)
	scaled_dev <- (x1 - median_x1) * sqrt(variance_factor)
	x2 <- (median_x1 + scaled_dev)

	diff_med <- median(x1) - median(x2)

	if(diff_med <= .Machine$double.eps) {
	wilcox.test(x1, x2)$p.value
	} else {
	print(paste("Ouch! Medians differ by:", diff_med, "which is more than my machine epsilon"))
	failed_comparisons <<- failed_comparisons + 1
	}
	}) -> sim_p

	# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
	# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
	# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
	# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
	# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
	# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"
	# [1] "Ouch! Medians differ by: 0.000000000000000444089209850063 which is more than my machine epsilon"

	sprintf("Fraction of rejections for same medians: %d / %d (%.2f%%)",
	sum(sim_p <= 0.05),
	N-failed_comparisons,
	100*(sum(sim_p <= 0.05) / (N-failed_comparisons)))

	# [1] "Fraction of rejections for same medians: 64 / 193 (33.16%)"

	# OUCH! This is FAR beyond "a well controlled type-1 error rate"
	# Remember, we did this at honestly practically identical medians