ogrisel/README.md

## README.md

      
    Raw
  

              README.md
            
          
    This is an experiment to highlight the dependency of the V-Measure value
on the number of clusters of 2 independent uniform labelings for a finite
number of samples.
Intuitively, it seems that for a finite number of samples the V-Measure
falls victim to some kind of birthday paradox that naive users might not be
aware of.
Even if the maximum number of clusters considered (e.g. 10) is small
with respect to the number of samples (e.g. 5000), the V-Measure of
2 independent uniform labelings still increases noticeably with the
number of clusters in at least one of the labelings.
Careful counting of the number of possible combinations is required to
come up with an adjusted-for-chance variant of V-Measure that would
counter this effect.

  
## uniform_n_samples_100.png

      
    Raw
  

              uniform_n_samples_100.png
            
          
## uniform_n_samples_100_k_a_20.png

      
    Raw
  

              uniform_n_samples_100_k_a_20.png
            
          
## uniform_n_samples_5000.png

      
    Raw
  

              uniform_n_samples_5000.png
            
          
## uniform_n_samples_5000_k_a_5.png

      
    Raw
  

              uniform_n_samples_5000_k_a_5.png
            
          
## v_measure_uniform.py
"""Script demonstrating the lack of adjustment for randomness of V-Measure

Here we draw 2 independent random clusterings, labels_a and labels_b, for
various values of n_samples and k, the number of clusters in the a and b
labelings.

"""
import numpy as np

from sklearn.metrics import v_measure_score


def v_measures_same_k(n_samples=100, k_range=range(2, 100), n_runs=10,
                      seed=42):
    """Score pairs of independent uniform labelings that share the same k.

    For each run and each ``k`` in ``k_range``, two labelings of
    ``n_samples`` points are drawn independently and uniformly from
    ``{0, ..., k - 1}`` and compared with ``v_measure_score``.

    Parameters
    ----------
    n_samples : int
        Number of labels drawn for each labeling.
    k_range : sequence of int
        Numbers of clusters to evaluate; must support ``len()``.
    n_runs : int
        Number of repetitions per value of ``k``.
    seed : int
        Seed passed to ``np.random.RandomState``.

    Returns
    -------
    scores : ndarray of shape (len(k_range), n_runs)
        ``scores[j, i]`` is the V-Measure obtained for ``k_range[j]`` on
        run ``i``.
    """
    rng = np.random.RandomState(seed)
    scores = np.zeros((len(k_range), n_runs))
    for i in range(n_runs):
        for j, k in enumerate(k_range):
            # randint excludes its upper bound, so high=k draws labels in
            # [0, k - 1]; this is the replacement NumPy documents for the
            # deprecated random_integers(low=0, high=k - 1).
            labels_a = rng.randint(0, k, size=n_samples)
            labels_b = rng.randint(0, k, size=n_samples)
            scores[j, i] = v_measure_score(labels_a, labels_b)
    return scores


def v_measures_fixed_k_a(n_samples=100, k_a=10, k_b_range=range(2, 100),
                         n_runs=10, seed=42):
    """Score independent uniform labelings where only one side varies k.

    For each run and each ``k_b`` in ``k_b_range``, labeling ``a`` is drawn
    uniformly from ``{0, ..., k_a - 1}`` and labeling ``b`` uniformly from
    ``{0, ..., k_b - 1}``; the two are compared with ``v_measure_score``.

    Parameters
    ----------
    n_samples : int
        Number of labels drawn for each labeling.
    k_a : int
        Fixed number of clusters used for labeling ``a``.
    k_b_range : sequence of int
        Numbers of clusters to evaluate for labeling ``b``; must support
        ``len()``.
    n_runs : int
        Number of repetitions per value of ``k_b``.
    seed : int
        Seed passed to ``np.random.RandomState``.

    Returns
    -------
    scores : ndarray of shape (len(k_b_range), n_runs)
        ``scores[j, i]`` is the V-Measure obtained for ``k_b_range[j]`` on
        run ``i``.
    """
    rng = np.random.RandomState(seed)
    scores = np.zeros((len(k_b_range), n_runs))
    for i in range(n_runs):
        for j, k_b in enumerate(k_b_range):
            # randint excludes its upper bound, so high=k draws labels in
            # [0, k - 1]; this is the replacement NumPy documents for the
            # deprecated random_integers(low=0, high=k - 1).
            labels_a = rng.randint(0, k_a, size=n_samples)
            labels_b = rng.randint(0, k_b, size=n_samples)
            scores[j, i] = v_measure_score(labels_a, labels_b)
    return scores


if __name__ == '__main__':
    # pyplot replaces the discouraged pylab interface; errorbar, title and
    # show are the same functions in both modules.
    import matplotlib.pyplot as plt

    n_samples = 100
    k_range = range(2, n_samples + 1)

    # Experiment 1: both labelings use the same number of clusters k.
    scores = v_measures_same_k(n_samples=n_samples, k_range=k_range)
    mean = scores.mean(axis=1)  # average over runs, one value per k
    std = scores.std(axis=1)
    plt.errorbar(k_range, mean, yerr=std)
    plt.title("V-Measures for 2 uniform labelings with various centers\n"
              "and fixed total number of samples to label %d." % n_samples)
    plt.xlabel("Number of clusters in both labelings")
    plt.ylabel("V-Measure")
    plt.show()

    # Experiment 2: labeling a keeps k_a clusters while labeling b varies.
    k_a = 20
    scores = v_measures_fixed_k_a(n_samples=n_samples, k_a=k_a,
                                  k_b_range=k_range)
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    plt.errorbar(k_range, mean, yerr=std)
    plt.title("V-Measures for 2 uniform labelings, one with various centers\n"
              "and fixed number of centers %d for the other and\n"
              "total number of samples to label %d." % (k_a, n_samples))
    plt.xlabel("Number of clusters in the varying labeling")
    plt.ylabel("V-Measure")
    plt.show()
	"""Script demonstrating the lack of adjustment for randomness of V-Measure

	Here we draw 2 independent random clusterings, labels_a and labels_b, for
	various values of n_samples and k, the number of clusters in the a and b
	labelings.

	"""
	import numpy as np

	from sklearn.metrics import v_measure_score


	def v_measures_same_k(n_samples=100, k_range=range(2, 100), n_runs=10,
	                      seed=42):
	    """Score pairs of independent uniform labelings that share the same k.

	    For each run and each ``k`` in ``k_range``, two labelings of
	    ``n_samples`` points are drawn independently and uniformly from
	    ``{0, ..., k - 1}`` and compared with ``v_measure_score``.

	    Parameters
	    ----------
	    n_samples : int
	        Number of labels drawn for each labeling.
	    k_range : sequence of int
	        Numbers of clusters to evaluate; must support ``len()``.
	    n_runs : int
	        Number of repetitions per value of ``k``.
	    seed : int
	        Seed passed to ``np.random.RandomState``.

	    Returns
	    -------
	    scores : ndarray of shape (len(k_range), n_runs)
	        ``scores[j, i]`` is the V-Measure obtained for ``k_range[j]`` on
	        run ``i``.
	    """
	    rng = np.random.RandomState(seed)
	    scores = np.zeros((len(k_range), n_runs))
	    for i in range(n_runs):
	        for j, k in enumerate(k_range):
	            # randint excludes its upper bound, so high=k draws labels in
	            # [0, k - 1]; this is the replacement NumPy documents for the
	            # deprecated random_integers(low=0, high=k - 1).
	            labels_a = rng.randint(0, k, size=n_samples)
	            labels_b = rng.randint(0, k, size=n_samples)
	            scores[j, i] = v_measure_score(labels_a, labels_b)
	    return scores


	def v_measures_fixed_k_a(n_samples=100, k_a=10, k_b_range=range(2, 100),
	                         n_runs=10, seed=42):
	    """Score independent uniform labelings where only one side varies k.

	    For each run and each ``k_b`` in ``k_b_range``, labeling ``a`` is drawn
	    uniformly from ``{0, ..., k_a - 1}`` and labeling ``b`` uniformly from
	    ``{0, ..., k_b - 1}``; the two are compared with ``v_measure_score``.

	    Parameters
	    ----------
	    n_samples : int
	        Number of labels drawn for each labeling.
	    k_a : int
	        Fixed number of clusters used for labeling ``a``.
	    k_b_range : sequence of int
	        Numbers of clusters to evaluate for labeling ``b``; must support
	        ``len()``.
	    n_runs : int
	        Number of repetitions per value of ``k_b``.
	    seed : int
	        Seed passed to ``np.random.RandomState``.

	    Returns
	    -------
	    scores : ndarray of shape (len(k_b_range), n_runs)
	        ``scores[j, i]`` is the V-Measure obtained for ``k_b_range[j]`` on
	        run ``i``.
	    """
	    rng = np.random.RandomState(seed)
	    scores = np.zeros((len(k_b_range), n_runs))
	    for i in range(n_runs):
	        for j, k_b in enumerate(k_b_range):
	            # randint excludes its upper bound, so high=k draws labels in
	            # [0, k - 1]; this is the replacement NumPy documents for the
	            # deprecated random_integers(low=0, high=k - 1).
	            labels_a = rng.randint(0, k_a, size=n_samples)
	            labels_b = rng.randint(0, k_b, size=n_samples)
	            scores[j, i] = v_measure_score(labels_a, labels_b)
	    return scores


	if __name__ == '__main__':
	    # pyplot replaces the discouraged pylab interface; errorbar, title and
	    # show are the same functions in both modules.
	    import matplotlib.pyplot as plt

	    n_samples = 100
	    k_range = range(2, n_samples + 1)

	    # Experiment 1: both labelings use the same number of clusters k.
	    scores = v_measures_same_k(n_samples=n_samples, k_range=k_range)
	    mean = scores.mean(axis=1)  # average over runs, one value per k
	    std = scores.std(axis=1)
	    plt.errorbar(k_range, mean, yerr=std)
	    plt.title("V-Measures for 2 uniform labelings with various centers\n"
	              "and fixed total number of samples to label %d." % n_samples)
	    plt.xlabel("Number of clusters in both labelings")
	    plt.ylabel("V-Measure")
	    plt.show()

	    # Experiment 2: labeling a keeps k_a clusters while labeling b varies.
	    k_a = 20
	    scores = v_measures_fixed_k_a(n_samples=n_samples, k_a=k_a,
	                                  k_b_range=k_range)
	    mean = scores.mean(axis=1)
	    std = scores.std(axis=1)
	    plt.errorbar(k_range, mean, yerr=std)
	    plt.title("V-Measures for 2 uniform labelings, one with various centers\n"
	              "and fixed number of centers %d for the other and\n"
	              "total number of samples to label %d." % (k_a, n_samples))
	    plt.xlabel("Number of clusters in the varying labeling")
	    plt.ylabel("V-Measure")
	    plt.show()