Skip to content

Instantly share code, notes, and snippets.

@mehdidc
Last active February 24, 2024 15:13
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save mehdidc/8a0bb21a31c43b0cbbdd31d75929b5e4 to your computer and use it in GitHub Desktop.
Save mehdidc/8a0bb21a31c43b0cbbdd31d75929b5e4 to your computer and use it in GitHub Desktop.
"""
Implementation of 'Maximum Likelihood Estimation of Intrinsic Dimension' by Elizaveta Levina and Peter J. Bickel
how to use
----------
The goal is to estimate intrinsic dimensionality of data, the estimation of dimensionality is scale dependent
(depending on how much you zoom into the data distribution you can find different dimensionality), so they
propose to average it over different scales, the interval of the scales [k1, k2] are the only parameters of the algorithm.
This code also provides a way to repeat the estimation with bootstrapping to estimate uncertainty.
Here is one example with swiss roll :
from sklearn.datasets import make_swiss_roll
X, _ = make_swiss_roll(1000)
k1 = 10 # start of interval(included)
k2 = 20 # end of interval(included)
intdim_k_repeated = repeated(intrinsic_dim_scale_interval,
X,
mode='bootstrap',
nb_iter=500, # nb_iter for bootstrapping
verbose=1,
k1=k1, k2=k2)
intdim_k_repeated = np.array(intdim_k_repeated)
# the shape of intdim_k_repeated is (nb_iter, size_of_interval) where
# nb_iter is number of bootstrap iterations (here 500) and size_of_interval
# is (k2 - k1 + 1).
# Plotting the histogram of intrinsic dimensionality estimations repeated over
# nb_iter experiments
plt.hist(intdim_k_repeated.mean(axis=1))
"""
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
def intrinsic_dim_sample_wise(X, k=5):
    """Per-sample maximum-likelihood estimate of intrinsic dimension.

    Implements the Levina-Bickel MLE with the k-2 normalization
    (the bias-corrected form), evaluated independently at each row of X.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Data points. Must contain no duplicate rows (zero neighbor
        distances would make the log-ratios undefined).
    k : int
        Number of nearest neighbors used at each point; must be > 2.

    Returns
    -------
    array of shape (n_samples,)
        One dimensionality estimate per data point.
    """
    knn = NearestNeighbors(n_neighbors=k + 1).fit(X)
    distances, _ = knn.kneighbors(X)
    # Column 0 is each point's zero distance to itself; keep the k
    # genuine nearest-neighbor distances T_1..T_k (sorted ascending).
    distances = distances[:, 1:k + 1]
    assert distances.shape == (X.shape[0], k)
    assert np.all(distances > 0)
    # log(T_k / T_j) for j = 1..k-1; the inverse of their mean
    # (normalized by k-2 rather than k-1) is the corrected MLE.
    log_ratios = np.log(distances[:, -1:] / distances[:, :-1])
    inv_dim = log_ratios.sum(axis=1) / (k - 2)
    return 1. / inv_dim
def intrinsic_dim_scale_interval(X, k1=10, k2=20):
    """Average per-sample dimension estimates over a range of scales.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Data points; duplicate rows are dropped first (bootstrapped
        samples repeat rows, which would give zero neighbor distances).
    k1, k2 : int
        Inclusive interval of neighborhood sizes to evaluate.

    Returns
    -------
    list of float of length (k2 - k1 + 1)
        Mean dimensionality estimate for each k in [k1, k2].
    """
    X = pd.DataFrame(X).drop_duplicates().values
    return [intrinsic_dim_sample_wise(X, k).mean()
            for k in range(k1, k2 + 1)]
def repeated(func, X, nb_iter=100, random_state=None, verbose=0, mode='bootstrap', **func_kw):
    """Run `func` repeatedly on resampled versions of X.

    Parameters
    ----------
    func : callable
        Called as ``func(Xr, **func_kw)`` on each resampled array.
    X : array of shape (n_samples, ...)
        Data to resample along the first axis.
    nb_iter : int
        Number of repetitions.
    random_state : int or None
        Seed for a dedicated RandomState; None uses the global numpy RNG.
    verbose : int
        If > 0, show a tqdm progress bar.
    mode : {'bootstrap', 'shuffle', 'same'}
        'bootstrap' draws n_samples rows with replacement, 'shuffle'
        permutes the rows, 'same' passes X through unchanged.
    **func_kw
        Extra keyword arguments forwarded to `func`.

    Returns
    -------
    list of length nb_iter with the successive results of `func`.

    Raises
    ------
    ValueError
        If `mode` is not one of the three recognized values.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_samples = X.shape[0]
    iterator = range(nb_iter)
    if verbose > 0:
        iterator = tqdm(iterator)
    outcomes = []
    for _ in iterator:
        if mode == 'bootstrap':
            # Sample row indices with replacement.
            resampled = X[rng.randint(0, n_samples, size=n_samples)]
        elif mode == 'shuffle':
            order = np.arange(n_samples)
            rng.shuffle(order)
            resampled = X[order]
        elif mode == 'same':
            resampled = X
        else:
            raise ValueError('unknown mode : {}'.format(mode))
        outcomes.append(func(resampled, **func_kw))
    return outcomes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment