Vchekryzhov/gist:7b10ac161d9c3a0d61c57e209b7a8a05

## gistfile1.txt
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import mean_squared_error as mse


# Demo matrix to factorize: rows are labelled with food categories and
# columns with first names (the labels are Russian strings).
# NOTE(review): the integer cells look like per-person counts -- confirm.
V = pd.DataFrame(
    np.array([
        [0, 1, 0, 1, 2, 2],
        [2, 3, 1, 1, 2, 2],
        [1, 1, 1, 0, 1, 1],
        [0, 2, 3, 4, 1, 1],
        [0, 0, 0, 0, 1, 0],
    ]),
    index=('Овощи', 'Фрукты', 'Сладости', 'Хлеб', 'Кофе'),
    columns=('Миша', 'Маша', 'Рома', 'Дима', 'Витя', 'Вова'),
)

def reconstruct(model, data, clip_negative=False):
    """Fit ``model`` on ``data`` and rebuild ``data`` from the factorization.

    Args:
        model: Unfitted decomposition *instance* exposing ``fit``,
            ``transform`` and ``components_`` (and, optionally,
            ``inverse_transform``).
        data: pandas DataFrame to decompose; its index and columns label
            both returned frames.
        clip_negative: When true, negative reconstructed values are replaced
            by zero before rounding (intended for SVD/PCA, whose
            reconstructions may contain negative entries).

    Returns:
        Tuple ``(reconstructed, interpolated)`` of DataFrames shaped like
        ``data``: the reconstruction rounded to 2 decimals, and the same
        values rescaled row by row onto ``[0, 1]`` (also rounded to 2).
    """
    model.fit(data)
    W = model.transform(data)
    # Prefer the model's own inverse mapping: a bare W @ H drops whatever
    # ``transform`` removed beforehand (e.g. the mean that PCA subtracts).
    if hasattr(model, "inverse_transform"):
        approx = model.inverse_transform(W)
    else:
        approx = np.dot(W, model.components_)
    approx = np.asarray(approx, dtype=float)
    if clip_negative:
        approx = np.maximum(approx, 0)
    reconstructed = pd.DataFrame(
        np.round(approx, 2), columns=data.columns, index=data.index
    )
    # Min-max rescale every row independently onto [0, 1].
    scaled_rows = [
        np.interp(row, (row.min(), row.max()), (0, 1))
        for row in reconstructed.to_numpy()
    ]
    interpolated = pd.DataFrame(
        np.round(scaled_rows, 2),
        index=reconstructed.index,
        columns=reconstructed.columns,
    )
    return reconstructed, interpolated

def sparsity(data):
    """Return the fraction of entries in ``data`` that are exactly zero.

    ``data`` must expose ``.size`` (e.g. a NumPy array or pandas DataFrame).
    """
    nonzero_share = np.count_nonzero(data) / data.size
    return 1.0 - nonzero_share


def rel_spars(data, reference=None):
    """Return ``1 - sparsity(data) / sparsity(reference)``.

    Args:
        data: Matrix whose sparsity is being compared (array or DataFrame).
        reference: Baseline matrix. Defaults to the module-level ``V`` so
            existing one-argument calls keep their behaviour.

    Returns:
        float: 0 when both matrices are equally sparse, 1 when ``data`` has
        no zero entries at all, negative when ``data`` is sparser than the
        reference.

    Raises:
        ZeroDivisionError: If the reference contains no zero entries.
    """
    if reference is None:
        reference = V
    return 1 - (sparsity(data) / sparsity(reference))


def evaluate_plot(model, data, ks=(1, 2, 3, 4, 5)):
    """Plot reconstruction-quality metrics against the number of components.

    For every ``k`` in ``ks`` a fresh ``model(n_components=k)`` is fitted via
    ``reconstruct`` and the reconstruction is scored with MSE, MAE, explained
    variance and relative sparsity (``rel_spars``). The first ``k`` whose MSE,
    rounded to one decimal, equals that of the following ``k`` is highlighted.

    Args:
        model: Callable (e.g. an estimator class) that accepts
            ``n_components`` and returns an object usable by ``reconstruct``.
        data: pandas DataFrame to decompose.
        ks: Component counts to evaluate, in plotting order.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    ks = list(ks)
    MSE, MAE, EVS, SPARS = [], [], [], []
    for k in ks:
        # Fit once per k and score that single reconstruction with every
        # metric, instead of refitting the model for each metric.
        reconstructed = reconstruct(model(n_components=k), data)[0]
        MAE.append(mae(data, reconstructed))
        MSE.append(mse(data, reconstructed))
        EVS.append(evs(data, reconstructed))
        SPARS.append(rel_spars(reconstructed))

    plt.xlabel('N - components')
    plt.ylabel('Value')
    # The plotted error is the mean squared error, so it is labelled MSE.
    plt.plot(MSE, label='MSE')
    plt.plot(MAE, label='MAE')
    plt.plot(EVS, label='EVS')
    plt.plot(SPARS, label='SPARSITY')

    rounded_mse = np.round(MSE, 1)
    # Stop one short of the end: each position is compared with its successor.
    for i in range(len(rounded_mse) - 1):
        if rounded_mse[i] == rounded_mse[i + 1]:
            plt.scatter(i, MSE[i], c='red')
            plt.scatter(i, EVS[i], c='red')
            plt.scatter(i, SPARS[i], c='green')
            plt.text(i, EVS[i] + 0.01, str(np.round(EVS[i], 3)))
            plt.text(i, SPARS[i] + 0.01, str(np.round(SPARS[i], 3)))
            plt.vlines(i, 0, 1, colors='red')
            break

    plt.legend(loc='best')
    # Series are plotted against positions 0..len(ks)-1; label them with k.
    plt.xticks(list(range(len(ks))), ks)
    plt.show()


# Decomposition class to evaluate; swap in one of the alternatives below
# to compare results.
model = NMF
# model = PCA
# model = SVD

# usage
evaluate_plot(model, V)
# reconstruct() calls fit/transform on its argument, so it needs an estimator
# instance rather than the class itself. n_components=2 is only an
# illustrative choice -- pick the value suggested by the plot above.
reconstructed, interpolated = reconstruct(model(n_components=2), V)
print(reconstructed)  # reconstructed
print(interpolated)  # interpolated
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.decomposition import NMF
	from sklearn.decomposition import PCA
	from sklearn.decomposition import TruncatedSVD as SVD
	from sklearn.metrics import mean_absolute_error as mae
	from sklearn.metrics import explained_variance_score as evs
	from sklearn.metrics import mean_squared_error as mse


	# Demo matrix to factorize: rows are labelled with food categories and
	# columns with first names (the labels are Russian strings).
	# NOTE(review): the integer cells look like per-person counts -- confirm.
	V = pd.DataFrame(
		np.array([
			[0, 1, 0, 1, 2, 2],
			[2, 3, 1, 1, 2, 2],
			[1, 1, 1, 0, 1, 1],
			[0, 2, 3, 4, 1, 1],
			[0, 0, 0, 0, 1, 0],
		]),
		index=('Овощи', 'Фрукты', 'Сладости', 'Хлеб', 'Кофе'),
		columns=('Миша', 'Маша', 'Рома', 'Дима', 'Витя', 'Вова'),
	)

	def reconstruct(model, data, clip_negative=False):
		"""Fit ``model`` on ``data`` and rebuild ``data`` from the factorization.

		Args:
			model: Unfitted decomposition *instance* exposing ``fit``,
				``transform`` and ``components_`` (and, optionally,
				``inverse_transform``).
			data: pandas DataFrame to decompose; its index and columns label
				both returned frames.
			clip_negative: When true, negative reconstructed values are
				replaced by zero before rounding (intended for SVD/PCA, whose
				reconstructions may contain negative entries).

		Returns:
			Tuple ``(reconstructed, interpolated)`` of DataFrames shaped like
			``data``: the reconstruction rounded to 2 decimals, and the same
			values rescaled row by row onto ``[0, 1]`` (also rounded to 2).
		"""
		model.fit(data)
		W = model.transform(data)
		# Prefer the model's own inverse mapping: a bare W @ H drops whatever
		# ``transform`` removed beforehand (e.g. the mean that PCA subtracts).
		if hasattr(model, "inverse_transform"):
			approx = model.inverse_transform(W)
		else:
			approx = np.dot(W, model.components_)
		approx = np.asarray(approx, dtype=float)
		if clip_negative:
			approx = np.maximum(approx, 0)
		reconstructed = pd.DataFrame(
			np.round(approx, 2), columns=data.columns, index=data.index
		)
		# Min-max rescale every row independently onto [0, 1].
		scaled_rows = [
			np.interp(row, (row.min(), row.max()), (0, 1))
			for row in reconstructed.to_numpy()
		]
		interpolated = pd.DataFrame(
			np.round(scaled_rows, 2),
			index=reconstructed.index,
			columns=reconstructed.columns,
		)
		return reconstructed, interpolated

	def sparsity(data):
		"""Return the fraction of entries in ``data`` that are exactly zero.

		``data`` must expose ``.size`` (e.g. a NumPy array or pandas DataFrame).
		"""
		nonzero_share = np.count_nonzero(data) / data.size
		return 1.0 - nonzero_share


	def rel_spars(data, reference=None):
		"""Return ``1 - sparsity(data) / sparsity(reference)``.

		Args:
			data: Matrix whose sparsity is being compared (array or DataFrame).
			reference: Baseline matrix. Defaults to the module-level ``V`` so
				existing one-argument calls keep their behaviour.

		Returns:
			float: 0 when both matrices are equally sparse, 1 when ``data``
			has no zero entries at all, negative when ``data`` is sparser
			than the reference.

		Raises:
			ZeroDivisionError: If the reference contains no zero entries.
		"""
		if reference is None:
			reference = V
		return 1 - (sparsity(data) / sparsity(reference))


	def evaluate_plot(model, data, ks=(1, 2, 3, 4, 5)):
		"""Plot reconstruction-quality metrics against the number of components.

		For every ``k`` in ``ks`` a fresh ``model(n_components=k)`` is fitted
		via ``reconstruct`` and the reconstruction is scored with MSE, MAE,
		explained variance and relative sparsity (``rel_spars``). The first
		``k`` whose MSE, rounded to one decimal, equals that of the following
		``k`` is highlighted.

		Args:
			model: Callable (e.g. an estimator class) that accepts
				``n_components`` and returns an object usable by
				``reconstruct``.
			data: pandas DataFrame to decompose.
			ks: Component counts to evaluate, in plotting order.

		Returns:
			None. The figure is displayed with ``plt.show()``.
		"""
		ks = list(ks)
		MSE, MAE, EVS, SPARS = [], [], [], []
		for k in ks:
			# Fit once per k and score that single reconstruction with every
			# metric, instead of refitting the model for each metric.
			reconstructed = reconstruct(model(n_components=k), data)[0]
			MAE.append(mae(data, reconstructed))
			MSE.append(mse(data, reconstructed))
			EVS.append(evs(data, reconstructed))
			SPARS.append(rel_spars(reconstructed))

		plt.xlabel('N - components')
		plt.ylabel('Value')
		# The plotted error is the mean squared error, so it is labelled MSE.
		plt.plot(MSE, label='MSE')
		plt.plot(MAE, label='MAE')
		plt.plot(EVS, label='EVS')
		plt.plot(SPARS, label='SPARSITY')

		rounded_mse = np.round(MSE, 1)
		# Stop one short of the end: each position is compared with its
		# successor.
		for i in range(len(rounded_mse) - 1):
			if rounded_mse[i] == rounded_mse[i + 1]:
				plt.scatter(i, MSE[i], c='red')
				plt.scatter(i, EVS[i], c='red')
				plt.scatter(i, SPARS[i], c='green')
				plt.text(i, EVS[i] + 0.01, str(np.round(EVS[i], 3)))
				plt.text(i, SPARS[i] + 0.01, str(np.round(SPARS[i], 3)))
				plt.vlines(i, 0, 1, colors='red')
				break

		plt.legend(loc='best')
		# Series are plotted against positions 0..len(ks)-1; label them with k.
		plt.xticks(list(range(len(ks))), ks)
		plt.show()


	# Decomposition class to evaluate; swap in one of the alternatives below
	# to compare results.
	model = NMF
	# model = PCA
	# model = SVD

	# usage
	evaluate_plot(model, V)
	# reconstruct() calls fit/transform on its argument, so it needs an
	# estimator instance rather than the class itself. n_components=2 is only
	# an illustrative choice -- pick the value suggested by the plot above.
	reconstructed, interpolated = reconstruct(model(n_components=2), V)
	print(reconstructed)  # reconstructed
	print(interpolated)  # interpolated