Created
February 19, 2019 15:28
-
-
Save CooperBond/fc146702f395c10356c083a8e2bcaab8 to your computer and use it in GitHub Desktop.
Matrix factorization examples
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from sklearn.decomposition import NMF | |
from sklearn.decomposition import PCA | |
from sklearn.decomposition import TruncatedSVD as SVD | |
from sklearn.metrics import mean_absolute_error as mae | |
from sklearn.metrics import explained_variance_score as evs | |
from sklearn.metrics import mean_squared_error as mse | |
V = np.array([[0,1,0,1,2,2], | |
[2,3,1,1,2,2], | |
[1,1,1,0,1,1], | |
[0,2,3,4,1,1], | |
[0,0,0,0,1,0]]) | |
V = pd.DataFrame(V, index = ('Овощи', 'Фрукты', 'Сладости', 'Хлеб', 'Кофе'), | |
columns = ('Миша', 'Маша' ,'Рома', 'Дима', 'Витя', 'Вова')) | |
def reconstruct(model, data):
    """Fit a factorization model and return its low-rank reconstruction.

    Parameters
    ----------
    model : object
        An (unfitted) sklearn-style decomposition model exposing
        ``fit``, ``transform`` and ``components_`` (e.g. NMF, PCA, SVD).
    data : pandas.DataFrame
        The matrix to factorize.

    Returns
    -------
    (reconstructed, interpolated) : tuple of pandas.DataFrame
        ``reconstructed`` is W @ H rounded to 2 decimals, with the same
        index/columns as ``data``; ``interpolated`` is the same matrix
        with each row linearly rescaled to the range [0, 1].
    """
    model.fit(data)
    W = model.transform(data)          # (n_rows, k) row factors
    H = model.components_              # (k, n_cols) column factors
    reconstructed = pd.DataFrame(np.round(np.dot(W, H), 2),
                                 columns=data.columns, index=data.index)
    # In case of SVD or PCA decomposition, uncomment to clip negative
    # reconstructed entries to zero:
    # reconstructed[reconstructed < 0] = 0
    # Row-wise min-max rescaling to [0, 1].
    # NOTE(review): np.interp is undefined for a constant row
    # (row.min() == row.max()) — assumes every row varies; confirm.
    scaled_rows = [np.interp(row, (row.min(), row.max()), (0.0, 1.0))
                   for _, row in reconstructed.iterrows()]
    interpolated = pd.DataFrame(np.round(scaled_rows, 2),
                                index=reconstructed.index,
                                columns=reconstructed.columns)
    return reconstructed, interpolated
def sparsity(data):
    """Return the fraction of entries in *data* that are exactly zero."""
    filled_fraction = np.count_nonzero(data) / data.size
    return 1.0 - filled_fraction
def rel_spars(data):
    """Sparsity of *data* relative to the original matrix ``V``.

    Returns 0 when *data* is exactly as sparse as V; positive values
    mean *data* is denser than V.
    """
    ratio = sparsity(data) / sparsity(V)
    return 1 - ratio
def evaluate_plot(model, data):
    """Plot reconstruction-quality metrics versus number of components.

    Fits ``model(n_components=k)`` for k = 1..5 on *data*, plots MSE,
    MAE, explained variance (EVS) and relative sparsity against k, and
    marks the first k after which the rounded MSE stops improving
    (an elbow heuristic).

    Parameters
    ----------
    model : class
        An sklearn-style decomposition class (NMF, PCA, SVD) accepting
        an ``n_components`` keyword.
    data : pandas.DataFrame
        Matrix to factorize.
    """
    EVS = []
    MSE = []
    MAE = []
    SPARS = []
    ks = [1, 2, 3, 4, 5]
    for k in ks:
        # Fit once per k and reuse the reconstruction for all metrics.
        # (The original refit the model four separate times per k, which
        # is wasteful and, with stochastic initializations, could make
        # the four metrics describe four different fits.)
        reconstructed = reconstruct(model(n_components=k), data)[0]
        MAE.append(mae(data, reconstructed))
        MSE.append(mse(data, reconstructed))
        EVS.append(evs(data, reconstructed))
        SPARS.append(rel_spars(reconstructed))
    plt.xlabel('N - components')
    plt.ylabel('Value')
    plt.plot(MSE)
    plt.plot(MAE)
    plt.plot(EVS)
    plt.plot(SPARS)
    # Mark the first k where the rounded MSE plateaus (equals the next value).
    METRIC = np.round(MSE, 1)
    # len - 1: the original iterated the full range and raised IndexError
    # on METRIC[i + 1] at the last element when no plateau was found.
    for i in range(len(METRIC) - 1):
        if METRIC[i] == METRIC[i + 1]:
            plt.scatter(i, MSE[i], c='red')
            plt.scatter(i, EVS[i], c='red')
            plt.scatter(i, SPARS[i], c='green')
            plt.text(i, EVS[i] + 0.01, str(np.round(EVS[i], 3)))
            plt.text(i, SPARS[i] + 0.01, str(np.round(SPARS[i], 3)))
            plt.vlines(i, 0, 1, colors='red')
            break
    # 'MSE': the first curve plotted is MSE; the original legend
    # mislabeled it 'RMSE'.
    plt.legend(('MSE', 'MAE', 'EVS', 'SPARSITY'), loc='best')
    plt.xticks([0, 1, 2, 3, 4], ks)
    plt.show()
# Pick a factorization model class; swap in PCA or SVD to compare results.
model = NMF
#model = PCA
#model = SVD

# usage
evaluate_plot(model, V)
# BUG FIX: reconstruct() must receive an instantiated model, not the class
# itself — model.fit(data) on the bare class raises a TypeError. Also fit
# once and unpack, instead of calling reconstruct (a full refit) twice.
reconstructed, interpolated = reconstruct(model(n_components=2), V)
print(reconstructed)   # reconstructed matrix
print(interpolated)    # row-wise [0, 1] interpolation
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment