Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Vchekryzhov/7b10ac161d9c3a0d61c57e209b7a8a05 to your computer and use it in GitHub Desktop.
Save Vchekryzhov/7b10ac161d9c3a0d61c57e209b7a8a05 to your computer and use it in GitHub Desktop.
Matrix factorization examples
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import mean_squared_error as mse
# Example matrix: rows are product categories, columns are customers.
V = pd.DataFrame(
    np.array([
        [0, 1, 0, 1, 2, 2],
        [2, 3, 1, 1, 2, 2],
        [1, 1, 1, 0, 1, 1],
        [0, 2, 3, 4, 1, 1],
        [0, 0, 0, 0, 1, 0],
    ]),
    index=('Овощи', 'Фрукты', 'Сладости', 'Хлеб', 'Кофе'),
    columns=('Миша', 'Маша', 'Рома', 'Дима', 'Витя', 'Вова'),
)
def reconstruct(model, data):
    """Fit ``model`` on ``data`` and rebuild the matrix from its factors.

    Parameters
    ----------
    model : estimator instance
        Object exposing ``fit``, ``transform`` and ``components_`` (for
        example an instantiated sklearn NMF, PCA or TruncatedSVD). It is
        fitted in place.
    data : pandas.DataFrame
        Matrix to factorize.

    Returns
    -------
    reconstructed : pandas.DataFrame
        Approximation of ``data`` rounded to 2 decimals, carrying the same
        index and columns as ``data``.
    interpolated : pandas.DataFrame
        ``reconstructed`` with every row linearly rescaled to the range
        [0, 1], rounded to 2 decimals.
    """
    model.fit(data)
    W = model.transform(data)
    # Prefer the model's own inverse mapping: a mean-centering model such as
    # PCA needs its mean added back, which a bare W @ H product leaves out.
    # For models without centering the two computations coincide.
    if hasattr(model, "inverse_transform"):
        approx = model.inverse_transform(W)
    else:
        approx = np.dot(W, model.components_)
    reconstructed = pd.DataFrame(
        np.round(np.asarray(approx), 2),
        columns=data.columns,
        index=data.index,
    )
    # In case of SVD or PCA decomposition, enable the next line to substitute
    # negative elements with zeros.
    # reconstructed[reconstructed < 0] = 0

    # Rescale each row to [0, 1]. Iterating rows positionally (instead of
    # looking them up by label) keeps this correct for duplicate index labels.
    # NOTE(review): a row whose values are all equal has min == max, which
    # np.interp does not define a meaningful result for - confirm whether
    # such rows need special handling.
    scaled_rows = [
        np.interp(row, (row.min(), row.max()), (0, 1))
        for _, row in reconstructed.iterrows()
    ]
    interpolated = pd.DataFrame(
        np.round(scaled_rows, 2),
        index=reconstructed.index,
        columns=reconstructed.columns,
    )
    return reconstructed, interpolated
def sparsity(data):
    """Return the fraction of entries in ``data`` that are zero.

    ``data`` must expose ``.size`` (e.g. a NumPy array or pandas DataFrame).
    """
    nonzero_share = np.count_nonzero(data) / data.size
    return 1.0 - nonzero_share
def rel_spars(data):
    """Return one minus the ratio of ``data``'s sparsity to that of ``V``.

    ``V`` is the module-level reference matrix; the result is 0 when both
    matrices have the same share of zero entries.
    """
    own = sparsity(data)
    baseline = sparsity(V)
    ratio = own / baseline
    return 1 - ratio
def evaluate_plot(model, data):
    """Plot reconstruction metrics of ``model`` on ``data`` for 1-5 components.

    Parameters
    ----------
    model : callable
        Estimator class (not an instance); it is called as
        ``model(n_components=k)`` for every ``k`` in 1..5.
    data : pandas.DataFrame
        Matrix to factorize and compare the reconstructions against.

    For every ``k`` the matrix is reconstructed once and four curves are
    drawn: mean squared error, mean absolute error, explained variance score
    and the relative sparsity of the reconstruction. The first ``k`` at which
    the MSE, rounded to one decimal, equals the value at the next ``k`` is
    highlighted with markers, value labels and a vertical line.
    """
    ks = [1, 2, 3, 4, 5]
    EVS = []
    MSE = []
    MAE = []
    SPARS = []
    for k in ks:
        # Fit once per k and reuse the result, so that all four metrics
        # describe the same reconstruction.
        approx = reconstruct(model(n_components=k), data)[0]
        MAE.append(mae(data, approx))
        MSE.append(mse(data, approx))
        EVS.append(evs(data, approx))
        SPARS.append(rel_spars(approx))

    plt.xlabel('N - components')
    plt.ylabel('Value')
    # The plotted error is the plain mean squared error, so label it as such.
    plt.plot(MSE, label='MSE')
    plt.plot(MAE, label='MAE')
    plt.plot(EVS, label='EVS')
    plt.plot(SPARS, label='SPARSITY')

    METRIC = np.round(MSE, 1)
    # Compare each value with its successor; stopping one short of the end
    # keeps METRIC[i + 1] in bounds when no plateau exists.
    for i in range(len(METRIC) - 1):
        if METRIC[i] == METRIC[i + 1]:
            plt.scatter(i, MSE[i], c='red')
            plt.scatter(i, EVS[i], c='red')
            plt.scatter(i, SPARS[i], c='green')
            plt.text(i, EVS[i] + 0.01, str(np.round(EVS[i], 3)))
            plt.text(i, SPARS[i] + 0.01, str(np.round(SPARS[i], 3)))
            plt.vlines(i, 0, 1, colors='red')
            break

    plt.legend(loc='best')
    # Curves are plotted against positions 0..len(ks)-1; show the k values.
    plt.xticks(list(range(len(ks))), ks)
    plt.show()
# Estimator class under evaluation; evaluate_plot instantiates it per k.
model = NMF
# or use other models to see result
#model = PCA
#model = SVD
#usage
evaluate_plot(model, V)
# reconstruct() calls fit/transform on its argument, so it needs an estimator
# instance rather than the class itself.
# NOTE(review): n_components=2 is an example value - set it to the component
# count highlighted by the plot above.
reconstructed, interpolated = reconstruct(model(n_components=2), V)
print(reconstructed) #reconstructed
print(interpolated) #interpolated
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment