Skip to content

Instantly share code, notes, and snippets.

@CooperBond
Created February 19, 2019 15:28
Show Gist options
  • Save CooperBond/fc146702f395c10356c083a8e2bcaab8 to your computer and use it in GitHub Desktop.
Save CooperBond/fc146702f395c10356c083a8e2bcaab8 to your computer and use it in GitHub Desktop.
Matrix factorization examples
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD as SVD
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import mean_squared_error as mse
# Toy item-by-person matrix used by the examples below.
# Row labels (Russian): vegetables, fruits, sweets, bread, coffee.
# Column labels are given names (Misha, Masha, Roma, Dima, Vitya, Vova).
# NOTE(review): the values look like purchase counts or ratings - confirm.
V = pd.DataFrame(
    data=np.array([
        [0, 1, 0, 1, 2, 2],
        [2, 3, 1, 1, 2, 2],
        [1, 1, 1, 0, 1, 1],
        [0, 2, 3, 4, 1, 1],
        [0, 0, 0, 0, 1, 0],
    ]),
    index=['Овощи', 'Фрукты', 'Сладости', 'Хлеб', 'Кофе'],
    columns=['Миша', 'Маша', 'Рома', 'Дима', 'Витя', 'Вова'],
)
def reconstruct(model, data):
    """Fit ``model`` on ``data`` and rebuild ``data`` from the factorization.

    Parameters
    ----------
    model : estimator instance
        An instantiated decomposition model exposing ``fit``, ``transform``
        and ``components_`` (for example an NMF, PCA or TruncatedSVD
        instance). Pass an instance, not the class.
    data : pandas.DataFrame
        Matrix to factorize.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``reconstructed`` - the low-rank approximation of ``data`` rounded to
        two decimals, carrying the index and columns of ``data``.
        ``interpolated`` - ``reconstructed`` with every row min-max scaled to
        the range [0, 1] and rounded to two decimals.
    """
    model.fit(data)
    W = model.transform(data)
    if hasattr(model, "inverse_transform"):
        # Estimators such as PCA centre the data inside ``transform``; only
        # ``inverse_transform`` adds the mean back, a bare W.H product would
        # reconstruct the centred matrix instead of the original one.
        approx = np.asarray(model.inverse_transform(W))
    else:
        approx = np.dot(W, model.components_)
    reconstructed = pd.DataFrame(np.round(approx, 2), columns=data.columns, index=data.index)
    # For SVD or PCA the reconstruction may still contain negative entries;
    # if a non-negative result is required, clip it at the call site with
    # ``reconstructed[reconstructed < 0] = 0``.

    # Min-max scale each row to [0, 1]. Rows are walked positionally so that
    # duplicate index labels cannot turn a row lookup into a sub-frame.
    values = reconstructed.to_numpy(dtype=float)
    scaled = np.zeros_like(values)
    for i, row in enumerate(values):
        lo, hi = row.min(), row.max()
        if lo != hi:
            scaled[i] = np.interp(row, (lo, hi), (0, 1))
        # else: a constant row has no spread; np.interp requires increasing
        # x-coordinates, so such a row is left at 0.0 instead of being
        # passed through with a degenerate (c, c) range.
    interpolated = pd.DataFrame(np.round(scaled, 2), index=reconstructed.index, columns=reconstructed.columns)
    return reconstructed, interpolated
def sparsity(data):
    """Return the fraction of entries in ``data`` that are exactly zero.

    ``data`` must expose a ``size`` attribute (numpy array or DataFrame).
    """
    filled_fraction = np.count_nonzero(data) / data.size
    return 1.0 - filled_fraction
def rel_spars(data, reference=None):
    """Return how much less sparse ``data`` is than a reference matrix.

    Computes ``1 - sparsity(data) / sparsity(reference)``: 0 when both have
    the same share of zeros, 1 when ``data`` has no zeros at all.

    Parameters
    ----------
    data : array or DataFrame
        Matrix whose sparsity is measured.
    reference : array or DataFrame, optional
        Baseline matrix. Defaults to the module-level matrix ``V``.

    Raises
    ------
    ZeroDivisionError
        If ``reference`` contains no zero entries (its sparsity is 0).
    """
    if reference is None:
        # Resolved at call time so the module-level baseline stays the default.
        reference = V
    return 1 - (sparsity(data) / sparsity(reference))
def evaluate_plot(model, data, ks=(1, 2, 3, 4, 5)):
    """Plot reconstruction quality of ``model`` for several component counts.

    For every ``k`` in ``ks`` the model is instantiated as
    ``model(n_components=k)``, fitted once through ``reconstruct``, and four
    curves are drawn: mean squared error, mean absolute error, explained
    variance score and relative sparsity of the reconstruction. The first
    position whose MSE (rounded to one decimal) equals the next one is
    highlighted with markers, value labels and a vertical line.

    Parameters
    ----------
    model : class
        Decomposition class (not an instance) accepting ``n_components``.
    data : pandas.DataFrame
        Matrix to factorize and compare against.
    ks : sequence of int, optional
        Component counts to evaluate; defaults to 1..5.
    """
    ks = list(ks)
    EVS = []
    MSE = []
    MAE = []
    SPARS = []
    for k in ks:
        # Fit once per k so that all four metrics describe the same
        # reconstruction (refitting per metric is wasteful and, for
        # randomized solvers, may score different fits).
        rec = reconstruct(model(n_components=k), data)[0]
        MAE.append(mae(data, rec))
        MSE.append(mse(data, rec))
        EVS.append(evs(data, rec))
        # NOTE: rel_spars measures against its own default baseline.
        SPARS.append(rel_spars(rec))
    plt.xlabel('N - components')
    plt.ylabel('Value')
    plt.plot(MSE)
    plt.plot(MAE)
    plt.plot(EVS)
    plt.plot(SPARS)
    METRIC = np.round(MSE, 1)
    # Compare each value with its successor; stop one short of the end so
    # the last element is never indexed past.
    for i in range(len(METRIC) - 1):
        if METRIC[i] == METRIC[i + 1]:
            plt.scatter(i, MSE[i], c='red')
            plt.scatter(i, EVS[i], c='red')
            plt.scatter(i, SPARS[i], c='green')
            plt.text(i, EVS[i] + 0.01, str(np.round(EVS[i], 3)))
            plt.text(i, SPARS[i] + 0.01, str(np.round(SPARS[i], 3)))
            plt.vlines(i, 0, 1, colors='red')
            break
    # The first curve is the plain mean squared error, so label it as such.
    plt.legend(('MSE', 'MAE', 'EVS', 'SPARSITY'), loc='best')
    # Curves are plotted against positions 0..len(ks)-1; show the k values.
    plt.xticks(list(range(len(ks))), ks)
    plt.show()
model = NMF
# or use other models to see result
#model = PCA
#model = SVD
#usage
evaluate_plot(model, V)
# reconstruct() calls fit/transform on its argument, so it needs an estimator
# instance rather than the class. n_components=3 is only an example rank;
# pick the value highlighted on the plot above.
reconstructed, interpolated = reconstruct(model(n_components=3), V)
print(reconstructed)  # reconstructed
print(interpolated)  # interpolated
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment