Sparse matrix representation for goodreads data
import json

import numpy as np
from scipy.sparse import coo_matrix, save_npz

# Build a sparse matrix from the goodreads interaction dump and save it to disk.
#
# Input : goodreads/interactions.json, a JSON object with the keys "books"
#         and "interactions".
# Output: goodreads/U.npz, a CSR matrix with one row per entry of
#         `interactions` and a 1.0 at every column index listed in that entry.
#
# NOTE(review): assumes `interactions` is a list whose items are iterables of
# non-negative integer column indices -- confirm against the dump format.

print("reading data...")
# JSON text is UTF-8; do not rely on the platform's default encoding.
with open("goodreads/interactions.json", encoding="utf-8") as f:
    data = json.load(f)
books = data["books"]
interactions = data["interactions"]

print("building matrix...")
# Setup for the COO matrix builder: parallel lists holding the value, row
# index and column index of every stored entry.
values, I, J = [], [], []
max_col = 0
for i, row in enumerate(interactions):
    for col in row:
        values.append(1.0)
        I.append(i)
        J.append(col)
        # We need the max column index so we may know the size of the matrix.
        if col > max_col:
            max_col = col

# Take the row count from len(interactions) instead of the loop variable `i`:
# `i` is never bound when `interactions` is empty, which used to raise
# NameError here.
# Duplicate (row, col) pairs are summed by scipy when converting COO -> CSR.
U = coo_matrix(
    (values, (I, J)),
    shape=(len(interactions), max_col + 1),
    dtype=np.float64,
).tocsr()

# save raw sparse matrix
print("saving to disk...")
save_npz("goodreads/U.npz", U)

# Drop the large intermediates; `books` and `U` are intentionally kept.
print("cleaning up...")
del interactions, data, values, I, J
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment