Skip to content

Instantly share code, notes, and snippets.

@rwalk

rwalk/goodreads_matrix.py

Last active Jun 8, 2020
Embed
What would you like to do?
Sparse matrix representation for Goodreads data
import json
from scipy.sparse import coo_matrix, save_npz
import numpy as np

# Build a sparse 0/1 interaction matrix from goodreads/interactions.json and
# save it in CSR form to goodreads/U.npz.
#
# The input JSON object must have the keys "books" and "interactions".
# "interactions" is iterated as a sequence of rows, where each row is an
# iterable of non-negative integer column indices; every listed (row, column)
# pair becomes an entry with value 1.0.
# NOTE(review): presumably one row per user and one column per book -- confirm
# against whatever produces interactions.json.

print("reading data...")
# JSON text is UTF-8 by specification, so do not depend on the locale default.
with open("goodreads/interactions.json", encoding="utf-8") as f:
    data = json.load(f)
books = data["books"]  # not used below; kept so a missing key still fails early
interactions = data["interactions"]

print("building matrix...")
# setup for COO matrix builder: parallel lists of values, row and column indices
# (note: `data` is rebound here from the parsed JSON to the list of values)
data, I, J = [], [], []
max_col = 0
for i, row in enumerate(interactions):
    for col in row:
        data.append(1.0)
        I.append(i)
        J.append(col)
        if col > max_col:
            # we need the max column index so we know the width of the matrix
            max_col = col

# Take the row count from len() instead of the leaked loop variable `i`, which
# is undefined when `interactions` is empty. Repeated (row, column) pairs are
# summed by the COO -> CSR conversion.
U = coo_matrix(
    (data, (I, J)),
    shape=(len(interactions), max_col + 1),
    dtype=np.float64,
).tocsr()

# save raw sparse matrix
print("saving to disk...")
save_npz("goodreads/U.npz", U)

print("cleaning up...")
# drop the large intermediates; only the CSR matrix U is still needed
del interactions, data, I, J
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.