Skip to content

Instantly share code, notes, and snippets.

@rwalk
Last active June 8, 2020 03:10
Show Gist options
  • Save rwalk/ffca9efe6cb11bb466fd0aa699175292 to your computer and use it in GitHub Desktop.
Save rwalk/ffca9efe6cb11bb466fd0aa699175292 to your computer and use it in GitHub Desktop.
Sparse matrix representation for goodreads data
import json
from scipy.sparse import coo_matrix, save_npz
import numpy as np
# Build a sparse matrix from the goodreads interactions dump and save it.
#
# Row r of the matrix corresponds to interactions[r]; every value listed in
# that row is used as a column index and contributes 1.0 to cell (r, col).
# Repeated (row, col) pairs are summed by the COO -> CSR conversion.
# NOTE(review): the column indices presumably refer to positions in `books`
# -- confirm against the script that produced interactions.json.
print("reading data...")
# JSON is UTF-8 by specification; do not depend on the platform default.
with open("goodreads/interactions.json", encoding="utf-8") as f:
    payload = json.load(f)
books = payload["books"]
interactions = payload["interactions"]

print("building matrix...")
# setup for COO matrix builder: one (row, col) coordinate per interaction
I, J = [], []
for row_index, row in enumerate(interactions):
    I.extend([row_index] * len(row))
    J.extend(row)

# we need the max column index so we know the width of the matrix
max_col = max(J, default=0)
values = np.ones(len(J), dtype=np.float64)

# Use len(interactions) for the row count instead of the leaked loop
# variable, which is undefined (NameError) when there are no interactions.
U = coo_matrix(
    (values, (I, J)),
    shape=(len(interactions), max_col + 1),
    dtype=np.float64,
).tocsr()

# save raw sparse matrix
print("saving to disk...")
save_npz("goodreads/U.npz", U)

# drop the large intermediates; only `books` and `U` are kept
print("cleaning up...")
del interactions, payload, values, I, J
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment