Skip to content

Instantly share code, notes, and snippets.

@rwalk
Created June 2, 2020 04:59
Show Gist options
  • Save rwalk/b562c3d66628b516e9b184bf4f0f9df0 to your computer and use it in GitHub Desktop.
Save rwalk/b562c3d66628b516e9b184bf4f0f9df0 to your computer and use it in GitHub Desktop.
import json
import csv
from collections import defaultdict
# Load the mapping (supplied by the dataset) from the internal id used in the
# interactions file to the Goodreads book_id.
# newline="" leaves newline handling to the csv module, as its documentation
# recommends for file objects handed to csv.reader.
with open("goodreads/book_id_map.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    # Each remaining row is an (internal id, book_id) pair of integers.
    book_id_map = {int(_id): int(book_id) for _id, book_id in reader}
#
# FIRST PASS: read through all the interactions and count the number of books
# read by each user and the number of times each book has been read.
#
users_books_read = defaultdict(int)  # user_id -> number of books read
books_read = defaultdict(int)        # book_id -> number of times read
with open("goodreads/goodreads_interactions.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for row in reader:
        # Every column is parsed as an integer; only the first three are used
        # in this pass.
        user_id, _id, is_read, _rating, _is_reviewed = map(int, row)
        if is_read:
            users_books_read[user_id] += 1
            # Count by Goodreads book_id rather than the file's internal id.
            books_read[book_id_map[_id]] += 1
print(f"Users: {len(users_books_read)}, books: {len(books_read)}")
# expect: "Users: 876145, books: 2360650"

# Keep only books that have been read at least 3 times.
books_read = {k: v for k, v in books_read.items() if v > 2}
# Keep only users who have read at least 2 books.
users_books_read = {k: v for k, v in users_books_read.items() if v > 1}
# Create book metadata.
books = []         # metadata records; a record's list position equals its "index"
seq_id = 0         # next sequential index to hand out
seq_book_map = {}  # Goodreads book_id -> sequential index

# Load author info: author_id -> author name.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's locale encoding (which is not UTF-8 everywhere).
authors = {}
with open("goodreads/goodreads_book_authors.json", encoding="utf-8") as f:
    for line in f:  # one JSON object per line
        author = json.loads(line)
        authors[author["author_id"]] = author["name"]

with open("goodreads/goodreads_books.json", encoding="utf-8") as f:
    for line in f:  # one JSON object per line
        book = json.loads(line)
        book_id = int(book["book_id"])
        # Keep only books that survived the read-count filter, and record each
        # book_id once even if it appears on several lines.
        if book_id in books_read and book_id not in seq_book_map:
            books.append({
                "id": book_id,      # original good reads bookid
                "index": seq_id,    # sequential id in our analysis
                "title": book["title"],
                "authors": [authors[entry["author_id"]] for entry in book["authors"]],
            })
            seq_book_map[book_id] = seq_id
            seq_id += 1
#
# SECOND PASS: Create a list of lists. Each inner list corresponds to one user
# and its elements are the sequential indices of the books that user has read.
#
user_ids = set(users_books_read.keys())
book_ids = set(books_read.keys())
with open("goodreads/goodreads_interactions.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    # user_id -> position of that user's list inside `interactions`
    user_indices, user_count = {}, 0
    interactions = []
    for row in reader:
        user_id, _id, is_read, _rating, _is_reviewed = map(int, row)
        # Test is_read first, exactly like the first pass, so the id mapping
        # is only consulted for rows that were actually counted there.
        if not is_read or user_id not in user_ids:
            continue
        # The _id is the internal id field; map it to the book_id ...
        book_id = book_id_map[_id]
        if book_id not in book_ids:
            continue
        # ... and then to the sequential index. A book that passed the
        # read-count filter but has no metadata record has no index, so its
        # interactions are skipped instead of raising KeyError.
        book_index = seq_book_map.get(book_id)
        if book_index is None:
            continue
        if user_id not in user_indices:
            # First retained interaction for this user: open a new list.
            user_indices[user_id] = user_count
            user_count += 1
            interactions.append([])
        interactions[user_indices[user_id]].append(book_index)
#
# Persist the preprocessed data: the per-user lists of book indices together
# with the book metadata records, as a single JSON object.
#
payload = {"interactions": interactions, "books": books}
with open("goodreads/interactions.json", "w") as out_file:
    json.dump(payload, out_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment