import json | |
import csv | |
from collections import defaultdict | |
# Build the lookup table (supplied by the dataset) that translates the id in
# the first CSV column into the book_id in the second column.
book_id_map = {}
with open("goodreads/book_id_map.csv") as f:
    reader = csv.reader(f)
    next(reader)  # discard the header row
    for internal_id, goodreads_id in reader:
        book_id_map[int(internal_id)] = int(goodreads_id)
#
# FIRST PASS: read through all the interactions and count the number of books
# read by each user and the number of times each book has been read.
#
users_books_read = defaultdict(int)  # user_id -> number of rows marked as read
books_read = defaultdict(int)  # book_id -> number of rows marked as read
with open("goodreads/goodreads_interactions.csv") as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for row in reader:
        # Every row must have exactly five columns. Only the first three are
        # used here, so the last two are left unparsed.
        user_id, _id, is_read, _rating, _is_reviewed = row
        if not int(is_read):
            continue
        users_books_read[int(user_id)] += 1
        # The interactions file carries the id used as key in book_id_map;
        # translate it to the book_id before counting.
        books_read[book_id_map[int(_id)]] += 1
print(f"Users: {len(users_books_read)}, books: {len(books_read)}")
# expect: "Users: 876145, books: 2360650"
# Keep only the books that have been read at least 3 times.
books_read = {
    book_id: times_read
    for book_id, times_read in books_read.items()
    if times_read >= 3
}
# Keep only the users who have read at least 2 books.
users_books_read = {
    user_id: num_read
    for user_id, num_read in users_books_read.items()
    if num_read >= 2
}
# Load author info: author_id -> author name, one JSON document per line.
authors = {}
with open("goodreads/goodreads_book_authors.json") as f:
    for line in f:
        record = json.loads(line)
        authors[record["author_id"]] = record["name"]

# Create book metadata for every book that survived the read-count filter.
books = []  # metadata dicts, appended so that books[i]["index"] == i
seq_book_map = {}  # book_id -> sequential index into `books`
seq_id = 0  # next sequential index to hand out
with open("goodreads/goodreads_books.json") as f:
    for line in f:
        book = json.loads(line)
        book_id = int(book["book_id"])
        # Skip books that were filtered out, as well as repeated entries for
        # a book that has already been given an index.
        if book_id not in books_read or book_id in seq_book_map:
            continue
        author_names = [authors[entry["author_id"]] for entry in book["authors"]]
        books.append({
            "id": book_id,  # original Goodreads book_id
            "index": seq_id,  # sequential id in our analysis
            "title": book["title"],
            "authors": author_names,
        })
        seq_book_map[book_id] = seq_id
        seq_id += 1
#
# SECOND PASS: Create a list of lists. Each inner list corresponds to a user
# and its elements are the sequential indices of the books that user has read.
#
# NOTE(review): users were selected on their read count *before* rarely-read
# books were removed, so a user's list here can end up with fewer than 2
# entries. Confirm whether that is intended.
#
user_indices = {}  # user_id -> position of that user's list in `interactions`
interactions = []
missing_metadata = 0  # read rows dropped because the book has no metadata entry
with open("goodreads/goodreads_interactions.csv") as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for row in reader:
        # Every row must have exactly five columns; only the first three are used.
        user_id, _id, is_read, _rating, _is_reviewed = row
        if not int(is_read):
            continue
        user_id = int(user_id)
        # Dict membership is already O(1), so no separate set copy is needed.
        if user_id not in users_books_read:
            continue
        # The _id is the internal id field: map it to the book_id and then to
        # the sequential index (seq_book_map values are positions in `books`).
        book_id = book_id_map[int(_id)]
        book_index = seq_book_map.get(book_id)
        if book_index is None:
            # Either the book was filtered out for being read too rarely, or it
            # passed the filter but never appeared in the books metadata file.
            # Only the latter is worth reporting.
            if book_id in books_read:
                missing_metadata += 1
            continue
        user_index = user_indices.get(user_id)
        if user_index is None:
            # First kept interaction for this user: open a new list for them.
            user_index = user_indices[user_id] = len(interactions)
            interactions.append([])
        interactions[user_index].append(book_index)
if missing_metadata:
    print(f"Skipped {missing_metadata} interactions for books without metadata")
#
# Persist the preprocessed data: the interactions and the book metadata are
# written together as one JSON document.
#
payload = {"interactions": interactions, "books": books}
with open("goodreads/interactions.json", "w") as out_file:
    json.dump(payload, out_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment