Created
June 2, 2020 04:59
-
-
Save rwalk/b562c3d66628b516e9b184bf4f0f9df0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import csv | |
from collections import defaultdict | |
# Load the mapping (supplied by the dataset) from the internal id used in the
# interactions file to the Goodreads book_id.
# newline="" lets the csv module do its own line-ending handling, and the
# encoding is pinned so the read does not depend on the platform default.
with open("goodreads/book_id_map.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    _ = next(reader)  # header row, discarded
    book_id_map = {int(_id): int(book_id) for _id, book_id in reader}
#
# FIRST PASS: read through all the interactions and count the number of books
# read by each user and the number of times each book has been read.
#
users_books_read = defaultdict(int)  # user_id -> number of books marked as read
books_read = defaultdict(int)  # Goodreads book_id -> number of times read
with open("goodreads/goodreads_interactions.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader)
    for row in reader:
        # Only the first three columns (user id, internal book id, is_read
        # flag) are used in this pass, so the remaining ones are not parsed.
        user_id, _id, is_read = (int(col) for col in row[:3])
        if is_read:
            users_books_read[user_id] += 1
            # _id is the internal id; count under the Goodreads book_id.
            books_read[book_id_map[_id]] += 1
print(f"Users: {len(users_books_read)}, books: {len(books_read)}")
# expect: "Users: 876145, books: 2360650"
# Keep only the books that have been read at least 3 times.
books_read = {
    book: times_read
    for book, times_read in books_read.items()
    if times_read >= 3
}
# Keep only the users who have read at least 2 books.
users_books_read = {
    user: n_books
    for user, n_books in users_books_read.items()
    if n_books >= 2
}
# Create book metadata.
books = []  # one metadata dict per kept book; list position equals its "index"
seq_id = 0  # next sequential index to hand out
seq_book_map = {}  # Goodreads book_id -> sequential index

# Load author info (author_id -> name). The file holds one JSON object per
# line. It is decoded explicitly as UTF-8 so that non-ASCII names and titles
# do not depend on the platform's default encoding.
authors = {}
with open("goodreads/goodreads_book_authors.json", encoding="utf-8") as f:
    for line in f:
        author = json.loads(line)
        authors[author["author_id"]] = author["name"]

with open("goodreads/goodreads_books.json", encoding="utf-8") as f:
    for line in f:
        book = json.loads(line)
        book_id = int(book["book_id"])
        # Skip books that are not in books_read, and repeated records for a
        # book_id that has already been assigned an index.
        if book_id not in books_read or book_id in seq_book_map:
            continue
        books.append({
            "id": book_id,  # original Goodreads book_id
            "index": seq_id,  # sequential id in our analysis
            "title": book["title"],
            "authors": [authors[entry["author_id"]] for entry in book["authors"]],
        })
        seq_book_map[book_id] = seq_id
        seq_id += 1
#
# SECOND PASS: create a list of lists. Each inner list corresponds to one user
# and its elements are the sequential indices of the books that user has read.
#
user_ids = set(users_books_read)
book_ids = set(books_read)
with open("goodreads/goodreads_interactions.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader)
    user_indices, user_count = {}, 0  # user_id -> position in `interactions`
    interactions = []
    for row in reader:
        # Only the first three columns are needed; the rest are not parsed.
        user_id, _id, is_read = (int(col) for col in row[:3])
        if not is_read or user_id not in user_ids:
            continue
        # _id is the internal id field; map it to the Goodreads book_id once.
        mapped_book_id = book_id_map[_id]
        if mapped_book_id not in book_ids:
            continue
        # seq_book_map already stores the sequential index that was written
        # into the book's "index" field, so no detour through `books` is
        # needed. A book with no metadata record has no index and is skipped
        # instead of raising KeyError.
        book_index = seq_book_map.get(mapped_book_id)
        if book_index is None:
            continue
        if user_id not in user_indices:
            user_indices[user_id] = user_count
            user_count += 1
            interactions.append([])
        interactions[user_indices[user_id]].append(book_index)
#
# Persist the preprocessed data: the per-user lists of book indices together
# with the book metadata they refer to.
#
with open("goodreads/interactions.json", "w") as out_file:
    payload = {"interactions": interactions, "books": books}
    json.dump(payload, out_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment