# rwalk/goodreads_data.py

## goodreads_data.py
import json
import csv
from collections import defaultdict

# Load the mapping (supplied by the dataset) from the internal id used in the
# interaction rows (`_id`) to the Goodreads `book_id`.
# newline="" is how the csv module documents opening files for csv.reader, and
# the explicit encoding keeps parsing independent of the platform locale.
with open("goodreads/book_id_map.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerates an empty file
    book_id_map = {int(_id): int(book_id) for _id, book_id in reader}

#
# FIRST PASS: read through all the interactions and count, for each user, how
# many books they have read and, for each book, how many times it was read.
#
users_books_read = defaultdict(int)  # user_id -> number of rows marked read
books_read = defaultdict(int)  # book_id -> number of rows marked read

# newline="" is how the csv module documents opening files for csv.reader.
with open("goodreads/goodreads_interactions.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader, None)  # skip the header row
    for row in reader:
        # Each row must hold exactly five integer columns; rating and
        # is_reviewed are parsed but not needed for the counts.
        user_id, _id, is_read, rating, is_reviewed = [int(col) for col in row]
        if is_read:
            users_books_read[user_id] += 1
            # Rows carry the internal id; count under the mapped book_id.
            books_read[book_id_map[_id]] += 1

print(f"Users: {len(users_books_read)}, books: {len(books_read)}")
# expect: "Users: 876145, books: 2360650"

# Keep only the books that were read at least 3 times.
books_read = {
    book: times_read
    for book, times_read in books_read.items()
    if times_read >= 3
}

# Keep only the users who have read at least 2 books.
users_books_read = {
    user: books_count
    for user, books_count in users_books_read.items()
    if books_count >= 2
}

# Book metadata: one entry per retained book, in the order the books are first
# seen in goodreads_books.json.
books = []
seq_id = 0  # next sequential index to hand out
seq_book_map = {}  # Goodreads book_id -> sequential index into `books`


# Load author info: author_id -> author name.
# Both files are read as one JSON document per line. The encoding is stated
# explicitly so non-ASCII names and titles do not depend on the locale default.
authors = {}
with open("goodreads/goodreads_book_authors.json", encoding="utf-8") as f:
    for line in f:
        author = json.loads(line)
        author_id = author["author_id"]
        authors[author_id] = author["name"]

with open("goodreads/goodreads_books.json", encoding="utf-8") as f:
    for line in f:
        book = json.loads(line)
        book_id = int(book["book_id"])
        # Skip books that are not in books_read and ids already recorded.
        if book_id not in books_read or book_id in seq_book_map:
            continue
        books.append({
            "id": book_id,  # original Goodreads book id
            "index": seq_id,  # sequential id in our analysis
            "title": book["title"],
            # A KeyError here means the book credits an author_id that was
            # not present in goodreads_book_authors.json.
            "authors": [authors[credit["author_id"]] for credit in book["authors"]],
        })
        seq_book_map[book_id] = seq_id
        seq_id += 1

#
# SECOND PASS: build a list of lists. Each inner list corresponds to one user
# and holds the sequential indices of the books that user has read.
#
user_ids = set(users_books_read)
book_ids = set(books_read)

# newline="" is how the csv module documents opening files for csv.reader.
with open("goodreads/goodreads_interactions.csv", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    header = next(reader, None)  # skip the header row
    user_indices, user_count = {}, 0  # user_id -> position in `interactions`
    interactions = []

    for row in reader:
        user_id, _id, is_read, rating, is_reviewed = [int(col) for col in row]
        if not is_read or user_id not in user_ids:
            continue

        # _id is the internal id field: map it to the book_id, then to the
        # sequential index. seq_book_map stores the same value that
        # books[...]["index"] holds, so no extra lookup through `books`.
        goodreads_id = book_id_map[_id]
        if goodreads_id not in book_ids:
            continue
        book_index = seq_book_map.get(goodreads_id)
        if book_index is None:
            # Retained by the read-count filter but absent from
            # goodreads_books.json: there is no entry in `books` to point
            # at, so skip the row instead of failing with a KeyError.
            continue

        if user_id not in user_indices:
            user_indices[user_id] = user_count
            user_count += 1
            interactions.append([])
        interactions[user_indices[user_id]].append(book_index)

#
# Write the preprocessed data to disk as a single JSON document.
#
with open("goodreads/interactions.json", "w") as f:
    payload = {"interactions": interactions, "books": books}
    json.dump(payload, f)
	import json
	import csv
	from collections import defaultdict

	# Load the mapping (supplied by the dataset) from the internal id used in the
	# interaction rows (`_id`) to the Goodreads `book_id`.
	# newline="" is how the csv module documents opening files for csv.reader, and
	# the explicit encoding keeps parsing independent of the platform locale.
	with open("goodreads/book_id_map.csv", newline="", encoding="utf-8") as f:
	    reader = csv.reader(f)
	    next(reader, None)  # skip the header row; tolerates an empty file
	    book_id_map = {int(_id): int(book_id) for _id, book_id in reader}

	#
	# FIRST PASS: read through all the interactions and count, for each user, how
	# many books they have read and, for each book, how many times it was read.
	#
	users_books_read = defaultdict(int)  # user_id -> number of rows marked read
	books_read = defaultdict(int)  # book_id -> number of rows marked read

	# newline="" is how the csv module documents opening files for csv.reader.
	with open("goodreads/goodreads_interactions.csv", newline="", encoding="utf-8") as f:
	    reader = csv.reader(f)
	    header = next(reader, None)  # skip the header row
	    for row in reader:
	        # Each row must hold exactly five integer columns; rating and
	        # is_reviewed are parsed but not needed for the counts.
	        user_id, _id, is_read, rating, is_reviewed = [int(col) for col in row]
	        if is_read:
	            users_books_read[user_id] += 1
	            # Rows carry the internal id; count under the mapped book_id.
	            books_read[book_id_map[_id]] += 1

	print(f"Users: {len(users_books_read)}, books: {len(books_read)}")
	# expect: "Users: 876145, books: 2360650"

	# Keep only the books that were read at least 3 times.
	books_read = {
	    book: times_read
	    for book, times_read in books_read.items()
	    if times_read >= 3
	}

	# Keep only the users who have read at least 2 books.
	users_books_read = {
	    user: books_count
	    for user, books_count in users_books_read.items()
	    if books_count >= 2
	}

	# Book metadata: one entry per retained book, in the order the books are first
	# seen in goodreads_books.json.
	books = []
	seq_id = 0  # next sequential index to hand out
	seq_book_map = {}  # Goodreads book_id -> sequential index into `books`


	# Load author info: author_id -> author name.
	# Both files are read as one JSON document per line. The encoding is stated
	# explicitly so non-ASCII names and titles do not depend on the locale default.
	authors = {}
	with open("goodreads/goodreads_book_authors.json", encoding="utf-8") as f:
	    for line in f:
	        author = json.loads(line)
	        author_id = author["author_id"]
	        authors[author_id] = author["name"]

	with open("goodreads/goodreads_books.json", encoding="utf-8") as f:
	    for line in f:
	        book = json.loads(line)
	        book_id = int(book["book_id"])
	        # Skip books that are not in books_read and ids already recorded.
	        if book_id not in books_read or book_id in seq_book_map:
	            continue
	        books.append({
	            "id": book_id,  # original Goodreads book id
	            "index": seq_id,  # sequential id in our analysis
	            "title": book["title"],
	            # A KeyError here means the book credits an author_id that was
	            # not present in goodreads_book_authors.json.
	            "authors": [authors[credit["author_id"]] for credit in book["authors"]],
	        })
	        seq_book_map[book_id] = seq_id
	        seq_id += 1

	#
	# SECOND PASS: build a list of lists. Each inner list corresponds to one user
	# and holds the sequential indices of the books that user has read.
	#
	user_ids = set(users_books_read)
	book_ids = set(books_read)

	# newline="" is how the csv module documents opening files for csv.reader.
	with open("goodreads/goodreads_interactions.csv", newline="", encoding="utf-8") as f:
	    reader = csv.reader(f)
	    header = next(reader, None)  # skip the header row
	    user_indices, user_count = {}, 0  # user_id -> position in `interactions`
	    interactions = []

	    for row in reader:
	        user_id, _id, is_read, rating, is_reviewed = [int(col) for col in row]
	        if not is_read or user_id not in user_ids:
	            continue

	        # _id is the internal id field: map it to the book_id, then to the
	        # sequential index. seq_book_map stores the same value that
	        # books[...]["index"] holds, so no extra lookup through `books`.
	        goodreads_id = book_id_map[_id]
	        if goodreads_id not in book_ids:
	            continue
	        book_index = seq_book_map.get(goodreads_id)
	        if book_index is None:
	            # Retained by the read-count filter but absent from
	            # goodreads_books.json: there is no entry in `books` to point
	            # at, so skip the row instead of failing with a KeyError.
	            continue

	        if user_id not in user_indices:
	            user_indices[user_id] = user_count
	            user_count += 1
	            interactions.append([])
	        interactions[user_indices[user_id]].append(book_index)

	#
	# Write the preprocessed data to disk as a single JSON document.
	#
	with open("goodreads/interactions.json", "w") as f:
	    json.dump({
	        "interactions": interactions,
	        "books": books,
	    }, f)