# varundey/goodreads-review-collector.py

## goodreads-review-collector.py
import json
import os
import pandas
import requests
from string import Template

# --- Configuration ---------------------------------------------------------
# Goodreads API key interpolated into API_URL below. It is empty in this
# file; presumably a real key has to be filled in before running.
GOODREADS_API_KEY = ""
# Directory whose entries are listed and parsed as e-book file names.
EBOOKS_DIRECTORY = "/Users/varun.dey/Documents/Kindle"
# Output files: the JSON dump and the Excel sheet generated from it.
RATINGS_JSON_FILE = "ratings.json"
RATINGS_EXCEL_FILE = "ratings.xlsx"

# Template for the Goodreads "book by title" URL. The key is baked in when
# the module loads; $format, $author and $title are filled in per request.
API_URL = Template(
    "https://www.goodreads.com/book/title.$format"
    "?author=$author"
    f"&key={GOODREADS_API_KEY}"
    "&title=$title"
)


def parse_author_title(author, title):
    """Normalise the author and title fragments of an e-book file name.

    Args:
        author: Author fragment, e.g. ``"Last, First"`` or ``"First Last"``.
        title: Title fragment, optionally ending in a file extension and
            optionally containing ``" - "``-separated segments.

    Returns:
        An ``(author, title)`` tuple. The author's ``", "``-separated parts
        are reversed and joined with spaces. The title has its trailing
        extension removed; if it then contains ``" - "``, the second segment
        is used, otherwise the whole remainder.
    """
    author = " ".join(reversed(author.split(", ")))
    # Strip only the trailing extension. Cutting at the first "." would
    # reduce a name such as "Mr. Mercedes.mobi" to just "Mr".
    stem = title.rsplit(".", 1)[0]
    segments = stem.split(" - ")
    title = segments[0] if len(segments) == 1 else segments[1]
    return author, title


def parse_ebook(ebook):
    """Split an e-book file name into a normalised author and title.

    Args:
        ebook: File name of the form ``"<author> - <title>"``; only the
            first ``" - "`` is used as the separator.

    Returns:
        The ``(author, title)`` tuple produced by ``parse_author_title``.

    Raises:
        IndexError: If ``ebook`` contains no ``" - "`` separator.
    """
    pieces = ebook.split(" - ", 1)
    # Indexing (rather than unpacking) keeps the IndexError for names that
    # lack the separator.
    raw_author, raw_title = pieces[0], pieces[1]
    return parse_author_title(raw_author, raw_title)


def fetch_details(book_author, book_title):
    """Query Goodreads for a book and scrape its rating and page count.

    Args:
        book_author: Author name sent as the ``author`` query parameter.
        book_title: Book title sent as the ``title`` query parameter.

    Returns:
        A ``(rating, pages)`` tuple of strings taken from the
        ``<average_rating>`` and ``<num_pages>`` elements of the XML
        response. If either element cannot be located, both values are
        ``"Result not found"``.

    Exceptions raised by ``requests.get`` (connection errors, the timeout)
    are not caught here and propagate to the caller.
    """
    # Function-scope import so the block does not depend on a new
    # module-level import.
    from urllib.parse import quote

    # Percent-encode the values so characters such as "&", "#" or "?" in a
    # name cannot break the query string.
    fetch_url = API_URL.substitute(
        format="xml",
        author=quote(book_author, safe=""),
        title=quote(book_title, safe=""),
    )
    # Without a timeout a single stalled connection would hang the whole run.
    response = requests.get(fetch_url, timeout=30)
    print(f"Fetching response from {fetch_url}\n")
    body = response.text
    try:
        # The rating is taken as the four characters after the opening tag.
        book_rating = body.split("<average_rating>", 1)[1][0:4]
        pages_element = body.split("<num_pages>", 1)[1].split("</num_pages>", 1)[0]
        # The page count is wrapped in a CDATA section inside the element.
        book_pages = pages_element.split("CDATA[", 1)[1].split("]]")[0]
    except IndexError:
        # A missing marker makes one of the ``[1]`` lookups fail; catching
        # only IndexError no longer swallows KeyboardInterrupt and friends.
        book_rating, book_pages = "Result not found", "Result not found"
    return book_rating, book_pages


def create_json(book_author, book_title, book_rating, book_pages, filename, list):
    """Append one book record to ``list``.

    Args:
        book_author: Stored under ``"AUTHOR"``.
        book_title: Stored under ``"TITLE"``.
        book_rating: Stored under ``"RATING"``.
        book_pages: Stored under ``"PAGES"``.
        filename: Name of the e-book file, stored under ``"FILE"``.
        list: List that receives the record; it is mutated in place. The
            name shadows the builtin but is kept so keyword callers still
            work.

    Returns:
        None (the result of ``list.append``).
    """
    # The link and file fields used to be swapped: the Template object
    # itself was stored as the link (not JSON-serialisable) and
    # ``.substitute`` was called on the file-name string, which raised
    # AttributeError on every call. The link is now built from this
    # function's own arguments rather than module-level globals.
    # NOTE(review): the link embeds whatever key API_URL was built with.
    goodreads_link = API_URL.substitute(
        format="json", author=book_author, title=book_title
    )
    record = {
        "TITLE": book_title,
        "AUTHOR": book_author,
        "RATING": book_rating,
        "PAGES": book_pages,
        "GOODREADS_LINK": goodreads_link,
        "FILE": filename,
    }
    return list.append(record)


def write_to_file_and_convert_excel(list=None):
    """Dump the records to the JSON file, then convert that file to Excel.

    Args:
        list: JSON-serialisable records to write. Omitting it (or passing
            ``None``) writes an empty list. The name shadows the builtin
            but is kept so keyword callers still work.
    """
    # ``None`` replaces the former mutable ``[]`` default.
    records = [] if list is None else list
    # The context manager closes the file even if json.dump raises, and
    # guarantees it is flushed before pandas reads it back.
    with open(RATINGS_JSON_FILE, "w") as rating_file_obj:
        json.dump(records, rating_file_obj)
    pandas.read_json(RATINGS_JSON_FILE).to_excel(RATINGS_EXCEL_FILE)


if __name__ == "__main__":
    # Script entry point: look up every entry of EBOOKS_DIRECTORY on
    # Goodreads and export the collected records.
    ebooks_list = os.listdir(EBOOKS_DIRECTORY)
    parsed_ebooks_list = []
    for ebook_string in ebooks_list:
        # Entries starting with "." are skipped.
        if ebook_string.startswith('.'):
            continue
        try:
            print(f"============================================================================\nParsing {ebook_string}\n")
            author, title = parse_ebook(ebook_string)
            rating, pages = fetch_details(author, title)
            create_json(author, title, rating, pages, ebook_string, parsed_ebooks_list)
            # The percentage is relative to all directory entries, skipped
            # ones included.
            print(f"Parsed {ebook_string}. {(len(parsed_ebooks_list)/len(ebooks_list))*100}% done!\n")
        except Exception as exception:
            print(f"Caught exception - {exception}")
            # Persist what has been collected so far, then carry on with
            # the next entry.
            write_to_file_and_convert_excel(parsed_ebooks_list)

    write_to_file_and_convert_excel(parsed_ebooks_list)
	import json
	import os
	import pandas
	import requests
	from string import Template

	# --- Configuration ---------------------------------------------------------
	# Goodreads API key interpolated into API_URL below. It is empty in this
	# file; presumably a real key has to be filled in before running.
	GOODREADS_API_KEY = ""
	# Directory whose entries are listed and parsed as e-book file names.
	EBOOKS_DIRECTORY = "/Users/varun.dey/Documents/Kindle"
	# Output files: the JSON dump and the Excel sheet generated from it.
	RATINGS_JSON_FILE = "ratings.json"
	RATINGS_EXCEL_FILE = "ratings.xlsx"

	# Template for the Goodreads "book by title" URL. The key is baked in when
	# the module loads; $format, $author and $title are filled in per request.
	API_URL = Template(
		"https://www.goodreads.com/book/title.$format"
		"?author=$author"
		f"&key={GOODREADS_API_KEY}"
		"&title=$title"
	)


	def parse_author_title(author, title):
		"""Normalise the author and title fragments of an e-book file name.

		Args:
			author: Author fragment, e.g. ``"Last, First"`` or ``"First Last"``.
			title: Title fragment, optionally ending in a file extension and
				optionally containing ``" - "``-separated segments.

		Returns:
			An ``(author, title)`` tuple. The author's ``", "``-separated parts
			are reversed and joined with spaces. The title has its trailing
			extension removed; if it then contains ``" - "``, the second
			segment is used, otherwise the whole remainder.
		"""
		author = " ".join(reversed(author.split(", ")))
		# Strip only the trailing extension. Cutting at the first "." would
		# reduce a name such as "Mr. Mercedes.mobi" to just "Mr".
		stem = title.rsplit(".", 1)[0]
		segments = stem.split(" - ")
		title = segments[0] if len(segments) == 1 else segments[1]
		return author, title


	def parse_ebook(ebook):
		"""Split an e-book file name into a normalised author and title.

		Args:
			ebook: File name of the form ``"<author> - <title>"``; only the
				first ``" - "`` is used as the separator.

		Returns:
			The ``(author, title)`` tuple produced by ``parse_author_title``.

		Raises:
			IndexError: If ``ebook`` contains no ``" - "`` separator.
		"""
		# The body is indented under the ``def`` so the function parses.
		ebook_split_string = ebook.split(" - ", 1)
		author = ebook_split_string[0]
		title = ebook_split_string[1]
		parsed_author, parsed_title = parse_author_title(author, title)
		return parsed_author, parsed_title


	def fetch_details(book_author, book_title):
		"""Query Goodreads for a book and scrape its rating and page count.

		Args:
			book_author: Author name sent as the ``author`` query parameter.
			book_title: Book title sent as the ``title`` query parameter.

		Returns:
			A ``(rating, pages)`` tuple of strings taken from the
			``<average_rating>`` and ``<num_pages>`` elements of the XML
			response. If either element cannot be located, both values are
			``"Result not found"``.

		Exceptions raised by ``requests.get`` (connection errors, the timeout)
		are not caught here and propagate to the caller.
		"""
		# Function-scope import so the block does not depend on a new
		# module-level import.
		from urllib.parse import quote

		# Percent-encode the values so characters such as "&", "#" or "?" in
		# a name cannot break the query string.
		fetch_url = API_URL.substitute(
			format="xml",
			author=quote(book_author, safe=""),
			title=quote(book_title, safe=""),
		)
		# Without a timeout a single stalled connection would hang the run.
		response = requests.get(fetch_url, timeout=30)
		print(f"Fetching response from {fetch_url}\n")
		body = response.text
		try:
			# The rating is taken as the four characters after the opening tag.
			book_rating = body.split("<average_rating>", 1)[1][0:4]
			pages_element = body.split("<num_pages>", 1)[1].split("</num_pages>", 1)[0]
			# The page count is wrapped in a CDATA section inside the element.
			book_pages = pages_element.split("CDATA[", 1)[1].split("]]")[0]
		except IndexError:
			# A missing marker makes one of the ``[1]`` lookups fail; catching
			# only IndexError no longer swallows KeyboardInterrupt and friends.
			book_rating, book_pages = "Result not found", "Result not found"
		return book_rating, book_pages


	def create_json(book_author, book_title, book_rating, book_pages, filename, list):
		"""Append one book record to ``list``.

		Args:
			book_author: Stored under ``"AUTHOR"``.
			book_title: Stored under ``"TITLE"``.
			book_rating: Stored under ``"RATING"``.
			book_pages: Stored under ``"PAGES"``.
			filename: Name of the e-book file, stored under ``"FILE"``.
			list: List that receives the record; it is mutated in place. The
				name shadows the builtin but is kept so keyword callers still
				work.

		Returns:
			None (the result of ``list.append``).
		"""
		# The link and file fields used to be swapped: the Template object
		# itself was stored as the link (not JSON-serialisable) and
		# ``.substitute`` was called on the file-name string, which raised
		# AttributeError on every call. The link is now built from this
		# function's own arguments rather than module-level globals.
		# NOTE(review): the link embeds whatever key API_URL was built with.
		goodreads_link = API_URL.substitute(
			format="json", author=book_author, title=book_title
		)
		record = {
			"TITLE": book_title,
			"AUTHOR": book_author,
			"RATING": book_rating,
			"PAGES": book_pages,
			"GOODREADS_LINK": goodreads_link,
			"FILE": filename,
		}
		return list.append(record)


	def write_to_file_and_convert_excel(list=None):
		"""Dump the records to the JSON file, then convert that file to Excel.

		Args:
			list: JSON-serialisable records to write. Omitting it (or passing
				``None``) writes an empty list. The name shadows the builtin
				but is kept so keyword callers still work.
		"""
		# ``None`` replaces the former mutable ``[]`` default.
		records = [] if list is None else list
		# The context manager closes the file even if json.dump raises, and
		# guarantees it is flushed before pandas reads it back.
		with open(RATINGS_JSON_FILE, "w") as rating_file_obj:
			json.dump(records, rating_file_obj)
		pandas.read_json(RATINGS_JSON_FILE).to_excel(RATINGS_EXCEL_FILE)


	if __name__ == "__main__":
		# Script entry point: look up every entry of EBOOKS_DIRECTORY on
		# Goodreads and export the collected records. The if/for/try/except
		# nesting is restored so the block parses.
		ebooks_list = os.listdir(EBOOKS_DIRECTORY)
		parsed_ebooks_list = []
		for ebook_string in ebooks_list:
			# Entries starting with "." are skipped.
			if ebook_string.startswith('.'):
				continue
			try:
				print(f"============================================================================\nParsing {ebook_string}\n")
				author, title = parse_ebook(ebook_string)
				rating, pages = fetch_details(author, title)
				create_json(author, title, rating, pages, ebook_string, parsed_ebooks_list)
				# The percentage is relative to all directory entries,
				# skipped ones included.
				print(f"Parsed {ebook_string}. {(len(parsed_ebooks_list)/len(ebooks_list))*100}% done!\n")
			except Exception as exception:
				print(f"Caught exception - {exception}")
				# Persist what has been collected so far, then carry on
				# with the next entry.
				write_to_file_and_convert_excel(parsed_ebooks_list)

		write_to_file_and_convert_excel(parsed_ebooks_list)