Last active
November 5, 2023 17:37
-
-
Save finmoorhouse/4fd8ddb50a6b9a7d9690f049992f89c9 to your computer and use it in GitHub Desktop.
Export Goodreads reviews to markdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
# Define the path to your CSV file
csv_file_path = 'goodreads_library_export.csv'
# Define the path to the output file
output_file_path = '/path/to/output.md'
# A review is exported only if it is strictly longer than this many characters
min_review_length = 100

# Read the data from the CSV file
df = pd.read_csv(csv_file_path)

# Keep only rows that have a review longer than the threshold. Missing reviews
# have a NaN length, which compares False, but the explicit notna() keeps the
# intent readable.
has_long_review = df['My Review'].notna() & (df['My Review'].str.len() > min_review_length)
# Take an explicit copy so the column assignments below operate on an
# independent frame instead of a slice of the original; this is what actually
# avoids SettingWithCopyWarning.
df = df[has_long_review].copy()

# Convert "Date Read" and "Date Added" to datetime. Plain column assignment
# replaces the column (and its dtype); values that cannot be parsed become NaT.
df['Date Read'] = pd.to_datetime(df['Date Read'], errors='coerce')
df['Date Added'] = pd.to_datetime(df['Date Added'], errors='coerce')

# Sort key: the date the book was read, falling back to the date it was added
# when no read date is available.
df['Sort Date'] = df['Date Read'].fillna(df['Date Added'])

# Sort the DataFrame by the new 'Sort Date' column, most recent first
df = df.sort_values(by='Sort Date', ascending=False)
# Goodreads link codes look like "[b:Name|123|...]" (book) or
# "[a:Name|123|...]" (author): a type letter, a display name, a numeric id,
# and optionally further "|"-separated fields that are ignored here.
# The name group is tempered with a negative lookahead so it can never swallow
# the start of another "[b:" / "[a:" code; without it, a stray or malformed
# code earlier on the line would be merged with a later valid one.
_GOODREADS_LINK_RE = re.compile(
    r'\[(b|a):((?:(?!\[[ba]:).)+?)\|(\d+)(?:\|.*?)?\]'
)

# Maps the link type letter to the path segment of the Goodreads URL.
_GOODREADS_URL_SECTIONS = {'b': 'book', 'a': 'author'}


def replace_goodreads_links(text):
    """Replace Goodreads book/author link codes in *text* with markdown links.

    Args:
        text: Review text that may contain codes such as
            ``[b:Title|123|...]`` or ``[a:Author|456]``.

    Returns:
        The text with every recognised code replaced by
        ``[name](https://www.goodreads.com/<book|author>/show/<id>)``.
        Text that does not match the code format is returned unchanged.
    """
    def link_replacement(match):
        # Groups are: type letter, display name, numeric id.
        link_type, name, code = (part.strip() for part in match.groups())
        section = _GOODREADS_URL_SECTIONS[link_type]
        return f"[{name}](https://www.goodreads.com/{section}/show/{code})"

    # Replace all occurrences in the text
    return _GOODREADS_LINK_RE.sub(link_replacement, text)
# Text written verbatim at the very top of the output file.
frontmatter = """---
(Your markdown frontmatter here)
---
"""
# Text written verbatim right after the frontmatter, before the first year.
preamble = """
Your preamble here.
"""


def _format_review_entry(row):
    """Return the markdown for one book: H3 title, byline, and review text.

    Args:
        row: A DataFrame row providing 'Title', 'Author', 'Year Published',
            'My Rating', 'Sort Date', 'Book Id' and 'My Review'.

    Returns:
        A string ending in a blank line, ready to be written to the output.
    """
    # Write the book title as an H3 header (years are the H2 level)
    parts = [f"### {row['Title']}\n\n"]

    # Show author; the publication year is omitted when it is missing,
    # because int() on a NaN would raise.
    year_published = row['Year Published']
    if pd.notna(year_published):
        parts.append(f"{row['Author']} ({int(year_published)}) • ")
    else:
        parts.append(f"{row['Author']} • ")

    # Show my score as filled/empty stars out of five. A missing or zero
    # rating is skipped; the int() cast keeps string repetition working even
    # if the column was loaded as floats.
    rating = row['My Rating']
    if pd.notna(rating) and rating:
        filled = int(rating)
        parts.append(f"{'★' * filled + '☆' * (5 - filled)} • ")

    # Add date
    parts.append(f"{row['Sort Date'].strftime('%b %y')} • ")

    # Generate link
    parts.append(f"[Link to book ↗](https://www.goodreads.com/book/show/{row['Book Id']})")
    parts.append("\n\n")

    # Review text: turn "<br/>" markers into newlines, then convert
    # Goodreads link codes into markdown links.
    formatted_content = row['My Review'].replace("<br/>", "\n")
    formatted_content = replace_goodreads_links(formatted_content)
    parts.append(f"{formatted_content}\n\n")

    return "".join(parts)


# Initialize a variable to keep track of the last encountered year
last_year = None

# Open the output markdown file for writing. The output contains non-ASCII
# characters (★, •, ↗), so the encoding is pinned to UTF-8 instead of relying
# on the platform default.
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(frontmatter)
    f.write(preamble)

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        # Extract the year from 'Sort Date'
        current_year = row['Sort Date'].year

        # If the year of the current book is different from the last one
        # processed, write a new year header
        if current_year != last_year:
            if last_year is not None:
                # Add a separation between different years if not the first year
                f.write("\n\n")
            f.write(f"## {current_year}\n\n")
            # Update the last year tracker
            last_year = current_year

        f.write(_format_review_entry(row))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment