finmoorhouse/goodreads-export-to-md.py

## goodreads-export-to-md.py
import pandas as pd
import re

# Path to the CSV file exported from Goodreads
csv_file_path = 'goodreads_library_export.csv'

# Path of the markdown file to generate
output_file_path = '/path/to/output.md'

# Load the exported library into a DataFrame
df = pd.read_csv(csv_file_path)

min_review_length = 100  # A review must be longer than this many characters to be included

# Keep only rows whose 'My Review' is present and longer than min_review_length.
# The explicit .copy() makes the result an independent frame, so the column
# assignments below cannot raise SettingWithCopyWarning.
has_long_review = df['My Review'].notna() & (df['My Review'].str.len() > min_review_length)
df = df[has_long_review].copy()

# Parse both date columns; values that cannot be parsed become NaT.
# Plain column assignment replaces the column, so it takes the datetime dtype.
df['Date Read'] = pd.to_datetime(df['Date Read'], errors='coerce')
df['Date Added'] = pd.to_datetime(df['Date Added'], errors='coerce')

# Sort key: the date the book was read, falling back to the date it was added.
# A vectorized fillna is used instead of a row-wise apply, which is slower and
# does not produce a Series when no rows survive the filter.
df['Sort Date'] = df['Date Read'].fillna(df['Date Added'])

# Newest entries first
df = df.sort_values(by='Sort Date', ascending=False)

# Function to replace Goodreads 'book link' codes with markdown links
def replace_goodreads_links(text):
    """Return *text* with Goodreads link codes rewritten as markdown links.

    A link code has the form ``[b:Name|id]`` (book) or ``[a:Name|id]``
    (author), optionally followed by further ``|``-separated fields before
    the closing bracket. Each code becomes ``[Name](url)``, where the URL
    points at the Goodreads book or author page for ``id``. Text that does
    not match the pattern is left untouched.
    """
    def to_markdown(found):
        # The three capture groups are: link type, display name, numeric id
        kind, name, code = (group.strip() for group in found.groups())
        if kind == 'a':  # Author link
            return f"[{name}](https://www.goodreads.com/author/show/{code})"
        if kind == 'b':  # Book link
            return f"[{name}](https://www.goodreads.com/book/show/{code})"

    goodreads_link = re.compile(r'\[(b|a):(.+?)\|(\d+)(?:\|.*?)?\]')
    return goodreads_link.sub(to_markdown, text)

# Markdown frontmatter, written verbatim at the very top of the output file
frontmatter = """---
(Your markdown frontmatter here)
---
"""
# Introductory text, written immediately after the frontmatter
preamble = """
Your preamble here.
"""

# Year of the most recently written entry (None until the first entry)
last_year = None

# Open the output markdown file for writing. The encoding is explicit because
# the text written below contains non-ASCII characters ('•', '↗') that not
# every platform-default encoding can represent.
with open(output_file_path, 'w', encoding='utf-8') as f:
    f.write(frontmatter)
    f.write(preamble)
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # Extract the year from 'Sort Date'
        current_year = row['Sort Date'].year

        # Start a new H2 year section whenever the year changes
        if current_year != last_year:
            if last_year is not None:  # Extra spacing before every section except the first
                f.write("\n\n")
            f.write(f"## {current_year}\n\n")
            last_year = current_year

        # Book title as an H3 header, nested under the year section
        f.write(f"### {row['Title']}\n\n")

        # Author, followed by the publication year when one is recorded.
        # A missing year is read as NaN, which int() cannot convert.
        year_published = row['Year Published']
        if pd.notna(year_published):
            f.write(f"{row['Author']} ({int(year_published)}) • ")
        else:
            f.write(f"{row['Author']} • ")

        # Star rating out of five; a rating of 0 or a missing rating is skipped
        rating = row['My Rating']
        if pd.notna(rating) and rating:
            filled = int(rating)  # repeating a str by a float raises TypeError
            f.write(f"{'&#9733;' * filled + '&#9734;' * (5 - filled)} • ")

        # Abbreviated month and two-digit year of the sort date
        f.write(f"{row['Sort Date'].strftime('%b %y')} • ")
        # Link to the book's Goodreads page
        f.write(f"[Link to book ↗](https://www.goodreads.com/book/show/{row['Book Id']})")
        f.write("\n\n")

        # Review text: "<br/>" becomes a newline, Goodreads link codes become markdown links
        formatted_content = row['My Review'].replace("<br/>", "\n")
        formatted_content = replace_goodreads_links(formatted_content)
        f.write(f"{formatted_content}\n\n")
	import pandas as pd
	import re

	# Path to the CSV file exported from Goodreads
	csv_file_path = 'goodreads_library_export.csv'

	# Path of the markdown file to generate
	output_file_path = '/path/to/output.md'

	# Load the exported library into a DataFrame
	df = pd.read_csv(csv_file_path)

	min_review_length = 100  # A review must be longer than this many characters to be included

	# Keep only rows whose 'My Review' is present and longer than min_review_length.
	# The explicit .copy() makes the result an independent frame, so the column
	# assignments below cannot raise SettingWithCopyWarning.
	has_long_review = df['My Review'].notna() & (df['My Review'].str.len() > min_review_length)
	df = df[has_long_review].copy()

	# Parse both date columns; values that cannot be parsed become NaT.
	# Plain column assignment replaces the column, so it takes the datetime dtype.
	df['Date Read'] = pd.to_datetime(df['Date Read'], errors='coerce')
	df['Date Added'] = pd.to_datetime(df['Date Added'], errors='coerce')

	# Sort key: the date the book was read, falling back to the date it was added.
	# A vectorized fillna is used instead of a row-wise apply, which is slower and
	# does not produce a Series when no rows survive the filter.
	df['Sort Date'] = df['Date Read'].fillna(df['Date Added'])

	# Newest entries first
	df = df.sort_values(by='Sort Date', ascending=False)

	# Function to replace Goodreads 'book link' codes with markdown links
	def replace_goodreads_links(text):
	    """Return *text* with Goodreads link codes rewritten as markdown links.

	    ``[b:Name|id...]`` becomes a link to the Goodreads book page for ``id``
	    and ``[a:Name|id...]`` a link to the author page; any further
	    ``|``-separated fields before the closing bracket are discarded.
	    Text that does not match the pattern is returned unchanged.
	    """
	    # The '|' inside the first group is an alternation (b or a); the other
	    # '\|' are literal pipe separators between the fields of a link code.
	    pattern = r'\[(b|a):(.+?)\|(\d+)(?:\|.*?)?\]'

	    def link_replacement(match):
	        # Capture groups: link type, display name, numeric Goodreads id
	        link_type = match.group(1).strip()
	        name = match.group(2).strip()
	        code = match.group(3).strip()
	        # Pick the URL form that corresponds to the link type
	        if link_type == 'b':  # Book link
	            return f"[{name}](https://www.goodreads.com/book/show/{code})"
	        elif link_type == 'a':  # Author link
	            return f"[{name}](https://www.goodreads.com/author/show/{code})"

	    # Replace all occurrences in the text
	    return re.sub(pattern, link_replacement, text)

	# Markdown frontmatter, written verbatim at the very top of the output file
	frontmatter = """---
	(Your markdown frontmatter here)
	---
	"""
	# Introductory text, written immediately after the frontmatter
	preamble = """
	Your preamble here.
	"""

	# Year of the most recently written entry (None until the first entry)
	last_year = None

	# Open the output markdown file for writing. The encoding is explicit because
	# the text written below contains non-ASCII characters ('•', '★', '↗') that
	# not every platform-default encoding can represent.
	with open(output_file_path, 'w', encoding='utf-8') as f:
	    f.write(frontmatter)
	    f.write(preamble)
	    # Iterate over each row in the DataFrame
	    for index, row in df.iterrows():
	        # Extract the year from 'Sort Date'
	        current_year = row['Sort Date'].year

	        # Start a new H2 year section whenever the year changes
	        if current_year != last_year:
	            if last_year is not None:  # Extra spacing before every section except the first
	                f.write("\n\n")
	            f.write(f"## {current_year}\n\n")
	            last_year = current_year

	        # Book title as an H3 header, nested under the year section
	        f.write(f"### {row['Title']}\n\n")

	        # Author, followed by the publication year when one is recorded.
	        # A missing year is read as NaN, which int() cannot convert.
	        year_published = row['Year Published']
	        if pd.notna(year_published):
	            f.write(f"{row['Author']} ({int(year_published)}) • ")
	        else:
	            f.write(f"{row['Author']} • ")

	        # Star rating out of five; a rating of 0 or a missing rating is skipped
	        rating = row['My Rating']
	        if pd.notna(rating) and rating:
	            filled = int(rating)  # repeating a str by a float raises TypeError
	            f.write(f"{'★' * filled + '☆' * (5 - filled)} • ")

	        # Abbreviated month and two-digit year of the sort date
	        f.write(f"{row['Sort Date'].strftime('%b %y')} • ")
	        # Link to the book's Goodreads page
	        f.write(f"[Link to book ↗](https://www.goodreads.com/book/show/{row['Book Id']})")
	        f.write("\n\n")

	        # Review text: "<br/>" becomes a newline, Goodreads link codes become markdown links
	        formatted_content = row['My Review'].replace("<br/>", "\n")
	        formatted_content = replace_goodreads_links(formatted_content)
	        f.write(f"{formatted_content}\n\n")