Skip to content

Instantly share code, notes, and snippets.

@chunhualiao
Last active March 13, 2024 05:28
Show Gist options
  • Save chunhualiao/97731d03ca7216b52f089642b98cf31a to your computer and use it in GitHub Desktop.
Save chunhualiao/97731d03ca7216b52f089642b98cf31a to your computer and use it in GitHub Desktop.
GPT-4-generated Python program to convert a Google Keep Takeout folder into Markdown files

First, export your Google Keep notes as a Google Takeout package, then download and unzip it.

Save the following code as convert.py in the directory that contains the extracted Keep folder, then:

  • pip install beautifulsoup4 markdownify
  • python3 convert.py
  • After it finishes, move the images folder into the markdown folder
  • Open the markdown folder in Obsidian
import os
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from shutil import copyfile

# Input and output locations, all given relative to the working directory.
# Images are collected in 'images', a sibling of the Markdown output folder
# (the alternative layout would be 'markdown/images').
takeout_dir = 'Keep'
images_dir = 'images'
markdown_dir = 'markdown'

# Make sure both output folders are present before any note is converted
for output_dir in (images_dir, markdown_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

def process_html_file(html_file_path):
    """Convert one exported Keep HTML note to Markdown and collect its images.

    The first ``<div>`` or ``<body>`` element of the document is converted
    with markdownify.  Every image that is referenced by a local path is
    copied into ``images_dir`` (when the file exists) and its reference in
    the Markdown text is rewritten to ``../<images_dir>/<file name>``.
    Images given as inline ``data:`` URIs or remote URLs are left untouched,
    because they are not files inside the export.

    Args:
        html_file_path: Path of the HTML file to convert.

    Returns:
        The Markdown text, or an empty string when the document has no
        ``<div>``/``<body>`` element.
    """
    # Open files with UTF-8 encoding to support Chinese characters
    with open(html_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    soup = BeautifulSoup(content, 'html.parser')

    # Focus on the main content area, adjust the selector as needed
    main_content = soup.find(lambda tag: tag.name in ['div', 'body'])
    if main_content is None:
        return ""

    markdown_content = md(str(main_content), bullets='-')
    html_dir = os.path.dirname(html_file_path)

    for img_tag in main_content.find_all('img'):
        img_src = img_tag.get('src') or img_tag.get('data-src')
        if not img_src:
            continue

        # Inline (base64) and remote images are not files on disk: trying to
        # copy them always fails, and rewriting the reference would replace
        # a working image with a dangling local path.
        if img_src.lower().startswith(('data:', 'http://', 'https://', '//')):
            continue

        img_name = os.path.basename(img_src)
        img_path = os.path.join(images_dir, img_name)
        source_img_path = os.path.join(html_dir, img_src)
        if os.path.exists(source_img_path):
            copyfile(source_img_path, img_path)
        else:
            print(f"Image not found: {source_img_path}")

        # The link is still rewritten when the file is missing, so every
        # local image reference consistently points into the images folder.
        # Markdown link targets use forward slashes on every platform, so
        # normalise the separators that os.path.join produces on Windows.
        markdown_link = os.path.join('..', img_path).replace(os.sep, '/')
        markdown_content = markdown_content.replace(img_src, markdown_link)

    return markdown_content

# Walk the Keep export and convert every HTML note found in it
for folder, _subfolders, filenames in os.walk(takeout_dir):
    for filename in filenames:
        if not filename.endswith('.html'):
            continue

        # File and folder names may contain Chinese characters
        print(f'Processing {filename}...')
        note_markdown = process_html_file(os.path.join(folder, filename))

        # Notes that convert to nothing but whitespace produce no output file
        if not note_markdown.strip():
            print(f"No visible content found in {filename}, skipping.")
            continue

        # The Markdown file keeps the note's base name and lands directly in
        # markdown_dir, whatever subfolder the HTML file came from
        note_stem = os.path.splitext(filename)[0]
        target_path = os.path.join(markdown_dir, note_stem + '.md')
        with open(target_path, 'w', encoding='utf-8') as out:
            out.write(note_markdown)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment