Skip to content

Instantly share code, notes, and snippets.

@steveseguin
Created December 21, 2023 18:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save steveseguin/7455015b3dab5816e907c2d7cf16905e to your computer and use it in GitHub Desktop.
Blogger to Markdown (github)
# pip install xmltodict markdownify
import xmltodict
import markdownify
import os
import requests
from urllib.parse import urlparse
import re
import time
def format_post_metadata(entry):
    """Return a Markdown header block (title, author, date) for a post entry.

    ``entry`` is a dict produced by xmltodict from Blogger's Atom export.
    The title node is usually ``{'@type': ..., '#text': ...}``, but xmltodict
    emits a plain string when the element has no attributes and ``None`` when
    it is empty — all three shapes are handled here instead of crashing.
    """
    title_node = entry.get('title', 'Untitled')
    if isinstance(title_node, dict):
        title = title_node.get('#text', 'Untitled')
    else:
        title = title_node or 'Untitled'
    author = entry.get('author', {}).get('name', 'Unknown')
    published_date = entry.get('published', 'Date unknown')
    return f"# {title}\n\n*Author: {author}*\n*Published on: {published_date}*\n\n---\n\n"
def process_comment(entry):
    """Format one Blogger comment entry as a Markdown snippet.

    Pulls the commenter name and publication date from the xmltodict entry
    dict and converts the HTML comment body to Markdown (ATX headings).
    Missing fields fall back to placeholder text; a missing body yields an
    empty comment.
    """
    who = entry.get('author', {}).get('name', 'Unknown')
    when = entry.get('published', 'Date unknown')
    raw_html = entry.get('content', {}).get('#text', '')
    body = ''
    if raw_html:
        body = markdownify.markdownify(raw_html, heading_style="ATX")
    return f"**{who}** - *{when}*\n\n{body}\n\n"
def download_image(image_url, folder_path):
    """Download ``image_url`` into ``folder_path``; return the saved filename.

    Returns ``None`` on any failure (non-200 status, timeout, network or
    filesystem error).  The filename is derived from the URL path, truncated
    to a safe length, and falls back to ``'image'`` when the URL path has no
    basename (e.g. it ends in '/') — without that fallback the write would
    target the folder itself.
    """
    try:
        response = requests.get(image_url, timeout=10)  # 10 seconds timeout
        if response.status_code == 200:
            parsed_url = urlparse(image_url)
            image_name = os.path.basename(parsed_url.path)
            if not image_name:
                image_name = 'image'
            # Truncate the image name if it's too long, preserving the extension.
            max_length = 50  # max filename length, adjust as needed
            if len(image_name) > max_length:
                name, ext = os.path.splitext(image_name)
                image_name = name[:max_length - len(ext)] + ext
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as file:
                file.write(response.content)
            return image_name
    except Exception as e:
        # Best-effort: one failed image must not abort the whole export.
        print(f"Error downloading {image_url}: {e}")
    return None
def update_image_links(content, folder_path, downloader=None):
    """Download each remote image referenced in ``content`` and rewrite its
    URL to the locally saved filename.

    Parameters:
        content: raw HTML of a post.
        folder_path: directory the images are saved into.
        downloader: optional ``(url, folder_path) -> filename-or-None``
            callable; defaults to :func:`download_image`.  Injectable so the
            rewrite logic can be tested without network access.

    Returns the rewritten HTML.  URLs that fail to download are left as-is.
    """
    if downloader is None:
        downloader = download_image
    # [^"]+ keeps the match inside the quoted src attribute (\S+ could run
    # past the closing quote); dict.fromkeys de-duplicates while preserving
    # order so each image is downloaded exactly once.
    image_urls = dict.fromkeys(re.findall(r'src="(https?://[^"]+)"', content))
    for url in image_urls:
        local_name = downloader(url, folder_path)
        if local_name:
            content = content.replace(url, local_name)
    return content
def is_post(entry):
    """Return True if the Atom entry is a blog post (vs. comment/settings).

    xmltodict emits a single <category> as a dict and multiple as a list;
    both are normalised to a list.  A missing 'category' key means the
    entry is not a post (the original raised KeyError here).
    """
    kind = 'http://schemas.google.com/blogger/2008/kind#post'
    categories = entry.get('category', [])
    if isinstance(categories, dict):
        categories = [categories]
    return any(cat.get('@term') == kind for cat in categories)
def is_comment(entry):
    """Return True if the Atom entry is a comment.

    Mirrors :func:`is_post`: single <category> dicts are normalised to a
    list, and a missing 'category' key means "not a comment" rather than
    raising KeyError.
    """
    kind = 'http://schemas.google.com/blogger/2008/kind#comment'
    categories = entry.get('category', [])
    if isinstance(categories, dict):
        categories = [categories]
    return any(cat.get('@term') == kind for cat in categories)
def generate_index_file(entries, output_dir):
    """Write ``output_dir/index.md`` with a link to every post's post.md.

    The link target folder is derived from the title with the same
    sanitisation used when the post folders were created, so links match
    the directories on disk.  Titles are handled in all three xmltodict
    shapes (dict with '#text', plain string, None).
    """
    lines = ["# Blog Index\n\n"]
    for entry in entries:
        if not is_post(entry):
            continue
        title_node = entry.get('title', 'Untitled')
        if isinstance(title_node, dict):
            title = title_node.get('#text', 'Untitled')
        else:
            title = title_node or 'Untitled'
        published_date = entry.get('published', 'Date unknown')
        # Keep only filesystem-safe characters; must stay in sync with the
        # folder naming in convert_blogger_xml_to_advanced_structure.
        folder_name = "".join(x for x in title if x.isalnum() or x in " _-").rstrip()
        lines.append(f"- [{title}](./{folder_name}/post.md) - {published_date}\n")
    with open(os.path.join(output_dir, 'index.md'), 'w', encoding='utf-8') as index_file:
        index_file.write("".join(lines))
def convert_blogger_xml_to_advanced_structure(xml_file, output_dir):
    """Convert a Blogger Atom export ``xml_file`` into a folder-per-post
    Markdown tree under ``output_dir``.

    Each post gets a folder named after its sanitised title containing
    ``post.md`` (metadata header + HTML body converted to Markdown, with
    images downloaded and relinked).  Comments are appended to
    ``comments.md`` inside the folder of the post they reply to, and an
    ``index.md`` linking all posts is written last.
    """
    with open(xml_file, 'r', encoding='utf-8') as file:
        data = xmltodict.parse(file.read())
    entries = data['feed']['entry']
    # xmltodict yields a bare dict (not a one-element list) when the feed
    # contains exactly one entry; normalise so both passes below work.
    if isinstance(entries, dict):
        entries = [entries]
    os.makedirs(output_dir, exist_ok=True)
    # First pass: posts (comments are handled in a second pass).
    for entry in entries:
        if not is_post(entry):
            continue
        title_node = entry.get('title', 'Untitled')
        if isinstance(title_node, dict):
            title = title_node.get('#text', 'Untitled')
        else:
            title = title_node or 'Untitled'
        folder_name = "".join(x for x in title if x.isalnum() or x in " _-").rstrip()
        folder_path = os.path.join(output_dir, folder_name)
        os.makedirs(folder_path, exist_ok=True)
        # 'content' may be absent or an empty element (None in xmltodict).
        content = (entry.get('content') or {}).get('#text', '')
        if content:
            content = update_image_links(content, folder_path)
        body_md = markdownify.markdownify(content, heading_style="ATX") if content else ''
        with open(os.path.join(folder_path, 'post.md'), 'w', encoding='utf-8') as md_file:
            md_file.write(format_post_metadata(entry) + body_md)
        print(f"Processed post: {title}")
    # Second pass: comments, appended next to their parent post.
    for entry in entries:
        if not is_comment(entry):
            continue
        # thr:in-reply-to carries the parent post id; match its tail against
        # each post's <id> to locate the right folder.
        post_id = entry['thr:in-reply-to']['@ref'].split('/')[-1]
        post_title = next(
            (e['title']['#text'] for e in entries
             if is_post(e) and e.get('id', '').endswith(post_id)),
            'Untitled',
        )
        post_folder_name = "".join(x for x in post_title if x.isalnum() or x in " _-").rstrip()
        post_folder_path = os.path.join(output_dir, post_folder_name)
        if os.path.exists(post_folder_path):
            with open(os.path.join(post_folder_path, 'comments.md'), 'a', encoding='utf-8') as comment_file:
                comment_file.write(process_comment(entry))
            print(f"Processed comment for post: {post_title}")
    generate_index_file(entries, output_dir)


convert_blogger_xml_to_advanced_structure('data.xml', 'output_markdown')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment