# Blogger to Markdown (GitHub gist steveseguin/7455015b3dab5816e907c2d7cf16905e)
# Created December 21, 2023 18:25.
#
# Note from the GitHub page this was copied from: this file contains
# bidirectional Unicode text that may be interpreted or compiled differently
# than what appears below. To review, open the file in an editor that reveals
# hidden Unicode characters.
# pip install xmltodict markdownify requests
import os
import re
import time
from urllib.parse import quote, urlparse

import markdownify
import requests
import xmltodict
def format_post_metadata(entry):
    """Build the Markdown header (title, author, publication date) for a post.

    Args:
        entry: One feed entry as produced by ``xmltodict.parse``.

    Returns:
        A Markdown string with an H1 title, the author and date in italics,
        and a trailing horizontal rule.
    """
    # xmltodict yields a dict for elements with attributes, a plain string
    # for attribute-less elements and None for empty ones; accept all three.
    title_node = entry.get('title')
    title = (title_node.get('#text') if isinstance(title_node, dict) else title_node) or 'Untitled'

    author_node = entry.get('author')
    author = (author_node.get('name') if isinstance(author_node, dict) else None) or 'Unknown'

    published_date = entry.get('published') or 'Date unknown'

    return f"# {title}\n\n*Author: {author}*\n*Published on: {published_date}*\n\n---\n\n"
def process_comment(entry):
    """Render one comment entry as a Markdown snippet.

    Args:
        entry: One feed entry as produced by ``xmltodict.parse``.

    Returns:
        ``**<author>** - *<date>*`` followed by the comment body converted to
        Markdown. The body is empty when the entry has no content.
    """
    # The content node may be a dict (element with attributes), a plain
    # string, or None, depending on how the XML element was written.
    content_node = entry.get('content')
    comment_content = (content_node.get('#text') if isinstance(content_node, dict) else content_node) or ''
    markdown_comment = markdownify.markdownify(comment_content, heading_style="ATX") if comment_content else ''

    author_node = entry.get('author')
    commenter_name = (author_node.get('name') if isinstance(author_node, dict) else None) or 'Unknown'
    comment_date = entry.get('published') or 'Date unknown'

    return f"**{commenter_name}** - *{comment_date}*\n\n{markdown_comment}\n\n"
def download_image(image_url, folder_path, max_length=50):
    """Download an image into ``folder_path`` and return its local file name.

    Args:
        image_url: Absolute URL of the image.
        folder_path: Existing directory the image is written into.
        max_length: Maximum length of the stored file name, extension
            included. Longer names are truncated.

    Returns:
        The file name (relative to ``folder_path``) on success, or ``None``
        when the URL has no file name, the server does not answer with
        HTTP 200, or any error occurs while downloading or writing.
    """
    # Deliberately broad: a single bad image must never abort the whole
    # conversion, so every failure is reported and turned into None.
    try:
        image_name = os.path.basename(urlparse(image_url).path)
        if not image_name:
            # Without a file name the target path would be the folder itself,
            # so skip before spending a network request on it.
            print(f"Error downloading {image_url}: URL has no file name")
            return None

        if len(image_name) > max_length:
            # Truncate the stem but keep the extension intact.
            name, ext = os.path.splitext(image_name)
            image_name = name[:max(max_length - len(ext), 1)] + ext

        response = requests.get(image_url, timeout=10)  # 10 seconds timeout
        if response.status_code != 200:
            print(f"Error downloading {image_url}: HTTP {response.status_code}")
            return None

        with open(os.path.join(folder_path, image_name), 'wb') as file:
            file.write(response.content)
        return image_name
    except Exception as e:
        print(f"Error downloading {image_url}: {e}")
    return None
def update_image_links(content, folder_path, *, downloader=None):
    """Download every remotely hosted image in ``content`` and point to the local copy.

    Args:
        content: HTML text of a post.
        folder_path: Directory the images are downloaded into.
        downloader: Optional callable ``(url, folder_path) -> file name or
            None`` used to fetch each image. Defaults to ``download_image``.

    Returns:
        ``content`` with every occurrence of each successfully downloaded
        URL replaced by the local file name. URLs whose download failed are
        left untouched.
    """
    if downloader is None:
        downloader = download_image

    # [^"\s]+ stops at the closing quote; a greedy \S+ would swallow the
    # following attributes when they are not separated by whitespace.
    # dict.fromkeys de-duplicates while keeping first-seen order so each
    # image is fetched only once.
    image_urls = dict.fromkeys(re.findall(r'src="(https?://[^"\s]+)"', content))

    for url in image_urls:
        downloaded_image = downloader(url, folder_path)
        if downloaded_image:
            content = content.replace(url, downloaded_image)
    return content
def is_post(entry):
    """Return True when the feed entry is tagged with Blogger's "post" kind.

    The entry's ``category`` may be a single mapping or a list of mappings;
    a missing or empty ``category`` yields False instead of raising.
    """
    post_kind = 'http://schemas.google.com/blogger/2008/kind#post'

    categories = entry.get('category') or []
    if not isinstance(categories, list):
        # A lone <category> element is parsed as a single mapping.
        categories = [categories]

    return any(isinstance(cat, dict) and cat.get('@term') == post_kind for cat in categories)
def is_comment(entry):
    """Return True when the feed entry is tagged with Blogger's "comment" kind.

    The entry's ``category`` may be a single mapping or a list of mappings;
    a missing or empty ``category`` yields False instead of raising.
    """
    comment_kind = 'http://schemas.google.com/blogger/2008/kind#comment'

    categories = entry.get('category') or []
    if not isinstance(categories, list):
        # A lone <category> element is parsed as a single mapping.
        categories = [categories]

    return any(isinstance(cat, dict) and cat.get('@term') == comment_kind for cat in categories)
def generate_index_file(entries, output_dir):
    """Write ``index.md`` in ``output_dir`` listing every post with a link and date.

    Args:
        entries: Iterable of feed entries as produced by ``xmltodict.parse``.
            Entries that are not posts are ignored.
        output_dir: Existing directory that receives ``index.md``.
    """
    lines = ["# Blog Index\n\n"]
    for entry in entries:
        if not is_post(entry):
            continue

        # The title node may be a dict, a plain string or None.
        title_node = entry.get('title')
        title = (title_node.get('#text') if isinstance(title_node, dict) else title_node) or 'Untitled'
        published_date = entry.get('published', 'Date unknown')

        folder_name = "".join(x for x in title if x.isalnum() or x in " _-").rstrip()
        # Percent-encode the folder name: a raw space inside a Markdown link
        # destination stops it from being parsed as a link.
        post_link = f"./{quote(folder_name)}/post.md"
        lines.append(f"- [{title}]({post_link}) - {published_date}\n")

    with open(os.path.join(output_dir, 'index.md'), 'w', encoding='utf-8') as index_file:
        index_file.write("".join(lines))
def _node_text(node, default=''):
    """Return the text of an xmltodict node that may be a dict, a string or None."""
    text = node.get('#text') if isinstance(node, dict) else node
    return text or default


def _folder_name(title):
    """Reduce a post title to its folder name (alphanumerics, space, '_' and '-')."""
    return "".join(x for x in title if x.isalnum() or x in " _-").rstrip()


def convert_blogger_xml_to_advanced_structure(xml_file, output_dir):
    """Convert a Blogger XML export into one Markdown folder per post.

    For every post a folder named after the sanitized title is created in
    ``output_dir``. Posts with non-empty content get a ``post.md`` (metadata
    header plus converted body, with images downloaded next to it). Comments
    are written to ``comments.md`` inside the folder of the post they reply
    to. Finally ``index.md`` is generated in ``output_dir``.

    Args:
        xml_file: Path to the Blogger export, read as UTF-8.
        output_dir: Destination directory; created when missing.

    Raises:
        KeyError: If the document has no ``feed`` root element.
    """
    with open(xml_file, 'r', encoding='utf-8') as file:
        data = xmltodict.parse(file.read())

    # xmltodict returns a single mapping (not a list) when the feed holds
    # exactly one <entry>, and nothing at all when it holds none.
    entries = data['feed'].get('entry') or []
    if not isinstance(entries, list):
        entries = [entries]

    os.makedirs(output_dir, exist_ok=True)

    posts = [entry for entry in entries if is_post(entry)]

    for entry in posts:
        title = _node_text(entry.get('title'), 'Untitled')
        folder_path = os.path.join(output_dir, _folder_name(title))
        os.makedirs(folder_path, exist_ok=True)

        content = _node_text(entry.get('content'))
        # NOTE(review): the original indentation was lost; post.md is assumed
        # to be written (and reported) only for posts with non-empty content.
        if content:
            content = update_image_links(content, folder_path)
            markdown_content = markdownify.markdownify(content, heading_style="ATX")
            markdown_content = format_post_metadata(entry) + markdown_content
            with open(os.path.join(folder_path, 'post.md'), 'w', encoding='utf-8') as md_file:
                md_file.write(markdown_content)
            print(f"Processed post: {title}")

    # Resolve each post's id and title once instead of re-scanning every
    # entry for every comment.
    post_titles = [(post.get('id') or '', _node_text(post.get('title'), 'Untitled')) for post in posts]

    # Comments are grouped per target folder and written in one go with 'w',
    # so running the conversion twice does not duplicate them.
    comments_by_folder = {}
    for entry in entries:
        if not is_comment(entry):
            continue

        reply_to = entry.get('thr:in-reply-to')
        ref = reply_to.get('@ref') if isinstance(reply_to, dict) else None
        if not ref:
            print("Skipping comment without a parent post reference")
            continue

        post_id = ref.split('/')[-1]
        post_title = next((title for pid, title in post_titles if pid.endswith(post_id)), 'Untitled')
        post_folder_path = os.path.join(output_dir, _folder_name(post_title))

        # Only attach comments to posts whose folder was actually created.
        if os.path.isdir(post_folder_path):
            comments_by_folder.setdefault(post_folder_path, []).append((post_title, process_comment(entry)))

    for post_folder_path, comments in comments_by_folder.items():
        with open(os.path.join(post_folder_path, 'comments.md'), 'w', encoding='utf-8') as comment_file:
            for post_title, formatted_comment in comments:
                comment_file.write(formatted_comment)
                print(f"Processed comment for post: {post_title}")

    generate_index_file(entries, output_dir)
if __name__ == "__main__":
    # Script entry point: convert ./data.xml into the ./output_markdown tree.
    # Guarded so that importing this module does not start a conversion.
    convert_blogger_xml_to_advanced_structure('data.xml', 'output_markdown')
# (GitHub page footer, not part of the script: "Sign up for free to join this
# conversation on GitHub. Already have an account? Sign in to comment.")