Skip to content

Instantly share code, notes, and snippets.

@steveseguin
Created December 21, 2023 18:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Save steveseguin/7455015b3dab5816e907c2d7cf16905e to your computer and use it in GitHub Desktop.
Blogger to Markdown (github)
# pip install xmltodict markdownify
import xmltodict
import markdownify
import os
import requests
from urllib.parse import urlparse
import re
import time
def format_post_metadata(entry):
    """Return a Markdown header block (title, author, date) for a post entry.

    ``entry`` is a dict produced by xmltodict from Blogger's Atom export.
    The title node is usually ``{'@type': ..., '#text': ...}``, but xmltodict
    emits a plain string when the element has no attributes and ``None`` when
    it is empty — all three shapes are handled here instead of crashing.
    """
    title_node = entry.get('title', 'Untitled')
    if isinstance(title_node, dict):
        title = title_node.get('#text', 'Untitled')
    else:
        title = title_node or 'Untitled'
    author = entry.get('author', {}).get('name', 'Unknown')
    published_date = entry.get('published', 'Date unknown')
    return f"# {title}\n\n*Author: {author}*\n*Published on: {published_date}*\n\n---\n\n"
def process_comment(entry):
    """Format one Blogger comment entry as a Markdown snippet.

    Pulls the commenter name and publication date from the xmltodict entry
    dict and converts the HTML comment body to Markdown (ATX headings).
    Missing fields fall back to placeholder text; a missing body yields an
    empty comment.
    """
    who = entry.get('author', {}).get('name', 'Unknown')
    when = entry.get('published', 'Date unknown')
    raw_html = entry.get('content', {}).get('#text', '')
    body = ''
    if raw_html:
        body = markdownify.markdownify(raw_html, heading_style="ATX")
    return f"**{who}** - *{when}*\n\n{body}\n\n"
def download_image(image_url, folder_path):
    """Download ``image_url`` into ``folder_path``; return the saved filename.

    Returns ``None`` on any failure (non-200 status, timeout, network or
    filesystem error).  The filename is derived from the URL path, truncated
    to a safe length, and falls back to ``'image'`` when the URL path has no
    basename (e.g. it ends in '/') — without that fallback the write would
    target the folder itself.
    """
    try:
        response = requests.get(image_url, timeout=10)  # 10 seconds timeout
        if response.status_code == 200:
            parsed_url = urlparse(image_url)
            image_name = os.path.basename(parsed_url.path)
            if not image_name:
                image_name = 'image'
            # Truncate the image name if it's too long, preserving the extension.
            max_length = 50  # max filename length, adjust as needed
            if len(image_name) > max_length:
                name, ext = os.path.splitext(image_name)
                image_name = name[:max_length - len(ext)] + ext
            image_path = os.path.join(folder_path, image_name)
            with open(image_path, 'wb') as file:
                file.write(response.content)
            return image_name
    except Exception as e:
        # Best-effort: one failed image must not abort the whole export.
        print(f"Error downloading {image_url}: {e}")
    return None
def update_image_links(content, folder_path, downloader=None):
    """Download each remote image referenced in ``content`` and rewrite its
    URL to the locally saved filename.

    Parameters:
        content: raw HTML of a post.
        folder_path: directory the images are saved into.
        downloader: optional ``(url, folder_path) -> filename-or-None``
            callable; defaults to :func:`download_image`.  Injectable so the
            rewrite logic can be tested without network access.

    Returns the rewritten HTML.  URLs that fail to download are left as-is.
    """
    if downloader is None:
        downloader = download_image
    # [^"]+ keeps the match inside the quoted src attribute (\S+ could run
    # past the closing quote); dict.fromkeys de-duplicates while preserving
    # order so each image is downloaded exactly once.
    image_urls = dict.fromkeys(re.findall(r'src="(https?://[^"]+)"', content))
    for url in image_urls:
        local_name = downloader(url, folder_path)
        if local_name:
            content = content.replace(url, local_name)
    return content
def is_post(entry):
    """Return True if the Atom entry is a blog post (vs. comment/settings).

    xmltodict emits a single <category> as a dict and multiple as a list;
    both are normalised to a list.  A missing 'category' key means the
    entry is not a post (the original raised KeyError here).
    """
    kind = 'http://schemas.google.com/blogger/2008/kind#post'
    categories = entry.get('category', [])
    if isinstance(categories, dict):
        categories = [categories]
    return any(cat.get('@term') == kind for cat in categories)
def is_comment(entry):
    """Return True if the Atom entry is a comment.

    Mirrors :func:`is_post`: single <category> dicts are normalised to a
    list, and a missing 'category' key means "not a comment" rather than
    raising KeyError.
    """
    kind = 'http://schemas.google.com/blogger/2008/kind#comment'
    categories = entry.get('category', [])
    if isinstance(categories, dict):
        categories = [categories]
    return any(cat.get('@term') == kind for cat in categories)
def generate_index_file(entries, output_dir):
    """Write ``output_dir/index.md`` with a link to every post's post.md.

    The link target folder is derived from the title with the same
    sanitisation used when the post folders were created, so links match
    the directories on disk.  Titles are handled in all three xmltodict
    shapes (dict with '#text', plain string, None).
    """
    lines = ["# Blog Index\n\n"]
    for entry in entries:
        if not is_post(entry):
            continue
        title_node = entry.get('title', 'Untitled')
        if isinstance(title_node, dict):
            title = title_node.get('#text', 'Untitled')
        else:
            title = title_node or 'Untitled'
        published_date = entry.get('published', 'Date unknown')
        # Keep only filesystem-safe characters; must stay in sync with the
        # folder naming in convert_blogger_xml_to_advanced_structure.
        folder_name = "".join(x for x in title if x.isalnum() or x in " _-").rstrip()
        lines.append(f"- [{title}](./{folder_name}/post.md) - {published_date}\n")
    with open(os.path.join(output_dir, 'index.md'), 'w', encoding='utf-8') as index_file:
        index_file.write("".join(lines))
def convert_blogger_xml_to_advanced_structure(xml_file, output_dir):
    """Convert a Blogger Atom export ``xml_file`` into a folder-per-post
    Markdown tree under ``output_dir``.

    Each post gets a folder named after its sanitised title containing
    ``post.md`` (metadata header + HTML body converted to Markdown, with
    images downloaded and relinked).  Comments are appended to
    ``comments.md`` inside the folder of the post they reply to, and an
    ``index.md`` linking all posts is written last.
    """
    with open(xml_file, 'r', encoding='utf-8') as file:
        data = xmltodict.parse(file.read())
    entries = data['feed']['entry']
    # xmltodict yields a bare dict (not a one-element list) when the feed
    # contains exactly one entry; normalise so both passes below work.
    if isinstance(entries, dict):
        entries = [entries]
    os.makedirs(output_dir, exist_ok=True)
    # First pass: posts (comments are handled in a second pass).
    for entry in entries:
        if not is_post(entry):
            continue
        title_node = entry.get('title', 'Untitled')
        if isinstance(title_node, dict):
            title = title_node.get('#text', 'Untitled')
        else:
            title = title_node or 'Untitled'
        folder_name = "".join(x for x in title if x.isalnum() or x in " _-").rstrip()
        folder_path = os.path.join(output_dir, folder_name)
        os.makedirs(folder_path, exist_ok=True)
        # 'content' may be absent or an empty element (None in xmltodict).
        content = (entry.get('content') or {}).get('#text', '')
        if content:
            content = update_image_links(content, folder_path)
        body_md = markdownify.markdownify(content, heading_style="ATX") if content else ''
        with open(os.path.join(folder_path, 'post.md'), 'w', encoding='utf-8') as md_file:
            md_file.write(format_post_metadata(entry) + body_md)
        print(f"Processed post: {title}")
    # Second pass: comments, appended next to their parent post.
    for entry in entries:
        if not is_comment(entry):
            continue
        # thr:in-reply-to carries the parent post id; match its tail against
        # each post's <id> to locate the right folder.
        post_id = entry['thr:in-reply-to']['@ref'].split('/')[-1]
        post_title = next(
            (e['title']['#text'] for e in entries
             if is_post(e) and e.get('id', '').endswith(post_id)),
            'Untitled',
        )
        post_folder_name = "".join(x for x in post_title if x.isalnum() or x in " _-").rstrip()
        post_folder_path = os.path.join(output_dir, post_folder_name)
        if os.path.exists(post_folder_path):
            with open(os.path.join(post_folder_path, 'comments.md'), 'a', encoding='utf-8') as comment_file:
                comment_file.write(process_comment(entry))
            print(f"Processed comment for post: {post_title}")
    generate_index_file(entries, output_dir)


convert_blogger_xml_to_advanced_structure('data.xml', 'output_markdown')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment