"""Script to convert old HTML blog posts to Markdown files in Pelican format."""
import os
import re
from bs4 import BeautifulSoup
# Root of the directory tree that is walked for old HTML posts.
content_root = 'public'

# Run counters, printed as a summary once the walk has finished.
total_files, total_errors = 0, 0
def convert_to_new_post(post_contents):
    """Write one post to ``converted/<slug>.md`` with a Pelican metadata header.

    The file starts with Title/Date/Author/Tags/Slug/Status metadata lines,
    followed by a blank line, a fixed notice (in Portuguese) warning that the
    post is old, and finally the post body.

    Args:
        post_contents: Mapping with the keys ``post_title``, ``post_date``,
            ``post_slug`` and ``post_content``.

    Raises:
        KeyError: If one of the required keys is missing.
        OSError: If the output file cannot be created or written.
    """
    info_message = (
        '**AVISO:** _Este post é muito antigo e seu conteúdo provavelmente está defasado, '
        'permanecendo no meu blog apenas por motivos históricos._\n\n'
    )
    new_post_file = f"converted/{post_contents['post_slug']}.md"

    # Create the output directory on first use instead of failing on open().
    os.makedirs(os.path.dirname(new_post_file), exist_ok=True)

    header_lines = [
        f"Title: {post_contents['post_title']}\n",
        f"Date: {post_contents['post_date']}\n",
        "Author: adler\n",
        "Tags: old\n",
        f"Slug: {post_contents['post_slug']}\n",
        "Status: published\n\n",
    ]

    # The context manager guarantees the file is flushed and closed; the
    # explicit encoding keeps the accented text independent of the locale.
    with open(new_post_file, 'w', encoding='utf-8') as new_post:
        new_post.writelines(header_lines)
        new_post.write(info_message + post_contents['post_content'])
def replace_invalid_chars(content):
    """Return ``content`` with known mis-encoded sequences replaced.

    Each key of the table is replaced by its value using ``str.replace``, in
    the order the entries are listed. The order matters: an earlier
    replacement can produce text that a later key matches (for example
    ``'Ã'`` becomes ``'í'`` before ``'í¡'`` is turned into ``'á'``), and the
    word-level entries must run before the shorter sequences they contain.

    Args:
        content: Text to clean up.

    Returns:
        The text after all replacements have been applied.
    """
    # NOTE(review): several entries below appear to map a string to itself,
    # and some keys contain what looks like a plain space. The keys may hold
    # non-printing or mis-encoded characters that are not visible here --
    # verify against the raw bytes of the file before pruning any entry.
    rep_dict = {
        'º': 'º',
        'a%c2%ba': '',
        'ã': 'ã',
        'Ã': 'í',
        'á': 'á',
        'ê': 'ê',
        'é': 'é',
        'ç': 'ç',
        'ó': 'ó',
        'õ': 'õ',
        'í¡': 'á',
        'í s': 'às',
        'í©': 'é',
        'í§': 'ç',
        'í³': 'ó',
        'í ': 'à',
        'íª': 'ê',
        'a%c2%a7ao': 'cao',
        '├®': 'é',
        'j├àtem': 'já tem',
        'n├úo': 'não',
        '├í': 'á',
        'come├ºou': 'começou',
        'cansaço': 'cansaço',
        'efici├¬ncia': 'eficiência',
        'combina├º├úo': 'combinação',
        'op├º├úo': 'opção',
        '├º├ú': 'çã',
        '├ú': 'ã',
        'à': 'a',
        '├│': 'ó',
        '├à': 'á ',
        # Previously 'çõ,': the stray comma was injected into the fixed text.
        '├º├Á': 'çõ',
        '├¡': 'í',
        'sa%c2%a9rie': 'serie',
        'íº': 'ú',
    }
    for broken, fixed in rep_dict.items():
        content = content.replace(broken, fixed)
    return content
def extract_data(content=None, file_name=None):
    """Parse one HTML post and return the fields needed for conversion.

    Args:
        content: HTML text of the post. When omitted, it is read from the
            module-level ``content_file`` handle opened by the caller.
        file_name: Name of the HTML file, used to derive the slug. When
            omitted, the module-level ``file`` loop variable is used.

    Returns:
        A dict with the keys ``post_title``, ``post_date``, ``post_content``
        and ``post_slug``. Title, content and slug are passed through
        ``replace_invalid_chars``; the date is returned as found.

    Raises:
        AttributeError: If the ``html > body > h1`` title or the ``<p>``
            holding the date is missing (attribute access on ``None``).
        ValueError: If the second ``div.span12`` is missing or the post date
            is empty.
    """
    if content is None:
        # NOTE(review): the right-hand side of this assignment was missing in
        # the reviewed source. Reading the module-level ``content_file`` is
        # reconstructed from how the caller invokes extract_data() inside its
        # ``with open(...) as content_file`` block -- confirm.
        content = content_file.read()
    if file_name is None:
        file_name = file

    soup = BeautifulSoup(content, "html.parser")
    post_title = soup.html.body.h1.text

    # The post body is taken from the second ``div.span12`` on the page.
    # Raise ValueError (which the caller handles) rather than an uncaught
    # IndexError when the page does not have that layout.
    span_divs = soup.find_all('div', {'class': 'span12'})
    if len(span_divs) < 2:
        raise ValueError("Post content not found")
    post_content_data = span_divs[1]

    post_date = post_content_data.find('p').text.replace('Post date:', '').strip()
    if not post_date:
        raise ValueError("Post date not found")

    # Strip the "Post date: d/m/y" and "- h:m:s" fragments from the body text.
    post_content = post_content_data.text.strip()
    post_content = re.sub(r'Post date: (\d+/\d+/\d+)', '', post_content)
    post_content = re.sub(r'- (\d+:\d+:\d+)', '', post_content)

    post_slug = file_name.replace('.html', '')

    return {
        'post_title': replace_invalid_chars(post_title),
        'post_date': post_date,
        'post_content': replace_invalid_chars(post_content),
        'post_slug': replace_invalid_chars(post_slug),
    }
# Walk every file under ``content_root`` (skipping any directory whose path
# contains 'images'), convert it, and report totals at the end.
#
# This loop must stay at module level and keep the names ``file`` and
# ``content_file``: extract_data() is called without arguments and reads both
# of them as module globals.
for subdir, _dirs, files in os.walk(content_root):
    if 'images' in subdir:
        continue
    for file in files:
        total_files += 1
        file_path = os.path.join(subdir, file)
        try:
            # NOTE(review): no explicit encoding is passed here on purpose;
            # the replacement table in replace_invalid_chars presumably
            # targets the text as decoded with the platform default -- confirm
            # before changing it.
            with open(file_path, 'r') as content_file:
                post_data = extract_data()
            # NOTE(review): the reviewed source never used ``post_data``;
            # writing the converted post is the evident intent -- confirm.
            convert_to_new_post(post_data)
        except (AttributeError, ValueError) as exc:
            total_errors += 1
            # Report the failure instead of silently discarding the message.
            print(f'Error processing {file_path} with error: {exc}')

print(f'Total files: {total_files}')
print(f'Total errors: {total_errors}')
