Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Script that converts old HTML blog posts into Markdown files in Pelican format.
import os
import re
from bs4 import BeautifulSoup
# Root directory that is walked for HTML posts.
content_root = 'public'
# Run statistics: files visited and files whose extraction failed.
total_files = total_errors = 0
def convert_to_new_post(post_contents):
    """Write one converted post as a Markdown file with a Pelican header.

    The file is created at ``converted/<post_slug>.md``. It starts with the
    metadata lines (Title, Date, Author, Tags, Slug, Status), followed by a
    fixed notice and then the post body.

    Args:
        post_contents: Mapping with the keys ``post_title``, ``post_date``,
            ``post_slug`` and ``post_content``.

    Raises:
        KeyError: If one of the expected keys is missing.
        OSError: If the output directory or file cannot be created.
    """
    info_message = '**AVISO:** _Este post é muito antigo e seu conteúdo provavelmente está defasado, ' \
                   'permanecendo no meu blog apenas por motivos históricos._\n\n'
    new_post_file = f"converted/{post_contents['post_slug']}.md"

    # Build the whole document before touching the disk so that a missing
    # key cannot leave an empty or half-written file behind.
    header = (
        f"Title: {post_contents['post_title']}\n"
        f"Date: {post_contents['post_date']}\n"
        "Author: adler\n"
        "Tags: old\n"
        f"Slug: {post_contents['post_slug']}\n"
        "Status: published\n\n"
    )
    body = info_message + post_contents['post_content']

    # The output directory is created on demand instead of being assumed.
    os.makedirs(os.path.dirname(new_post_file), exist_ok=True)

    # Explicit UTF-8 so the accented characters are encoded the same way on
    # every platform; the context manager closes the file even on error.
    with open(new_post_file, 'w', encoding='utf-8') as new_post:
        new_post.write(header)
        new_post.write(body)
def replace_invalid_chars(content):
    """Return *content* with known mis-encoded sequences replaced.

    The replacements are plain substring substitutions applied one after
    another in the order listed below, so an earlier entry can create or
    destroy text that a later key would match. Do not reorder the table
    without re-checking the output.

    Args:
        content: Text to clean up.

    Returns:
        The text after every substitution has been applied.
    """
    # NOTE(review): several entries below map a string to itself as the file
    # is rendered here; presumably the keys were mojibake sequences in the
    # original file - verify against the author's copy before removing them.
    rep_dict = {
        'º': 'º',
        'a%c2%ba': '',
        'ã': 'ã',
        'Ã': 'í',
        'á': 'á',
        'ê': 'ê',
        'é': 'é',
        'ç': 'ç',
        'ó': 'ó',
        'õ': 'õ',
        'í¡': 'á',
        'í s': 'às',
        'í©': 'é',
        'í§': 'ç',
        'í³': 'ó',
        'í ': 'à',
        'íª': 'ê',
        'a%c2%a7ao': 'cao',
        '├®': 'é',
        'j├àtem': 'já tem',
        'n├úo': 'não',
        '├í': 'á',
        'come├ºou': 'começou',
        'cansaço': 'cansaço',
        'efici├¬ncia': 'eficiência',
        'combina├º├úo': 'combinação',
        'op├º├úo': 'opção',
        '├º├ú': 'çã',
        '├ú': 'ã',
        # NOTE(review): as written this strips the accent that the 'í s' and
        # 'í ' entries above just produced, and it runs before '├à' below
        # can match - confirm the intended key.
        'à': 'a',
        '├│': 'ó',
        '├à': 'á ',
        # Fixed: the replacement used to be 'çõ,' and the stray comma ended
        # up inside words such as "opções".
        '├º├Á': 'çõ',
        '├¡': 'í',
        'sa%c2%a9rie': 'serie',
        'íº': 'ú',
    }
    for broken, fixed in rep_dict.items():
        content = content.replace(broken, fixed)
    return content
def extract_data():
    """Extract title, date, body and slug from the HTML post being processed.

    This function takes no arguments; it reads two module-level names that
    the caller must set before each call: ``content_file`` (the open HTML
    file) and ``file`` (its file name, from which the slug is derived).

    Returns:
        A dict with the keys ``post_title``, ``post_date``, ``post_content``
        and ``post_slug``. Title, content and slug are passed through
        ``replace_invalid_chars``; the date is returned as found.

    Raises:
        AttributeError: If an expected tag (``html``/``body``/``h1`` or the
            first ``p`` of the content block) is missing.
        ValueError: If the page has fewer than two ``div.span12`` elements
            or the post date is empty.
    """
    content = content_file.read()
    soup = BeautifulSoup(content, "html.parser")
    post_title = soup.html.body.h1.text

    # The post body is the second div with class "span12". Report a page
    # without it as ValueError (which the caller handles) instead of letting
    # an IndexError abort the whole run.
    span_blocks = soup.find_all('div', {'class': 'span12'})
    if len(span_blocks) < 2:
        raise ValueError("Post content not found")
    post_content_data = span_blocks[1]

    post_date = post_content_data.find('p').text.replace('Post date:', '').strip()
    if not post_date:
        raise ValueError("Post date not found")

    # Drop the "Post date: d/m/y" stamp and the "- h:m:s" time that are part
    # of the block's text, leaving only the article body.
    post_content = post_content_data.text.strip()
    post_content = re.sub(r'Post date: (\d+/\d+/\d+)', '', post_content)
    post_content = re.sub(r'- (\d+:\d+:\d+)', '', post_content)

    post_slug = file.replace('.html', '')
    return {
        'post_title': replace_invalid_chars(post_title),
        'post_date': post_date,
        'post_content': replace_invalid_chars(post_content),
        'post_slug': replace_invalid_chars(post_slug),
    }
# Walk the content tree and convert every file outside "images" directories.
# The names `file` and `content_file` must stay module-level: extract_data()
# reads them as globals.
for subdir, _dirs, files in os.walk(content_root):
    if 'images' in subdir:
        continue
    for file in files:
        total_files = total_files + 1
        file_path = os.path.join(subdir, file)
        # NOTE(review): the input is read with the platform default encoding;
        # replace_invalid_chars may be tuned to the text that produces, so it
        # is deliberately left unchanged here - confirm before adding one.
        with open(file_path, 'r') as content_file:
            try:
                post_data = extract_data()
            except (AttributeError, ValueError) as exc:
                total_errors = total_errors + 1
                error_message = f'Error processing {file_path} with error: {exc}'
                print(error_message)
            else:
                # Only write a post when extraction succeeded. Previously a
                # failure fell through and re-used the previous file's data
                # (or raised NameError when the very first file failed).
                convert_to_new_post(post_data)
print(f'Total files: {total_files}')
print(f'Total errors: {total_errors}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.