"""Parse old HTML blog posts and convert them to Markdown files in Pelican format."""

import os
import re

from bs4 import BeautifulSoup

content_root = 'public'
total_files = 0
total_errors = 0

# Make sure the output directory exists before any converted post is written.
os.makedirs('converted', exist_ok=True)

def convert_to_new_post(post_contents):
    """Write one converted post as a Pelican Markdown file."""
    # Warning prepended to every converted post (Portuguese): "NOTICE: this
    # post is very old and its content is probably outdated; it remains on my
    # blog only for historical reasons."
    info_message = ('**AVISO:** _Este post é muito antigo e seu conteúdo '
                    'provavelmente está defasado, permanecendo no meu blog '
                    'apenas por motivos históricos._\n\n')
    new_post_file = f"converted/{post_contents['post_slug']}.md"
    with open(new_post_file, 'w') as new_post:
        new_post.write(f"Title: {post_contents['post_title']}\n")
        new_post.write(f"Date: {post_contents['post_date']}\n")
        new_post.write("Author: adler\n")
        new_post.write("Tags: old\n")
        new_post.write(f"Slug: {post_contents['post_slug']}\n")
        new_post.write("Status: published\n\n")
        new_post.write(info_message + post_contents['post_content'])
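
# For illustration, a post with slug 'meu-post' (hypothetical values, not taken
# from a real post) would produce converted/meu-post.md starting with:
#
#   Title: Meu Post
#   Date: 01/02/2010 - 10:30:00
#   Author: adler
#   Tags: old
#   Slug: meu-post
#   Status: published
#
# followed by the AVISO notice and the extracted body text.
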
def replace_invalid_chars(content):
    # Mojibake sequences (UTF-8 text decoded with the wrong codec, sometimes
    # twice over) mapped back to the characters they should have been.
    rep_dict = {
        'Âº': 'º',
        'a%c2%ba': '',
        'Ã£': 'ã',
        'Ã': 'í',
        'Ã¡': 'á',
        'Ãª': 'ê',
        'Ã©': 'é',
        'Ã§': 'ç',
        'Ã³': 'ó',
        'Ãµ': 'õ',
        'í¡': 'á',
        'í s': 'às',
        'í©': 'é',
        'í§': 'ç',
        'í³': 'ó',
        'í ': 'à',
        'íª': 'ê',
        'a%c2%a7ao': 'cao',
        '├®': 'é',
        'j├àtem': 'já tem',
        'n├úo': 'não',
        '├í': 'á',
        'come├ºou': 'começou',
        'cansa├ºo': 'cansaço',
        'efici├¬ncia': 'eficiência',
        'combina├º├úo': 'combinação',
        'op├º├úo': 'opção',
        '├º├ú': 'çã',
        '├ú': 'ã',
        'à': 'a',
        '├│': 'ó',
        '├à': 'á ',
        '├º├Á': 'çõ',
        '├¡': 'í',
        'sa%c2%a9rie': 'serie',
        'íº': 'ú',
    }
    for bad, good in rep_dict.items():
        content = content.replace(bad, good)
    return content
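
# Illustrative check (hypothetical input string, not from a real post):
#   replace_invalid_chars('op├º├úo')  ->  'opção'
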
def extract_data(content_file, file_name):
    """Pull title, date, body, and slug out of one old HTML post."""
    content = content_file.read()
    soup = BeautifulSoup(content, "html.parser")
    post_title = soup.html.body.h1.text
    # The post body lives in the second 'div.span12' of the old template.
    post_content_data = soup.find_all('div', {'class': 'span12'})[1]
    post_date = post_content_data.find('p').text.replace('Post date:', '').strip()
    if not post_date:
        raise ValueError("Post date not found")
    post_content = post_content_data.text.strip()
    # Strip the date/time header that the old layout mixed into the body text.
    post_content = re.sub(r'Post date: (\d+/\d+/\d+)', '', post_content)
    post_content = re.sub(r'- (\d+:\d+:\d+)', '', post_content)
    post_slug = file_name.replace('.html', '')
    return {
        'post_title': replace_invalid_chars(post_title),
        'post_date': post_date,
        'post_content': replace_invalid_chars(post_content),
        'post_slug': replace_invalid_chars(post_slug),
    }
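
# Minimal sketch of the old HTML layout extract_data() assumes, reconstructed
# from the lookups above rather than copied from a real post:
#
#   <html><body>
#     <h1>Post title</h1>
#     <div class="span12">...header/sidebar...</div>
#     <div class="span12">
#       <p>Post date: 01/02/2010 - 10:30:00</p>
#       ...post body...
#     </div>
#   </body></html>
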
for subdir, dirs, files in os.walk(content_root):
    for file in files:
        # Image directories contain no posts, so skip them.
        if 'images' not in subdir:
            total_files += 1
            file_path = os.path.join(subdir, file)
            with open(file_path, 'r') as content_file:
                try:
                    post_data = extract_data(content_file, file)
                except (AttributeError, ValueError) as exc:
                    total_errors += 1
                    print(f'Error processing {file_path} with error: {exc}')
                    continue  # don't write a post when extraction failed
                convert_to_new_post(post_data)

print(f'Total files: {total_files}')
print(f'Total errors: {total_errors}')
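
# Usage sketch, assuming the layout this script expects: the old HTML export
# lives under ./public (folders named 'images' are skipped above) and the
# converted Markdown lands in ./converted. The filename is hypothetical:
#
#   python convert_old_posts.py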