Skip to content

Instantly share code, notes, and snippets.

@adlermedrado
Created December 31, 2017 19:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adlermedrado/6159cbf6b3fcca2473c175816f7ff94f to your computer and use it in GitHub Desktop.
Save adlermedrado/6159cbf6b3fcca2473c175816f7ff94f to your computer and use it in GitHub Desktop.
Script to convert old HTML blog posts into Markdown files in Pelican format.
import os
import re
from bs4 import BeautifulSoup
# Directory tree that is walked with os.walk() to find the files to convert.
content_root = 'public'

# Running counters; both are printed once the walk has finished.
total_files = total_errors = 0
def convert_to_new_post(post_contents):
    """Write one converted post to ``converted/<slug>.md``.

    The file starts with a metadata header (Title, Date, Author, Tags, Slug,
    Status), followed by a blank line, a fixed notice in Portuguese stating
    that the post is old, and finally the post body.

    Args:
        post_contents: Mapping with the string values ``post_title``,
            ``post_date``, ``post_slug`` and ``post_content``.

    Raises:
        KeyError: If one of the expected keys is missing.
        OSError: If the output directory or file cannot be created/written.
    """
    info_message = '**AVISO:** _Este post é muito antigo e seu conteúdo provavelmente está defasado, ' \
                   'permanecendo no meu blog apenas por motivos históricos._\n\n'

    # Assemble the whole document first so that a missing key raises before
    # any file is created (no half-written posts are left behind).
    header = (
        f"Title: {post_contents['post_title']}\n"
        f"Date: {post_contents['post_date']}\n"
        "Author: adler\n"
        "Tags: old\n"
        f"Slug: {post_contents['post_slug']}\n"
        "Status: published\n\n"
    )
    document = header + info_message + post_contents['post_content']

    output_dir = 'converted'
    # Create the target directory on demand instead of failing when it is absent.
    os.makedirs(output_dir, exist_ok=True)
    new_post_file = f"{output_dir}/{post_contents['post_slug']}.md"

    # Explicit UTF-8: the notice and the posts contain accented characters, and
    # the platform default encoding is not guaranteed to be able to encode them.
    # The context manager closes the handle even if the write fails.
    with open(new_post_file, 'w', encoding='utf-8') as new_post:
        new_post.write(document)
def replace_invalid_chars(content):
    """Repair known mis-encoded fragments in *content*.

    Each ``(broken, fixed)`` pair below is applied with ``str.replace`` over
    the whole string, one pair after another, so later pairs see the output of
    earlier ones.

    Args:
        content: Text to clean up (a title, body or slug).

    Returns:
        The text with every listed fragment replaced.
    """
    # An ordered tuple rather than a dict: the order is part of the logic.
    # For example 'Ã' -> 'í' must run before 'í¡' -> 'á', and whole-word
    # fixes such as 'op├º├úo' must run before the shorter '├º├ú' / '├ú'.
    #
    # NOTE(review): several pairs have identical key and value as written
    # ('ã' -> 'ã', 'cansaço' -> 'cansaço', ...), which makes them no-ops. They
    # are kept in place untouched; the keys may have lost their mis-encoded
    # form somewhere along the way -- verify against real input.
    replacements = (
        ('º', 'º'),
        ('a%c2%ba', ''),
        ('ã', 'ã'),
        ('Ã', 'í'),
        ('á', 'á'),
        ('ê', 'ê'),
        ('é', 'é'),
        ('ç', 'ç'),
        ('ó', 'ó'),
        ('õ', 'õ'),
        ('í¡', 'á'),
        ('í s', 'às'),
        ('í©', 'é'),
        ('í§', 'ç'),
        ('í³', 'ó'),
        ('í ', 'à'),
        ('íª', 'ê'),
        ('a%c2%a7ao', 'cao'),
        ('├®', 'é'),
        ('j├àtem', 'já tem'),
        ('n├úo', 'não'),
        ('├í', 'á'),
        ('come├ºou', 'começou'),
        ('cansaço', 'cansaço'),
        ('efici├¬ncia', 'eficiência'),
        ('combina├º├úo', 'combinação'),
        ('op├º├úo', 'opção'),
        ('├º├ú', 'çã'),
        ('├ú', 'ã'),
        # NOTE(review): this also strips the accent from the 'à' produced by
        # the earlier 'í s' -> 'às' and 'í ' -> 'à' pairs -- confirm intent.
        ('à', 'a'),
        ('├│', 'ó'),
        ('├à', 'á '),
        # '├º' is 'ç' and '├Á' is 'õ', so the result is exactly 'çõ'; a stray
        # trailing comma here would turn e.g. 'op├º├Áes' into 'opçõ,es'.
        ('├º├Á', 'çõ'),
        ('├¡', 'í'),
        ('sa%c2%a9rie', 'serie'),
        ('íº', 'ú'),
    )
    for broken, fixed in replacements:
        content = content.replace(broken, fixed)
    return content
def extract_data(source=None, filename=None):
    """Parse one HTML post and return the fields needed for conversion.

    Args:
        source: Open text file object containing the post's HTML. When
            omitted, the module-level ``content_file`` bound by the driver
            loop is used, so a bare ``extract_data()`` call keeps working.
        filename: File name of the post, used to derive the slug. When
            omitted, the module-level ``file`` loop variable is used.

    Returns:
        dict with the keys ``post_title``, ``post_date``, ``post_content``
        and ``post_slug``. Title, content and slug are passed through
        ``replace_invalid_chars``; the date is returned as found.

    Raises:
        AttributeError: If the document has no ``html``/``body``/``h1`` chain
            or the content container has no ``<p>`` element.
        ValueError: If there is no second ``div.span12`` container, or the
            date text is empty.
    """
    if source is None:
        source = content_file
    if filename is None:
        filename = file

    soup = BeautifulSoup(source.read(), "html.parser")
    post_title = soup.html.body.h1.text

    # The post lives in the second div.span12. Report a missing container as
    # ValueError (like the missing-date case below) so the driver loop's
    # except clause records it instead of the script dying with IndexError.
    containers = soup.find_all('div', {'class': 'span12'})
    if len(containers) < 2:
        raise ValueError("Post content container not found")
    post_content_data = containers[1]

    # The first <p> of the container carries the "Post date: ..." text.
    post_date = post_content_data.find('p').text.replace('Post date:', '').strip()
    if not post_date:
        raise ValueError("Post date not found")

    # Drop the date and time stamps from the body; they go in the metadata.
    post_content = post_content_data.text.strip()
    post_content = re.sub(r'Post date: (\d+/\d+/\d+)', '', post_content)
    post_content = re.sub(r'- (\d+:\d+:\d+)', '', post_content)

    # Strip only a trailing '.html'; an occurrence inside the name is part of
    # the slug and must be kept.
    suffix = '.html'
    post_slug = filename[:-len(suffix)] if filename.endswith(suffix) else filename

    return {
        'post_title': replace_invalid_chars(post_title),
        'post_date': post_date,
        'post_content': replace_invalid_chars(post_content),
        'post_slug': replace_invalid_chars(post_slug),
    }
# Walk the content tree, convert every file found, and report totals.
for subdir, dirs, files in os.walk(content_root):
    # Skip every directory whose path contains 'images'. The check depends
    # only on the directory, so it is done once here rather than per file.
    if 'images' in subdir:
        continue

    for file in files:
        total_files += 1
        file_path = os.path.join(subdir, file)

        # extract_data() reads the module-level names `content_file` and
        # `file`, so these two variables must keep exactly these names.
        # NOTE(review): the input is decoded with the platform default
        # encoding -- confirm the source files' encoding and pin it if known.
        with open(file_path, 'r') as content_file:
            try:
                post_data = extract_data()
            except (AttributeError, ValueError) as exc:
                total_errors += 1
                error_message = f'Error processing {file_path} with error: {exc}'
                print(error_message)
                # Skip the write: `post_data` is either undefined (first file)
                # or still holds the previous post, which must not be reused.
                continue

        convert_to_new_post(post_data)

print(f'Total files: {total_files}')
print(f'Total errors: {total_errors}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment