adlermedrado/parser.py

## parser.py
import os
import re
from bs4 import BeautifulSoup

# Root of the directory tree that is walked for exported posts.
content_root = 'public'
# Running tallies; both are printed in the summary at the end of the run.
total_files = total_errors = 0


def convert_to_new_post(post_contents):
    """Write one extracted post to ``converted/<slug>.md``.

    The file starts with a metadata header (Title, Date, Author, Tags, Slug,
    Status), followed by a blank line, a fixed notice saying the post is old,
    and finally the post body.

    Args:
        post_contents: Mapping with the keys ``post_title``, ``post_date``,
            ``post_content`` and ``post_slug``.

    Raises:
        KeyError: If one of the expected keys is missing.
        OSError: If the target file cannot be created or written.
    """
    info_message = (
        '**AVISO:** _Este post é muito antigo e seu conteúdo provavelmente está defasado, '
        'permanecendo no meu blog apenas por motivos históricos._\n\n'
    )
    new_post_file = f"converted/{post_contents['post_slug']}.md"

    # Build the whole header up front so a missing key fails before the
    # output file is created or truncated.
    header = (
        f"Title: {post_contents['post_title']}\n"
        f"Date: {post_contents['post_date']}\n"
        "Author: adler\n"
        "Tags: old\n"
        f"Slug: {post_contents['post_slug']}\n"
        "Status: published\n\n"
    )
    body = info_message + post_contents['post_content']

    # The output directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(new_post_file), exist_ok=True)

    # Explicit UTF-8 keeps the accented text independent of the platform's
    # default encoding; the context manager closes the file even if a write
    # fails.
    with open(new_post_file, 'w', encoding='utf-8') as new_post:
        new_post.write(header)
        new_post.write(body)


def replace_invalid_chars(content):
    """Repair known mis-encoded (mojibake) sequences in *content*.

    The replacements are plain substring substitutions applied one after
    another in the order listed, so each one operates on the output of the
    previous ones.

    Args:
        content: Text to clean up.

    Returns:
        The text with every listed sequence replaced.
    """
    # A tuple of pairs makes the application order explicit. Order matters:
    # a longer sequence must be listed before any shorter one it contains.
    replacements = (
        ('Âº', 'º'),
        ('a%c2%ba', ''),
        ('Ã£', 'ã'),
        ('Ã¡', 'á'),
        ('Ãª', 'ê'),
        ('Ã©', 'é'),
        ('Ã§', 'ç'),
        ('Ã³', 'ó'),
        ('Ãµ', 'õ'),
        # Fallback for any remaining 'Ã'. It has to come after the specific
        # two-character sequences above, otherwise it consumes their first
        # character and they can never match.
        ('Ã', 'í'),
        # Sequences left behind once the fallback has turned 'Ã' into 'í'.
        ('í¡', 'á'),
        ('í s', 'às'),
        ('í©', 'é'),
        ('í§', 'ç'),
        ('í³', 'ó'),
        ('í ', 'à'),
        ('íª', 'ê'),
        ('a%c2%a7ao', 'cao'),
        ('├®', 'é'),
        ('j├àtem', 'já tem'),
        ('n├úo', 'não'),
        ('├í', 'á'),
        ('come├ºou', 'começou'),
        ('cansa├ºo', 'cansaço'),
        ('efici├¬ncia', 'eficiência'),
        ('combina├º├úo', 'combinação'),
        ('op├º├úo', 'opção'),
        ('├º├ú', 'çã'),
        ('├ú', 'ã'),
        # NOTE(review): by analogy with the neighbouring entries this looks
        # like it should yield 'à' rather than 'a' - confirm before changing.
        ('├á', 'a'),
        ('├│', 'ó'),
        ('├à', 'á '),
        ('├º├Á', 'çõ'),
        ('├¡', 'í'),
        ('sa%c2%a9rie', 'serie'),
        ('íº', 'ú'),
    )

    for broken, fixed in replacements:
        content = content.replace(broken, fixed)

    return content


def extract_data():
    """Parse the currently open HTML post into a dict of post fields.

    Takes no arguments; it reads two module-level names that the caller must
    set before calling: ``content_file`` (an open, readable file object with
    the HTML) and ``file`` (the file name the slug is derived from).

    Returns:
        A dict with the keys ``post_title``, ``post_date``, ``post_content``
        and ``post_slug``. Title, content and slug are passed through
        ``replace_invalid_chars``; the date is returned as found.

    Raises:
        AttributeError: If an expected element (``html``/``body``/``h1`` or
            the first ``<p>`` of the post container) is missing.
        ValueError: If the page has fewer than two ``<div class="span12">``
            elements, or the extracted date is empty.
    """
    content = content_file.read()
    soup = BeautifulSoup(content, "html.parser")
    post_title = soup.html.body.h1.text

    # The second <div class="span12"> is used as the post container. Report
    # its absence as ValueError, an exception type the caller already handles,
    # instead of letting an IndexError escape.
    span_divs = soup.find_all('div', {'class': 'span12'})
    if len(span_divs) < 2:
        raise ValueError("Post content container not found")
    post_content_data = span_divs[1]

    post_date = post_content_data.find('p').text.replace('Post date:', '').strip()
    if not post_date:
        raise ValueError("Post date not found")

    # Strip the date line and the "- hh:mm:ss" time stamp out of the body text.
    post_content = post_content_data.text.strip()
    post_content = re.sub(r'Post date: (\d+/\d+/\d+)', '', post_content)
    post_content = re.sub(r'- (\d+:\d+:\d+)', '', post_content)
    post_slug = file.replace('.html', '')

    return {
        'post_title': replace_invalid_chars(post_title),
        'post_date': post_date,
        'post_content': replace_invalid_chars(post_content),
        'post_slug': replace_invalid_chars(post_slug),
    }


# Walk the export tree, convert every file outside "images" directories and
# print a summary. ``file`` and ``content_file`` must keep these exact names:
# extract_data() reads them as module-level globals.
for subdir, dirs, files in os.walk(content_root):
    if 'images' in subdir:
        continue
    for file in files:
        total_files += 1
        file_path = os.path.join(subdir, file)
        with open(file_path, 'r') as content_file:
            try:
                post_data = extract_data()
            except (AttributeError, IndexError, ValueError) as exc:
                total_errors += 1
                error_message = f'Error processing {file_path} with error: {exc}'
                print(error_message)
            else:
                # Only write a post when extraction succeeded; otherwise
                # post_data is unset or still holds the previous file's data.
                convert_to_new_post(post_data)

print(f'Total files: {total_files}')
print(f'Total errors: {total_errors}')
	import os
	import re
	from bs4 import BeautifulSoup

	# Root of the directory tree that is walked for exported posts.
	content_root = 'public'
	# Running tallies; both are printed in the summary at the end of the run.
	total_files = total_errors = 0


	def convert_to_new_post(post_contents):
	    """Write one extracted post to ``converted/<slug>.md``.

	    The file starts with a metadata header (Title, Date, Author, Tags, Slug,
	    Status), followed by a blank line, a fixed notice saying the post is old,
	    and finally the post body.

	    Args:
	        post_contents: Mapping with the keys ``post_title``, ``post_date``,
	            ``post_content`` and ``post_slug``.

	    Raises:
	        KeyError: If one of the expected keys is missing.
	        OSError: If the target file cannot be created or written.
	    """
	    info_message = (
	        'AVISO: _Este post é muito antigo e seu conteúdo provavelmente está defasado, '
	        'permanecendo no meu blog apenas por motivos históricos._\n\n'
	    )
	    new_post_file = f"converted/{post_contents['post_slug']}.md"

	    # Build the whole header up front so a missing key fails before the
	    # output file is created or truncated.
	    header = (
	        f"Title: {post_contents['post_title']}\n"
	        f"Date: {post_contents['post_date']}\n"
	        "Author: adler\n"
	        "Tags: old\n"
	        f"Slug: {post_contents['post_slug']}\n"
	        "Status: published\n\n"
	    )
	    body = info_message + post_contents['post_content']

	    # The output directory may not exist on a fresh checkout.
	    os.makedirs(os.path.dirname(new_post_file), exist_ok=True)

	    # Explicit UTF-8 keeps the accented text independent of the platform's
	    # default encoding; the context manager closes the file even if a write
	    # fails.
	    with open(new_post_file, 'w', encoding='utf-8') as new_post:
	        new_post.write(header)
	        new_post.write(body)


	def replace_invalid_chars(content):
	    """Repair known mis-encoded (mojibake) sequences in *content*.

	    The replacements are plain substring substitutions applied one after
	    another in the order listed, so each one operates on the output of the
	    previous ones.

	    Args:
	        content: Text to clean up.

	    Returns:
	        The text with every listed sequence replaced.
	    """
	    # A tuple of pairs makes the application order explicit. Order matters:
	    # a longer sequence must be listed before any shorter one it contains.
	    replacements = (
	        ('Âº', 'º'),
	        ('a%c2%ba', ''),
	        ('Ã£', 'ã'),
	        ('Ã¡', 'á'),
	        ('Ãª', 'ê'),
	        ('Ã©', 'é'),
	        ('Ã§', 'ç'),
	        ('Ã³', 'ó'),
	        ('Ãµ', 'õ'),
	        # Fallback for any remaining 'Ã'. It has to come after the specific
	        # two-character sequences above, otherwise it consumes their first
	        # character and they can never match.
	        ('Ã', 'í'),
	        # Sequences left behind once the fallback has turned 'Ã' into 'í'.
	        ('í¡', 'á'),
	        ('í s', 'às'),
	        ('í©', 'é'),
	        ('í§', 'ç'),
	        ('í³', 'ó'),
	        ('í ', 'à'),
	        ('íª', 'ê'),
	        ('a%c2%a7ao', 'cao'),
	        ('├®', 'é'),
	        ('j├àtem', 'já tem'),
	        ('n├úo', 'não'),
	        ('├í', 'á'),
	        ('come├ºou', 'começou'),
	        ('cansa├ºo', 'cansaço'),
	        ('efici├¬ncia', 'eficiência'),
	        ('combina├º├úo', 'combinação'),
	        ('op├º├úo', 'opção'),
	        ('├º├ú', 'çã'),
	        ('├ú', 'ã'),
	        # NOTE(review): by analogy with the neighbouring entries this looks
	        # like it should yield 'à' rather than 'a' - confirm before changing.
	        ('├á', 'a'),
	        ('├│', 'ó'),
	        ('├à', 'á '),
	        ('├º├Á', 'çõ'),
	        ('├¡', 'í'),
	        ('sa%c2%a9rie', 'serie'),
	        ('íº', 'ú'),
	    )

	    for broken, fixed in replacements:
	        content = content.replace(broken, fixed)

	    return content


	def extract_data():
	    """Parse the currently open HTML post into a dict of post fields.

	    Takes no arguments; it reads two module-level names that the caller must
	    set before calling: ``content_file`` (an open, readable file object with
	    the HTML) and ``file`` (the file name the slug is derived from).

	    Returns:
	        A dict with the keys ``post_title``, ``post_date``, ``post_content``
	        and ``post_slug``. Title, content and slug are passed through
	        ``replace_invalid_chars``; the date is returned as found.

	    Raises:
	        AttributeError: If an expected element (``html``/``body``/``h1`` or
	            the first ``<p>`` of the post container) is missing.
	        ValueError: If the page has fewer than two ``<div class="span12">``
	            elements, or the extracted date is empty.
	    """
	    content = content_file.read()
	    soup = BeautifulSoup(content, "html.parser")
	    post_title = soup.html.body.h1.text

	    # The second <div class="span12"> is used as the post container. Report
	    # its absence as ValueError, an exception type the caller already handles,
	    # instead of letting an IndexError escape.
	    span_divs = soup.find_all('div', {'class': 'span12'})
	    if len(span_divs) < 2:
	        raise ValueError("Post content container not found")
	    post_content_data = span_divs[1]

	    post_date = post_content_data.find('p').text.replace('Post date:', '').strip()
	    if not post_date:
	        raise ValueError("Post date not found")

	    # Strip the date line and the "- hh:mm:ss" time stamp out of the body text.
	    post_content = post_content_data.text.strip()
	    post_content = re.sub(r'Post date: (\d+/\d+/\d+)', '', post_content)
	    post_content = re.sub(r'- (\d+:\d+:\d+)', '', post_content)
	    post_slug = file.replace('.html', '')

	    return {
	        'post_title': replace_invalid_chars(post_title),
	        'post_date': post_date,
	        'post_content': replace_invalid_chars(post_content),
	        'post_slug': replace_invalid_chars(post_slug),
	    }


	# Walk the export tree, convert every file outside "images" directories and
	# print a summary. ``file`` and ``content_file`` must keep these exact names:
	# extract_data() reads them as module-level globals.
	for subdir, dirs, files in os.walk(content_root):
	    if 'images' in subdir:
	        continue
	    for file in files:
	        total_files += 1
	        file_path = os.path.join(subdir, file)
	        with open(file_path, 'r') as content_file:
	            try:
	                post_data = extract_data()
	            except (AttributeError, IndexError, ValueError) as exc:
	                total_errors += 1
	                error_message = f'Error processing {file_path} with error: {exc}'
	                print(error_message)
	            else:
	                # Only write a post when extraction succeeded; otherwise
	                # post_data is unset or still holds the previous file's data.
	                convert_to_new_post(post_data)

	print(f'Total files: {total_files}')
	print(f'Total errors: {total_errors}')