Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Convert Baidu blog to Hugo pages.
# coding: utf-8
from datetime import datetime
import re
import hashlib
from pathlib import Path
from bs4 import BeautifulSoup
import html2text
import frontmatter
from pytz import timezone
def read_blog_index(blog_index_path):
    """Convert every post linked from a blog index page into a Markdown file.

    The index HTML is parsed for ``<a>`` elements. For each one, the ``href``
    is resolved relative to the index file's directory, the linked page is
    converted by ``read_blog_page``, and the result is written under
    ``./dist/output`` as ``<YYYY-MM-DD>-<title>.md`` with a front matter
    header holding the title and the release date.

    The link text is expected to start with an 18-character timestamp in the
    form ``%Y年%m月%d日 %H时%M分``; the title is taken from character 22
    onward (the four characters in between are ignored). The timestamp is
    interpreted as Asia/Shanghai local time.

    Anchors that have no ``href`` or no plain-string text are skipped, since
    no post can be derived from them.

    Args:
        blog_index_path: Path to the index HTML file (UTF-8 encoded).

    Raises:
        ValueError: If a link's text does not start with the expected
            timestamp format.
        OSError: If the index or a linked page cannot be read, or a post
            cannot be written even under the hashed fallback name.
    """
    time_token_length = 18
    title_offset = 22

    index_path = Path(blog_index_path)
    with open(index_path, encoding='utf-8') as f:
        bs = BeautifulSoup(f, "html5lib")

    # Loop-invariant values are built once, up front.
    shanghai = timezone('Asia/Shanghai')
    output_dir = Path('./dist/output')
    # Without this, every write (including the hashed-name fallback) fails
    # with OSError when the output directory does not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    for link in bs.find_all('a'):
        href = link.get('href')
        link_string = link.string
        if not href or link_string is None:
            # Not a usable post link: nothing to resolve or no text to parse.
            continue

        blog_path = Path(index_path.parent, href)
        print(blog_path)

        release_time = datetime.strptime(
            link_string[:time_token_length], "%Y年%m月%d日 %H时%M分"
        )
        release_time = shanghai.localize(release_time)

        # Collapse any run of whitespace inside the title to a single space.
        title = re.sub(r'\s+', ' ', link_string[title_offset:].strip()).strip()

        _, content = read_blog_page(blog_path)
        header = get_front_matter(
            title=title,
            date=release_time.strftime('%Y-%m-%dT%H:%M:%S%z')
        )
        print(header)

        date_prefix = release_time.strftime("%Y-%m-%d")
        output_file_path = output_dir / "{time}-{title}.md".format(
            time=date_prefix,
            title=title
        )
        print(output_file_path)
        try:
            write_post(output_file_path, header, content)
        except OSError:
            # The title could not be used as a file name (for example it is
            # too long or contains a path separator); fall back to a hash of
            # the title so the post is still written.
            output_file_path = output_dir / "{time}-{title}.md".format(
                time=date_prefix,
                title=hashlib.sha224(title.encode('utf-8')).hexdigest()
            )
            write_post(output_file_path, header, content)
def read_blog_page(blog_page_path):
    """Read a blog page and return its title and its body converted by html2text.

    Args:
        blog_page_path: Path to the page's HTML file (UTF-8 encoded).

    Returns:
        A ``(title, text)`` tuple. ``title`` is the string of the page's
        ``<title>`` element, or ``None`` when the page has no such element.
        ``text`` is the output of ``html2text.html2text`` for the whole page.

    Raises:
        OSError: If the file cannot be read.
    """
    # Read the file once and reuse the markup for both the title lookup and
    # the html2text conversion.
    with open(Path(blog_page_path), encoding='utf-8') as f:
        page_html = f.read()

    bs = BeautifulSoup(page_html, "html5lib")
    title_tag = bs.title
    # A page without <title> would otherwise raise AttributeError here.
    title = title_tag.string if title_tag is not None else None

    text = html2text.html2text(page_html)
    return title, text
def get_front_matter(**kwargs):
    """Serialize the given keyword arguments as front matter.

    A ``frontmatter.Post`` with empty content is created, every keyword
    argument is stored on it as a metadata entry, and the result of
    ``frontmatter.dumps`` for that post is returned.
    """
    post = frontmatter.Post(content='')
    for field, value in kwargs.items():
        post[field] = value
    return frontmatter.dumps(post)
def write_post(path, header, content):
    """Write a post to ``path`` as UTF-8: header, a newline, then content.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(path, mode='w', encoding='utf-8') as out:
        # Pieces are written in order: header, separator line break, body.
        out.writelines((header, '\n', content))
if __name__ == "__main__":
    import sys

    # Use the first command-line argument as the blog index path when one is
    # supplied; otherwise keep the original hard-coded placeholder value.
    read_blog_index(sys.argv[1] if len(sys.argv) > 1 else "some path")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.