Created
May 4, 2019 12:18
-
-
Save perillaroc/2e3e21e79249da0f81a88420c9519730 to your computer and use it in GitHub Desktop.
Convert Baidu blog to Hugo pages.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding: utf-8 | |
| from datetime import datetime | |
| import re | |
| import hashlib | |
| from pathlib import Path | |
| from bs4 import BeautifulSoup | |
| import html2text | |
| import frontmatter | |
| from pytz import timezone | |
def read_blog_index(blog_index_path):
    """Convert every post linked from a blog index page into a Markdown file.

    The index HTML is parsed for ``<a>`` elements. The text of each anchor
    is expected to start with an 18-character timestamp formatted as
    ``%Y年%m月%d日 %H时%M分``; the post title is taken from character 22
    onwards. The linked page is converted by ``read_blog_page`` and written
    under ``./dist/output`` as ``<YYYY-MM-DD>-<title>.md``, preceded by a
    front-matter header holding the title and the release time localized to
    Asia/Shanghai.

    Anchors that have no ``href`` attribute, or whose content is not a
    single plain string, are reported and skipped.

    Args:
        blog_index_path: Path to the index HTML file (UTF-8). Each ``href``
            is resolved relative to this file's parent directory.

    Raises:
        ValueError: If an anchor's text does not start with the expected
            timestamp.
        OSError: If the index or a post cannot be read, or a post cannot be
            written even under the hashed fallback file name.
    """
    index_path = Path(blog_index_path)
    output_dir = Path('./dist/output')
    # Create the destination up front: without it every write raises
    # FileNotFoundError (an OSError) and the hashed-name fallback fails too.
    output_dir.mkdir(parents=True, exist_ok=True)
    # Loop-invariant, so build the timezone only once.
    shanghai = timezone('Asia/Shanghai')

    with open(index_path, encoding='utf-8') as f:
        bs = BeautifulSoup(f, "html5lib")

    for link in bs.find_all('a'):
        href = link.get('href')
        link_string = link.string
        if href is None or link_string is None:
            # Not a usable post link (e.g. a named anchor or nested markup).
            print("skip link without href or plain text: {link}".format(link=link))
            continue

        blog_path = Path(index_path.parent, href)
        print(blog_path)

        # First 18 characters hold the timestamp; the title starts at 22.
        time_token = link_string[:18]
        release_time = datetime.strptime(time_token, "%Y年%m月%d日 %H时%M分")
        release_time = shanghai.localize(release_time)

        title = re.sub(r'\s+', ' ', link_string[22:].strip()).strip()

        _, content = read_blog_page(blog_path)
        header = get_front_matter(
            title=title,
            date=release_time.strftime('%Y-%m-%dT%H:%M:%S%z')
        )
        print(header)

        date_prefix = release_time.strftime("%Y-%m-%d")
        output_file_path = output_dir / "{time}-{title}.md".format(
            time=date_prefix,
            title=title
        )
        print(output_file_path)
        try:
            write_post(output_file_path, header, content)
        except OSError:
            # Presumably the title is not usable as a file name; retry with
            # a SHA-224 digest of the title in its place.
            output_file_path = output_dir / "{time}-{title}.md".format(
                time=date_prefix,
                title=hashlib.sha224(title.encode('utf-8')).hexdigest()
            )
            write_post(output_file_path, header, content)
def read_blog_page(blog_page_path):
    """Read a blog post HTML file and return its title and Markdown text.

    Args:
        blog_page_path: Path to the post's HTML file, read as UTF-8.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the string of the document's
        ``<title>`` element, or ``None`` when the page has no ``<title>``.
        ``text`` is the whole page converted with ``html2text.html2text``.
    """
    # Read the file once and reuse the markup for both the title lookup and
    # the Markdown conversion.
    with open(Path(blog_page_path), encoding='utf-8') as f:
        page_html = f.read()

    bs = BeautifulSoup(page_html, "html5lib")
    # A page without <title> yields bs.title == None; avoid AttributeError.
    title = bs.title.string if bs.title is not None else None
    text = html2text.html2text(page_html)
    return title, text
def get_front_matter(**kwargs):
    """Serialize the keyword arguments as a front-matter header string.

    Each keyword is stored as an entry on a ``frontmatter.Post`` created
    with empty content, and the result of ``frontmatter.dumps`` on that
    post is returned.
    """
    post = frontmatter.Post(content='')
    for name, value in kwargs.items():
        post[name] = value
    return frontmatter.dumps(post)
def write_post(path, header, content):
    """Write ``header``, a newline, then ``content`` to ``path`` as UTF-8.

    The file is opened in write mode, so existing content is replaced.
    """
    pieces = (header, '\n', content)
    with open(path, 'w', encoding='utf-8') as out:
        out.writelines(pieces)
# Script entry point: convert the blog whose index page is given below.
if __name__ == "__main__":
    # "some path" is a placeholder for the path of the blog index HTML file.
    read_blog_index("some path")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment