Created
October 11, 2019 22:25
-
-
Save phofman/4774f2a5d00d9ee3eb82967a2ef8dc65 to your computer and use it in GitHub Desktop.
Porting posts from MiniBlog to Hugo (markdown)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
from os import listdir | |
from os import mkdir | |
from os import makedirs | |
from os import path | |
import shutil | |
import xml.etree.ElementTree as xmltree | |
def get_image_new_location(date, image_name, alt):
    """Build the site-relative path under which a migrated image is stored.

    The result has the form ``/<year>/<month>/<name><ext>``: year and month
    are sliced from the leading ``YYYY-MM`` of *date*, the extension is the
    lower-cased extension of *image_name* and the name is derived from *alt*.

    Args:
        date: Date string whose first seven characters are ``YYYY-MM``.
        image_name: Original file name of the image.
        alt: Alternative text of the image; used as the new base name.

    Returns:
        The new location as a string starting with a slash.
    """
    year = date[:4]
    month = date[5:7]
    stem, ext = path.splitext(image_name.lower())

    # Spaces and dots become dashes (as before). Path separators and
    # characters that are invalid in file names on common platforms are
    # mapped the same way, so an alt text can never create extra directory
    # levels or an unwritable file name.
    unsafe = str.maketrans(dict.fromkeys(' ./\\<>:"|?*', "-"))
    new_name = alt.lower().translate(unsafe)

    # An empty alt would give every such image of a month the same hidden
    # ".ext" name; fall back to the original file's base name instead.
    if not new_name.strip("-"):
        new_name = path.basename(stem).translate(unsafe)

    return f"/{year}/{month}/{new_name}{ext}"
def render_image_md(image_name, alt):
    """Return a Markdown image whose target is a Hugo ``media`` shortcode.

    Args:
        image_name: Value placed, quoted, inside the ``media`` shortcode.
        alt: Alternative text placed between the square brackets.
    """
    shortcode = '{{{{< media "{}" >}}}}'.format(image_name)
    return "![{}]({})".format(alt, shortcode)
def render_url_md(url, alt):
    """Render *url* as Markdown: an image for picture URLs, a link otherwise.

    A URL counts as a picture when it ends with ``.png``, ``.jpg`` or
    ``.gif``. The comparison ignores case, so ``photo.JPG`` is embedded as an
    image as well instead of degrading to a plain text link.

    Args:
        url: Target address, emitted unchanged.
        alt: Image alternative text or link text.

    Returns:
        ``![alt](url)`` for picture URLs, ``[alt](url)`` for everything else.
    """
    if url.lower().endswith((".png", ".jpg", ".gif")):
        return f"![{alt}]({url})"
    return f"[{alt}]({url})"
def render_post_md(url, alt):
    """Return a Markdown link whose target is a Hugo ``ref`` shortcode.

    Args:
        url: Value placed, quoted, inside the ``ref`` shortcode.
        alt: Link text placed between the square brackets.
    """
    shortcode = '{{{{< ref "{}" >}}}}'.format(url)
    return "[{}]({})".format(alt, shortcode)
def render_image(content, url, alt, tag_start, tag_end, date, images):
    """Replace ``content[tag_start:tag_end]`` with Markdown for *url*.

    Three kinds of target are distinguished, in this order:

    1. A picture (``.png``/``.jpg``/``.gif``, compared case-insensitively)
       below ``/blog/posts/files/``: it is given a new location, recorded in
       *images* and rendered through the ``media`` shortcode.
    2. A URL containing ``/blog/post/``: rendered as a ``ref`` shortcode link
       to ``<rest of the URL>.md``.
    3. Anything else: rendered as a plain Markdown image or link.

    Args:
        content: Full text being converted.
        url: Target taken from the HTML tag.
        alt: Alternative/link text taken from the HTML tag.
        tag_start: Index of the first character of the tag being replaced.
        tag_end: Index just past the last character of the tag being replaced.
        date: Date string passed on to ``get_image_new_location``.
        images: List that receives a ``{"name", "org_name", "alt"}`` dict for
            every migrated picture (mutated in place).

    Returns:
        The content with the tag replaced by its Markdown form.
    """
    before = content[:tag_start]
    after = content[tag_end:]

    files_marker = "/blog/posts/files/"
    # The extension check ignores case because get_image_new_location
    # lower-cases the extension anyway; "shot.PNG" must be migrated too.
    if files_marker in url and url.lower().endswith((".png", ".jpg", ".gif")):
        image_name = url[url.index(files_marker) + len(files_marker):]
        updated_image_name = get_image_new_location(date, image_name, alt)
        images.append({"name": updated_image_name, "org_name": image_name, "alt": alt})
        return before + render_image_md(updated_image_name, alt) + after

    post_marker = "/blog/post/"
    if post_marker in url:
        post_name = url[url.index(post_marker) + len(post_marker):] + ".md"
        return before + render_post_md(post_name, alt) + after

    return before + render_url_md(url, alt) + after
def update_a_href(content, date, images):
    """Convert the first convertible ``<a href="...">...</a>`` to Markdown.

    Anchors that cannot be converted (no ``href``, unterminated attribute,
    no ``>`` after the ``href``) are skipped and left untouched, and the scan
    continues with the next anchor, so one odd element does not stop the
    conversion of the links that follow it.

    Args:
        content: Text to scan.
        date: Date string passed on to ``render_image``.
        images: List passed on to ``render_image`` (may be appended to).

    Returns:
        ``(new_content, True)`` when an anchor was replaced, otherwise
        ``(content, False)`` with the content unchanged.
    """
    search_from = 0
    while True:
        a_start = content.find("<a", search_from)
        if a_start < 0:
            return content, False
        # If this candidate is rejected, resume right after its "<a".
        search_from = a_start + 2

        # "<a" has to be the complete tag name; otherwise this is some other
        # element such as <abbr> or <article>.
        if content[a_start + 2:a_start + 3] not in (" ", "\t", "\n", "\r", ">"):
            continue

        a_marker = "</a>"
        a_end = content.find(a_marker, a_start)
        if a_end < 0:
            a_marker = "</ a>"
            a_end = content.find(a_marker, a_start)
        if a_end < 0:
            # No closing tag after this anchor means there is none after any
            # later anchor either.
            return content, False

        href_start = content.find("href=\"", a_start, a_end)
        if href_start < 0:
            continue
        href_start += 6
        href_end = content.find("\"", href_start, a_end)
        if href_end < 0:
            continue
        alt_start = content.find(">", href_end, a_end)
        if alt_start < 0:
            continue
        alt_start += 1

        uri = content[href_start:href_end]
        alt = content[alt_start:a_end]
        a_end += len(a_marker)

        # A linked picture: use the picture's alt text instead of its markup.
        if "<img " in alt:
            img_alt_start = alt.find("alt=\"")
            if img_alt_start >= 0:
                img_alt_start += 5
                img_alt_end = alt.find("\"", img_alt_start)
                # Without a closing quote a slice would silently drop the
                # last character, so the text is kept as it is.
                if img_alt_end >= 0:
                    alt = alt[img_alt_start:img_alt_end]

        return render_image(content, uri, alt, a_start, a_end, date, images), True
def update_img_src(content, date, images):
    """Convert the first convertible ``<img ...>`` tag to Markdown.

    A tag is convertible when it carries both a terminated ``alt="..."`` and
    a terminated ``src="..."`` attribute. Tags that do not are skipped and
    left untouched, and the scan continues with the next ``<img``, so one
    picture without alt text does not stop the conversion of later pictures.

    Args:
        content: Text to scan.
        date: Date string passed on to ``render_image``.
        images: List passed on to ``render_image`` (may be appended to).

    Returns:
        ``(new_content, True)`` when a tag was replaced, otherwise
        ``(content, False)`` with the content unchanged.
    """
    search_from = 0
    while True:
        img_start = content.find("<img ", search_from)
        if img_start < 0:
            return content, False
        # If this tag is rejected, resume right after its "<img ".
        search_from = img_start + 5

        # ">" also terminates a self-closing "/>" tag, so one lookup suffices.
        img_end = content.find(">", img_start)
        if img_end < 0:
            # An unterminated tag: no later tag can be terminated either.
            return content, False
        img_end += 1

        alt_start = content.find("alt=\"", img_start, img_end)
        src_start = content.find("src=\"", img_start, img_end)
        if alt_start < 0 or src_start < 0:
            continue
        alt_start += 5
        src_start += 5

        alt_end = content.find("\"", alt_start, img_end)
        src_end = content.find("\"", src_start, img_end)
        if alt_end < 0 or src_end < 0:
            continue

        alt = content[alt_start:alt_end]
        uri = content[src_start:src_end]
        return render_image(content, uri, alt, img_start, img_end, date, images), True
def convert_content(content, date):
    """Convert the HTML body of a post to Markdown.

    The conversion is a fixed sequence of literal text substitutions followed
    by the rewriting of ``<a>`` and ``<img>`` elements.

    Args:
        content: HTML text of the post.
        date: Date string passed on to the link and image converters.

    Returns:
        A ``(markdown, images)`` tuple: the converted text with lines joined
        by ``\\r\\n`` and trailing whitespace removed from each line, and the
        list of image records collected while rewriting links and images.
    """
    # Applied strictly in this order. Typographic quotes and the
    # non-breaking space are written as escapes on purpose: as literal
    # characters they are easy to corrupt, and a non-breaking space turned
    # into a plain space would strip every space from the post.
    replacements = (
        # remove paragraphs:
        ("<p>", ""), ("</p>", "\n\n"),
        # formatting:
        ("<strong>", "**"), ("</strong>", "**"),
        ("\u201c", "\""), ("\u201d", "\""),
        ("<em>", "_"), ("</em>", "_"),
        ("<pre class=\"brush: shell\">", "```shell\n"),
        ("<pre class=\"brush:shell\">", "```shell\n"),
        ("<pre class=\"brush: bash\">", "```shell\n"),
        ("<pre class=\"brush:bash;\">", "```shell\n"),
        ("<pre class=\"brush: csharp\">", "```csharp\n"),
        ("<pre class=\"brush:csharp\">", "```csharp\n"),
        ("<pre class=\"brush: csharp;\">", "```csharp\n"),
        ("<pre class=\"brush: cpp\">", "```cpp\n"),
        ("</pre>", "\n```\n"),
        ("<h1>", "\n# "), ("</h1>", "\n\n"),
        ("<h2>", "\n## "), ("</h2>", "\n\n"),
        ("\u00a0", ""),
        # lists:
        ("<ul>", ""), ("<ol>", ""), ("</ul>", ""), ("</ol>", ""), ("</li>", ""),
        ("<li>", "\n* "),
        # unify line endings:
        ("<br>", "\n"), ("<br/>", "\n"), ("<br />", "\n"), ("\r", ""),
    )

    content = content.strip()
    for old, new in replacements:
        content = content.replace(old, new)
    content = content.strip()

    # links:
    images = []
    while True:
        converted, updated = update_a_href(content, date, images)
        if not updated:
            break
        content = converted

    # Images are handled after links so that linked pictures have already
    # been consumed together with their anchor.
    content = content.replace("</img>", "")
    while True:
        converted, updated = update_img_src(content, date, images)
        if not updated:
            break
        content = converted

    lines = [line.rstrip() for line in content.split("\n")]
    return ("\r\n".join(lines), images)
def convert_date(date):
    """Return *date* with every space replaced by ``T`` and ``+02:00`` appended."""
    pieces = date.split(" ")
    return "T".join(pieces) + "+02:00"
def copy_images(output_path, files_path, images):
    """Copy every recorded image to its new place below ``<output_path>/images``.

    Args:
        output_path: Root of the generated output.
        files_path: Directory holding the original image files.
        images: Iterable of dicts with an ``"org_name"`` key (file name inside
            *files_path*) and a ``"name"`` key (new location, appended
            directly to ``"images"``).

    Missing target directories are created. A target file that already exists
    is reported on standard output and left as it is.
    """
    for entry in images:
        source = path.join(files_path, entry["org_name"])
        target = path.join(output_path, "images" + entry["name"])

        target_dir = path.dirname(target)
        if not path.exists(target_dir):
            makedirs(target_dir)

        if not path.exists(target):
            shutil.copy(source, target)
            continue
        print(f"File exists: {target}, ignoring!")
def save_result(output_path, files_path, slug, title, date, images, tags, content):
    """Write one converted post as ``<output_path>/<slug>.md`` and copy its images.

    The file starts with a front matter block delimited by ``---`` lines
    (title, slug, date, ``draft: false`` and, when present, tags) followed by
    the post content. All line endings are written as ``\\r\\n``.

    Args:
        output_path: Directory receiving the Markdown file; created if missing.
        files_path: Directory holding the original image files.
        slug: Post slug; also the base name of the written file.
        title: Post title.
        date: Publication date written to the front matter as given.
        images: Image records passed on to ``copy_images``.
        tags: List of tags; omitted from the front matter when empty.
        content: Converted Markdown body.
    """
    makedirs(output_path, exist_ok=True)
    copy_images(output_path, files_path, images)

    # The title is emitted inside double quotes, so backslashes and quotes in
    # it have to be escaped to keep the front matter well-formed.
    safe_title = str(title).replace("\\", "\\\\").replace("\"", "\\\"")

    # newline="" stops the text layer from translating the explicit "\r\n"
    # sequences (which would yield "\r\r\n" on Windows); UTF-8 is set
    # explicitly so the output does not depend on the platform default.
    # The with-block closes the file even if a write fails.
    target = path.join(output_path, slug + ".md")
    with open(target, "w", encoding="utf-8", newline="") as file:
        file.write("---\r\n")
        file.write(f"title: \"{safe_title}\"\r\n")
        file.write(f"slug: \"{slug}\"\r\n")
        file.write(f"date: {date}\r\n")
        file.write("draft: false\r\n")
        if len(tags) > 0:
            file.write("\r\n")
            file.write(f"tags: {tags}\r\n")
        file.write("---\r\n\r\n")
        file.write(content)
        file.write("\r\n")
def process_post(post_path, post_file_name, output_path, files_path):
    """Convert a single post XML file and save the result.

    Reads ``title``, ``slug``, ``pubDate``, ``content`` and the
    ``categories/category`` elements from the XML document at *post_path*,
    prints a short summary and hands everything to ``save_result``.

    Args:
        post_path: Path of the XML file to read.
        post_file_name: File name of the post (not used by this function).
        output_path: Directory receiving the converted post.
        files_path: Directory holding the original image files.
    """
    root = xmltree.parse(post_path).getroot()

    title = root.find("title").text
    slug = root.find("slug").text
    date = convert_date(root.find("pubDate").text)
    content, images = convert_content(root.find("content").text, date)
    tags = [category.text for category in root.findall("./categories/category")]

    print(f" * {post_path}")
    summary = (("title", title), ("slug", slug), ("date", date), ("tags", tags))
    for label, value in summary:
        print(f" {label}: {value}")

    save_result(output_path, files_path, slug, title, date, images, tags, content)
def main():
    """Convert every post found in the ``posts`` directory next to this script.

    Posts are read from ``posts``, original images from ``files`` and results
    are written to ``output``; all three are resolved relative to the
    directory containing this file. Only regular files in ``posts`` are
    processed.
    """
    current_path = path.dirname(path.realpath(__file__))
    files_path = path.join(current_path, "files")
    posts_path = path.join(current_path, "posts")
    output_path = path.join(current_path, "output")

    print(f"Converting old posts into Hugo format (source: '{posts_path}')...")

    # listdir returns entries in arbitrary order; sorting makes runs
    # reproducible, including which post wins when two images map to the
    # same target name.
    posts = sorted(f for f in listdir(posts_path) if path.isfile(path.join(posts_path, f)))

    count = 0
    for p in posts:
        process_post(path.join(posts_path, p), p, output_path, files_path)
        count += 1
    print(f"[DONE:{count}]")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment