Skip to content

Instantly share code, notes, and snippets.

@phofman
Created October 11, 2019 22:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save phofman/4774f2a5d00d9ee3eb82967a2ef8dc65 to your computer and use it in GitHub Desktop.
Save phofman/4774f2a5d00d9ee3eb82967a2ef8dc65 to your computer and use it in GitHub Desktop.
Porting posts from MiniBlog to Hugo (markdown)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from os import listdir
from os import mkdir
from os import makedirs
from os import path
import shutil
import xml.etree.ElementTree as xmltree
def get_image_new_location(date, image_name, alt):
    """Build the site-relative location an imported image is stored under.

    The result has the form ``/<year>/<month>/<name><ext>`` where:

    * ``<year>`` and ``<month>`` are sliced from *date* (characters 0-3 and
      5-6, i.e. a string that starts with ``YYYY-MM``),
    * ``<ext>`` is the lower-cased extension of *image_name*,
    * ``<name>`` is *alt* lower-cased, with spaces and dots replaced by
      dashes.

    When *alt* is empty or only whitespace the lower-cased base name of
    *image_name* (without extension) is used instead, so images without an
    alt text do not all end up with the same, nameless file.
    """
    year = date[:4]
    month = date[5:7]
    stem, ext = path.splitext(path.basename(image_name.lower()))
    if alt.strip():
        new_name = alt.lower().replace(" ", "-").replace(".", "-")
    else:
        # No usable alt text: keep the original file name as the identifier.
        new_name = stem
    return f"/{year}/{month}/{new_name}{ext}"
def render_image_md(image_name, alt):
    """Return Markdown image markup pointing at a ``media`` shortcode.

    The output looks like ``![alt]({{< media "image_name" >}})``.
    """
    shortcode = f'{{{{< media "{image_name}" >}}}}'
    return f"![{alt}]({shortcode})"
def render_url_md(url, alt):
    """Return a Markdown link for *url*, or an image when it looks like one.

    URLs ending in ``.png``, ``.jpg`` or ``.gif`` (case-sensitive) are
    rendered as ``![alt](url)``; everything else as ``[alt](url)``.
    """
    is_image = url.endswith((".png", ".jpg", ".gif"))
    prefix = "!" if is_image else ""
    return f"{prefix}[{alt}]({url})"
def render_post_md(url, alt):
    """Return a Markdown link whose target is a ``ref`` shortcode.

    The output looks like ``[alt]({{< ref "url" >}})``.
    """
    shortcode = f'{{{{< ref "{url}" >}}}}'
    return f"[{alt}]({shortcode})"
def render_image(content, url, alt, tag_start, tag_end, date, images):
    """Replace ``content[tag_start:tag_end]`` with Markdown for *url*.

    Three cases, checked in this order:

    * *url* contains ``/blog/posts/files/`` and ends in ``.png``, ``.jpg``
      or ``.gif``: the image gets a new location, a record
      (``name``, ``org_name``, ``alt``) is appended to *images*, and a
      media-shortcode image is emitted;
    * *url* contains ``/blog/post/``: a ref-shortcode link to
      ``<rest of url>.md`` is emitted;
    * anything else: a plain Markdown link or image is emitted.

    Returns the updated content string.
    """
    files_marker = "/blog/posts/files/"
    post_marker = "/blog/post/"
    image_suffixes = (".png", ".jpg", ".gif")

    if files_marker in url and url.endswith(image_suffixes):
        # Everything after the first marker is the stored file name.
        image_name = url.partition(files_marker)[2]
        updated_image_name = get_image_new_location(date, image_name, alt)
        images.append({"name": updated_image_name, "org_name": image_name, "alt": alt})
        replacement = render_image_md(updated_image_name, alt)
    elif post_marker in url:
        post_name = url.partition(post_marker)[2] + ".md"
        replacement = render_post_md(post_name, alt)
    else:
        replacement = render_url_md(url, alt)

    return content[:tag_start] + replacement + content[tag_end:]
def update_a_href(content, date, images):
    """Convert the first convertible ``<a href="...">`` element to Markdown.

    The content is scanned left to right for an anchor element that has a
    double-quoted ``href`` attribute and a closing ``</a>`` (or ``</ a>``).
    That element is handed to ``render_image`` together with its link text;
    when the link text contains an ``<img `` tag with an ``alt="..."``
    attribute, the alt value is used as the text instead.

    Anchors that cannot be converted (no ``href``, unterminated attribute)
    are left untouched and skipped, so they do not prevent later links from
    being converted.

    Returns ``(new_content, True)`` when an anchor was replaced and
    ``(content, False)`` when nothing convertible is left.
    """
    href_marker = 'href="'
    alt_marker = 'alt="'
    search_from = 0

    while True:
        a_start = content.find("<a", search_from)
        if a_start < 0:
            return content, False

        # "<a" is also the prefix of <abbr>, <article>, <address>, ...;
        # a real anchor tag continues with whitespace or ">".
        following = content[a_start + 2:a_start + 3]
        if following != ">" and not following.isspace():
            search_from = a_start + 2
            continue

        # Use whichever closing spelling comes first after the opening tag.
        closings = []
        for marker in ("</a>", "</ a>"):
            position = content.find(marker, a_start)
            if position >= 0:
                closings.append((position, marker))
        if not closings:
            # Nothing after this point can be a complete anchor element.
            return content, False
        a_end, a_marker = min(closings)
        tag_end = a_end + len(a_marker)

        href_end = -1
        text_start = -1
        href_start = content.find(href_marker, a_start, a_end)
        if href_start >= 0:
            href_start += len(href_marker)
            href_end = content.find('"', href_start, a_end)
        if href_end >= 0:
            text_start = content.find(">", href_end, a_end)
        if text_start < 0:
            # Not convertible; keep it as is and look at the next anchor.
            search_from = tag_end
            continue
        text_start += 1

        uri = content[href_start:href_end]
        alt = content[text_start:a_end]

        if "<img " in alt:
            alt_start = alt.find(alt_marker)
            if alt_start >= 0:
                alt_start += len(alt_marker)
                alt_end = alt.find('"', alt_start)
                # Without a closing quote take the rest of the text rather
                # than slicing with -1 (which would drop the last character).
                alt = alt[alt_start:alt_end] if alt_end >= 0 else alt[alt_start:]

        return render_image(content, uri, alt, a_start, tag_end, date, images), True
def _read_quoted_attribute(text, marker, start, end):
    """Return the value that follows *marker* within ``text[start:end]``.

    *marker* is the attribute name including ``="``. Returns ``None`` when
    the marker or its closing double quote is not found in the range.
    """
    value_start = text.find(marker, start, end)
    if value_start < 0:
        return None
    value_start += len(marker)
    value_end = text.find('"', value_start, end)
    if value_end < 0:
        return None
    return text[value_start:value_end]


def update_img_src(content, date, images):
    """Convert the first convertible ``<img ...>`` tag to Markdown.

    A tag is convertible when it has both a double-quoted ``alt`` and a
    double-quoted ``src`` attribute before its closing ``>``. Such a tag is
    handed to ``render_image``. Tags missing either attribute are left
    untouched and skipped, so they do not prevent later images from being
    converted.

    Returns ``(new_content, True)`` when a tag was replaced and
    ``(content, False)`` when nothing convertible is left.
    """
    search_from = 0
    while True:
        img_start = content.find("<img ", search_from)
        if img_start < 0:
            return content, False

        closing = content.find(">", img_start)
        if closing < 0:
            # An unterminated tag; nothing after it can be complete either.
            return content, False
        # The ">" of a self-closing "/>" is found by the same search, so the
        # replaced range covers both "<img ...>" and "<img ... />".
        img_end = closing + 1

        alt = _read_quoted_attribute(content, 'alt="', img_start, img_end)
        uri = _read_quoted_attribute(content, 'src="', img_start, img_end)
        if alt is None or uri is None:
            search_from = img_end
            continue

        return render_image(content, uri, alt, img_start, img_end, date, images), True
def convert_content(content, date):
    """Turn a post's HTML body into Markdown text.

    A fixed, ordered list of literal HTML-to-Markdown substitutions is
    applied first; then every ``<a>`` element and every ``<img>`` tag is
    converted one at a time until the helpers report nothing left to change.

    Returns a tuple ``(markdown, images)`` where *markdown* has its lines
    right-stripped and joined with CRLF, and *images* is the list of image
    records collected while converting links and images.
    """
    # Order matters: the substitutions are applied one after another.
    substitutions = (
        # remove paragraphs:
        ("<p>", ""),
        ("</p>", "\n\n"),
        # formatting:
        ("<strong>", "**"),
        ("</strong>", "**"),
        ("“", "\""),
        ("”", "\""),
        ("<em>", "_"),
        ("</em>", "_"),
        ("<pre class=\"brush: shell\">", "```shell\n"),
        ("<pre class=\"brush:shell\">", "```shell\n"),
        ("<pre class=\"brush: bash\">", "```shell\n"),
        ("<pre class=\"brush:bash;\">", "```shell\n"),
        ("<pre class=\"brush: csharp\">", "```csharp\n"),
        ("<pre class=\"brush:csharp\">", "```csharp\n"),
        ("<pre class=\"brush: csharp;\">", "```csharp\n"),
        ("<pre class=\"brush: cpp\">", "```cpp\n"),
        ("</pre>", "\n```\n"),
        ("<h1>", "\n# "),
        ("</h1>", "\n\n"),
        ("<h2>", "\n## "),
        ("</h2>", "\n\n"),
        ("&nbsp;", ""),
        # lists:
        ("<ul>", ""),
        ("<ol>", ""),
        ("</ul>", ""),
        ("</ol>", ""),
        ("</li>", ""),
        ("<li>", "\n* "),
        # unify line endings:
        ("<br>", "\n"),
        ("<br/>", "\n"),
        ("<br />", "\n"),
        ("\r", ""),
    )

    text = content.strip()
    for old, new in substitutions:
        text = text.replace(old, new)
    text = text.strip()

    images = []

    # links: convert one anchor per pass until none is left
    changed = True
    while changed:
        text, changed = update_a_href(text, date, images)

    text = text.replace("</img>", "")

    # images: same one-per-pass scheme
    changed = True
    while changed:
        text, changed = update_img_src(text, date, images)

    trimmed_lines = [line.rstrip() for line in text.split("\n")]
    return ("\r\n".join(trimmed_lines), images)
def convert_date(date, utc_offset="+02:00"):
    """Turn a ``YYYY-MM-DD HH:MM:SS`` style date into a timestamp with offset.

    Every space in *date* is replaced by ``T`` and *utc_offset* is appended.
    The default offset ``+02:00`` is the value this script has always used;
    pass a different one for posts written in another time zone.
    """
    return date.replace(" ", "T") + utc_offset
def copy_images(output_path, files_path, images):
    """Copy every referenced image into ``<output_path>/images``.

    Each entry of *images* is a dict with ``org_name`` (file name below
    *files_path*) and ``name`` (new location, starting with ``/``, appended
    to ``images`` below *output_path*). Missing target directories are
    created. An already existing target file is never overwritten; a notice
    is printed instead.
    """
    for image in images:
        src_path = path.join(files_path, image["org_name"])
        dst_path = path.join(output_path, "images" + image["name"])
        if path.exists(dst_path):
            print(f"File exists: {dst_path}, ignoring!")
            continue
        # exist_ok avoids the check-then-create race and the extra lookup.
        makedirs(path.dirname(dst_path), exist_ok=True)
        shutil.copy(src_path, dst_path)
def save_result(output_path, files_path, slug, title, date, images, tags, content):
    """Write one converted post as ``<output_path>/<slug>.md``.

    The output directory is created when missing, the images referenced by
    the post are copied via ``copy_images``, and the Markdown file is
    written with a YAML front matter block (title, slug, date, draft flag
    and, when *tags* is non-empty, the tags) followed by *content*.

    The file is written as UTF-8 with explicit CRLF line endings on every
    platform.
    """
    makedirs(output_path, exist_ok=True)
    copy_images(output_path, files_path, images)

    # Backslashes and double quotes would otherwise break the quoted value.
    title_text = "" if title is None else str(title)
    escaped_title = title_text.replace("\\", "\\\\").replace("\"", "\\\"")

    front_matter = [
        "---",
        f"title: \"{escaped_title}\"",
        f"slug: \"{slug}\"",
        f"date: {date}",
        "draft: false",
    ]
    if tags:
        # The list is written using its Python representation.
        front_matter += ["", f"tags: {tags}"]
    front_matter += ["---", ""]

    # newline="" stops text mode from translating the explicit "\r\n"
    # sequences (which would become "\r\r\n" on Windows).
    target = path.join(output_path, slug + ".md")
    with open(target, "w", encoding="utf-8", newline="") as file:
        file.write("\r\n".join(front_matter) + "\r\n")
        file.write(content)
        file.write("\r\n")
def process_post(post_path, post_file_name, output_path, files_path):
    """Convert a single post XML file and save it as Markdown.

    Reads ``title``, ``slug``, ``pubDate``, ``content`` and the
    ``categories/category`` elements from the XML document at *post_path*,
    converts date and content, prints a short summary and hands everything
    to ``save_result``.

    *post_file_name* is accepted for interface compatibility and not used.

    Raises:
        ValueError: when the document has no (or an empty) ``slug`` or
            ``pubDate`` element; neither an output file name nor a date
            could be derived without them.
    """
    post = xmltree.parse(post_path).getroot()

    def required_text(tag):
        value = post.findtext(tag)
        if not value:
            raise ValueError(f"{post_path}: missing or empty <{tag}> element")
        return value

    # Title and content may legitimately be empty; treat them as "".
    title = post.findtext("title", default="")
    slug = required_text("slug")
    date = convert_date(required_text("pubDate"))
    (content, images) = convert_content(post.findtext("content", default=""), date)
    # Skip empty <category/> elements, whose text is None.
    tags = [c.text for c in post.findall("./categories/category") if c.text]

    print(f" * {post_path}")
    print(f" title: {title}")
    print(f" slug: {slug}")
    print(f" date: {date}")
    print(f" tags: {tags}")

    save_result(output_path, files_path, slug, title, date, images, tags, content)
def main():
    """Convert every file in ``posts`` next to this script.

    Source posts are read from ``posts``, referenced images from ``files``
    and results are written to ``output``; all three directories are
    resolved relative to the location of this script. Posts are processed
    in sorted file-name order so repeated runs behave the same way.
    """
    current_path = path.dirname(path.realpath(__file__))
    files_path = path.join(current_path, "files")
    posts_path = path.join(current_path, "posts")
    output_path = path.join(current_path, "output")

    print(f"Converting old posts into Hugo format (source: '{posts_path}')...")

    # listdir() returns entries in arbitrary order; sort for determinism.
    posts = sorted(f for f in listdir(posts_path) if path.isfile(path.join(posts_path, f)))
    for p in posts:
        process_post(path.join(posts_path, p), p, output_path, files_path)

    print(f"[DONE:{len(posts)}]")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment