phofman/convert.py

## convert.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from os import listdir
from os import mkdir
from os import makedirs
from os import path
import shutil
import xml.etree.ElementTree as xmltree

def get_image_new_location(date, image_name, alt):
    """Build the site-relative path under which a migrated image is stored.

    The result has the form ``/<year>/<month>/<name><ext>``: year and month
    are sliced out of ``date`` (characters 0-3 and 5-6), ``<name>`` is ``alt``
    lower-cased with spaces and dots turned into dashes, and ``<ext>`` is the
    lower-cased extension of ``image_name``.
    """
    extension = path.splitext(image_name.lower())[1]

    slug = alt.lower()
    for separator in (" ", "."):
        slug = slug.replace(separator, "-")

    # The leading empty element yields the leading slash.
    return "/".join(["", date[:4], date[5:7], slug + extension])

def render_image_md(image_name, alt):
    """Return Markdown for an image referenced through the ``media`` shortcode.

    The output looks like ``![<alt>]({{< media "<image_name>" >}})``.
    """
    shortcode = '{{{{< media "{}" >}}}}'.format(image_name)
    return "![{}]({})".format(alt, shortcode)

def render_url_md(url, alt):
    """Return a Markdown link to ``url``, or an inline image for picture URLs.

    URLs ending in ``.png``, ``.jpg`` or ``.gif`` (case-sensitive) are
    rendered as ``![alt](url)``; everything else as ``[alt](url)``.
    """
    is_picture = url.endswith((".png", ".jpg", ".gif"))
    prefix = "!" if is_picture else ""
    return "{}[{}]({})".format(prefix, alt, url)

def render_post_md(url, alt):
    """Return a Markdown link that targets another post via the ``ref`` shortcode.

    The output looks like ``[<alt>]({{< ref "<url>" >}})``.
    """
    shortcode = '{{{{< ref "{}" >}}}}'.format(url)
    return "[{}]({})".format(alt, shortcode)

def render_image(content, url, alt, tag_start, tag_end, date, images):
    """Replace ``content[tag_start:tag_end]`` with the Markdown form of ``url``.

    Three kinds of URL are distinguished, in this order:

    * a ``.png``/``.jpg``/``.gif`` file below ``/blog/posts/files/``: the
      image gets a new location (see ``get_image_new_location``), a record
      with keys ``name``, ``org_name`` and ``alt`` is appended to ``images``,
      and a ``media`` shortcode image is emitted;
    * anything containing ``/blog/post/``: emitted as a ``ref`` link to
      ``<rest of url>.md``;
    * everything else: emitted as a plain Markdown link or image.

    Returns the new content string; ``images`` is mutated in place.
    """
    files_marker = "/blog/posts/files/"
    post_marker = "/blog/post/"

    if files_marker in url and url.endswith((".png", ".jpg", ".gif")):
        original_name = url[url.index(files_marker) + len(files_marker):]
        relocated_name = get_image_new_location(date, original_name, alt)
        images.append({"name": relocated_name, "org_name": original_name, "alt": alt})
        replacement = render_image_md(relocated_name, alt)
    elif post_marker in url:
        target_post = url[url.index(post_marker) + len(post_marker):] + ".md"
        replacement = render_post_md(target_post, alt)
    else:
        replacement = render_url_md(url, alt)

    return content[:tag_start] + replacement + content[tag_end:]

def update_a_href(content, date, images):
    """Convert one ``<a href="...">...</a>`` element of ``content`` to Markdown.

    ``content`` is scanned left to right and the first anchor that has a
    double-quoted ``href`` and a closing ``</a>`` (or ``</ a>``) tag is
    rewritten through ``render_image``. Candidates that cannot be converted
    (no ``href``, an unterminated attribute value, or a tag such as
    ``<abbr>`` that merely starts with ``<a``) are skipped instead of ending
    the scan, so a single named anchor does not leave every later link
    unconverted.

    When the link wraps an ``<img ...>`` element, the image's ``alt`` text is
    used as the link text.

    Args:
        content: HTML fragment to process.
        date: Post date, forwarded to ``render_image``.
        images: List forwarded to ``render_image``, which may append to it.

    Returns:
        ``(new_content, True)`` when an anchor was replaced, otherwise
        ``(content, False)`` with ``content`` unchanged.
    """
    search_from = 0
    while True:
        a_start = content.find("<a", search_from)
        if a_start < 0:
            return content, False
        # Whatever happens to this candidate, the next scan resumes behind it.
        search_from = a_start + 2

        # "<a" is also the prefix of <abbr>, <address>, ...: require a real anchor tag.
        following = content[a_start + 2:a_start + 3]
        if following != ">" and not following.isspace():
            continue

        a_marker = "</a>"
        a_end = content.find(a_marker, a_start)
        if a_end < 0:
            a_marker = "</ a>"
            a_end = content.find(a_marker, a_start)
        if a_end < 0:
            # No closing tag from here on, so no later anchor can be complete either.
            return content, False

        href_start = content.find("href=\"", a_start, a_end)
        if href_start < 0:
            continue
        href_start += 6
        href_end = content.find("\"", href_start, a_end)
        if href_end < 0:
            # Unterminated attribute value; -1 must not be used as a slice bound.
            continue
        text_start = content.find(">", href_end, a_end)
        if text_start < 0:
            continue
        text_start += 1

        uri = content[href_start:href_end]
        alt = content[text_start:a_end]
        a_end += len(a_marker)

        if "<img " in alt:
            img_alt_start = alt.find("alt=\"")
            if img_alt_start >= 0:
                img_alt_start += 5
                img_alt_end = alt.find("\"", img_alt_start)
                if img_alt_end >= 0:
                    alt = alt[img_alt_start:img_alt_end]

        return render_image(content, uri, alt, a_start, a_end, date, images), True

def update_img_src(content, date, images):
    """Convert one ``<img ...>`` element of ``content`` to Markdown.

    ``content`` is scanned left to right and the first ``<img `` tag that
    carries both a double-quoted ``alt`` and a double-quoted ``src`` is
    rewritten through ``render_image``. Tags missing either attribute (or
    with an unterminated attribute value) are left untouched and skipped, so
    they no longer prevent later images from being converted.

    Args:
        content: HTML fragment to process.
        date: Post date, forwarded to ``render_image``.
        images: List forwarded to ``render_image``, which may append to it.

    Returns:
        ``(new_content, True)`` when an image tag was replaced, otherwise
        ``(content, False)`` with ``content`` unchanged.
    """
    search_from = 0
    while True:
        img_start = content.find("<img ", search_from)
        if img_start < 0:
            return content, False
        search_from = img_start + 5

        # The first ">" also terminates a self-closing "/>" tag, so one search suffices.
        img_end = content.find(">", img_start)
        if img_end < 0:
            # Unterminated tag: nothing behind it can be a complete <img> either.
            return content, False
        img_end += 1

        alt_start = content.find("alt=\"", img_start, img_end)
        src_start = content.find("src=\"", img_start, img_end)
        if alt_start < 0 or src_start < 0:
            continue
        alt_start += 5
        src_start += 5

        alt_end = content.find("\"", alt_start, img_end)
        src_end = content.find("\"", src_start, img_end)
        if alt_end < 0 or src_end < 0:
            # A missing closing quote would turn -1 into a bogus slice bound.
            continue

        alt = content[alt_start:alt_end]
        uri = content[src_start:src_end]
        return render_image(content, uri, alt, img_start, img_end, date, images), True

def convert_content(content, date):
    """Translate a post's HTML body into Markdown.

    Simple tags are handled by an ordered list of literal substitutions;
    anchors and images are then rewritten one at a time by ``update_a_href``
    and ``update_img_src`` until neither reports a change.

    Returns a tuple ``(markdown, images)`` where ``markdown`` uses CRLF line
    endings with trailing whitespace stripped from every line, and ``images``
    is the list filled in by the link/image rewriting.
    """
    shell_fence = "```shell\n"
    csharp_fence = "```csharp\n"

    # Applied strictly in this order.
    substitutions = (
        # paragraphs
        ("<p>", ""),
        ("</p>", "\n\n"),
        # inline formatting
        ("<strong>", "**"),
        ("</strong>", "**"),
        ("“", "\""),
        ("”", "\""),
        ("<em>", "_"),
        ("</em>", "_"),
        # code blocks
        ("<pre class=\"brush: shell\">", shell_fence),
        ("<pre class=\"brush:shell\">", shell_fence),
        ("<pre class=\"brush: bash\">", shell_fence),
        ("<pre class=\"brush:bash;\">", shell_fence),
        ("<pre class=\"brush: csharp\">", csharp_fence),
        ("<pre class=\"brush:csharp\">", csharp_fence),
        ("<pre class=\"brush: csharp;\">", csharp_fence),
        ("<pre class=\"brush: cpp\">", "```cpp\n"),
        ("</pre>", "\n```\n"),
        # headings
        ("<h1>", "\n# "),
        ("</h1>", "\n\n"),
        ("<h2>", "\n## "),
        ("</h2>", "\n\n"),
        ("&nbsp;", ""),
        # lists
        ("<ul>", ""),
        ("<ol>", ""),
        ("</ul>", ""),
        ("</ol>", ""),
        ("</li>", ""),
        ("<li>", "\n* "),
        # line breaks
        ("<br>", "\n"),
        ("<br/>", "\n"),
        ("<br />", "\n"),
        ("\r", ""),
    )

    text = content.strip()
    for old, new in substitutions:
        text = text.replace(old, new)
    text = text.strip()

    images = []

    # links: rewrite one anchor per call until nothing changes
    while True:
        candidate, changed = update_a_href(text, date, images)
        if not changed:
            break
        text = candidate

    text = text.replace("</img>", "")

    # images: same one-at-a-time scheme
    while True:
        candidate, changed = update_img_src(text, date, images)
        if not changed:
            break
        text = candidate

    trimmed = [line.rstrip() for line in text.split("\n")]
    return ("\r\n".join(trimmed), images)

def convert_date(date):
    """Turn a ``YYYY-MM-DD HH:MM:SS`` style stamp into a timestamp with offset.

    Every space is replaced by ``T`` and the fixed offset ``+02:00`` is
    appended.
    """
    stamp = "T".join(date.split(" "))
    return f"{stamp}+02:00"

def copy_images(output_path, files_path, images):
    """Copy every referenced image to its new location below ``output_path``.

    Each entry of ``images`` is a dict with at least ``org_name`` (file name
    relative to ``files_path``) and ``name`` (new site-relative path starting
    with ``/``). The destination is ``<output_path>/images<name>``; missing
    directories are created. An already existing destination is reported on
    stdout and left untouched.

    Raises:
        OSError: propagated from ``makedirs``/``shutil.copy``, e.g. when the
            source image does not exist.
    """
    for image in images:
        src_path = path.join(files_path, image["org_name"])
        dst_path = path.join(output_path, "images" + image["name"])

        # exist_ok replaces the check-then-create pair (and no longer shadows
        # the builtin ``dir`` with a local variable).
        makedirs(path.dirname(dst_path), exist_ok=True)

        if path.exists(dst_path):
            print(f"File exists: {dst_path}, ignoring!")
        else:
            shutil.copy(src_path, dst_path)
def save_result(output_path, files_path, slug, title, date, images, tags, content):
    """Write one converted post to ``<output_path>/<slug>.md``.

    The referenced images are copied first (``copy_images``), then the file
    is written with a YAML front matter block (``title``, ``slug``, ``date``,
    ``draft: false`` and, when non-empty, ``tags``) followed by ``content``.
    All line endings are CRLF.

    Args:
        output_path: Directory receiving the Markdown file; created if missing.
        files_path: Directory holding the original image files.
        slug: Post slug, also used as the file name.
        title: Post title.
        date: Already formatted date string.
        images: Image records forwarded to ``copy_images``.
        tags: List of tag strings; written using its ``str()`` form.
        content: Markdown body.
    """
    def quoted(value):
        # Inside a YAML double-quoted scalar, backslash and double quote must be escaped.
        return str(value).replace("\\", "\\\\").replace("\"", "\\\"")

    makedirs(output_path, exist_ok=True)
    copy_images(output_path, files_path, images)

    lines = [
        "---",
        f"title: \"{quoted(title)}\"",
        f"slug: \"{quoted(slug)}\"",
        f"date: {date}",
        "draft: false",
    ]
    if tags:
        lines += ["", f"tags: {tags}"]
    lines += ["---", "", content, ""]

    # newline="" writes the explicit CRLFs verbatim (text mode would otherwise
    # expand "\n" again on Windows); UTF-8 keeps the output independent of the
    # locale; the context manager closes the file even if writing fails.
    with open(path.join(output_path, slug + ".md"), "w", encoding="utf-8", newline="") as file:
        file.write("\r\n".join(lines))

def process_post(post_path, post_file_name, output_path, files_path):
    """Convert a single XML post file and store the result.

    Reads ``title``, ``slug``, ``pubDate``, ``content`` and the
    ``categories/category`` elements from the XML document at ``post_path``,
    converts date and content, prints a short summary and hands everything
    to ``save_result``. ``post_file_name`` is accepted but not used.
    """
    root = xmltree.parse(post_path).getroot()

    def text_of(tag):
        return root.find(tag).text

    title = text_of("title")
    slug = text_of("slug")
    date = convert_date(text_of("pubDate"))
    content, images = convert_content(text_of("content"), date)
    tags = [category.text for category in root.findall("./categories/category")]

    summary = (
        f" * {post_path}",
        f"    title: {title}",
        f"    slug: {slug}",
        f"    date: {date}",
        f"    tags: {tags}",
    )
    for line in summary:
        print(line)

    save_result(output_path, files_path, slug, title, date, images, tags, content)

def main():
    """Convert every file in ``posts/`` next to this script into ``output/``.

    ``files/`` (original images), ``posts/`` (XML sources) and ``output/``
    (results) are resolved relative to the directory of this script. Only
    regular files directly inside ``posts/`` are processed.
    """
    current_path = path.dirname(path.realpath(__file__))
    files_path = path.join(current_path, "files")
    posts_path = path.join(current_path, "posts")
    output_path = path.join(current_path, "output")
    print(f"Converting old posts into Hugo format (source: '{posts_path}')...")

    # listdir() returns entries in arbitrary order; sorting makes runs
    # reproducible, which matters because an already copied image is never
    # overwritten by a later post.
    posts = sorted(f for f in listdir(posts_path) if path.isfile(path.join(posts_path, f)))
    for p in posts:
        process_post(path.join(posts_path, p), p, output_path, files_path)
    print(f"[DONE:{len(posts)}]")

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	# -- coding: utf-8 --

	from os import listdir
	from os import mkdir
	from os import makedirs
	from os import path
	import shutil
	import xml.etree.ElementTree as xmltree

	def get_image_new_location(date, image_name, alt):
	    """Build the site-relative path under which a migrated image is stored.

	    Returns ``/<year>/<month>/<name><ext>``: year and month are sliced out
	    of ``date`` (characters 0-3 and 5-6), ``<name>`` is ``alt`` lower-cased
	    with spaces and dots replaced by dashes, ``<ext>`` is the lower-cased
	    extension of ``image_name``.
	    """
	    year = date[:4]
	    month = date[5:7]

	    _, ext = path.splitext(image_name.lower())
	    new_name = alt.lower().replace(" ", "-").replace(".", "-")

	    return f"/{year}/{month}/{new_name}{ext}"

	def render_image_md(image_name, alt):
	    """Return ``![<alt>]({{< media "<image_name>" >}})`` for a migrated image."""
	    return f"![{alt}]({{{{< media \"{image_name}\" >}}}})"

	def render_url_md(url, alt):
	    """Return a Markdown link to ``url``, or an inline image for picture URLs.

	    URLs ending in ``.png``, ``.jpg`` or ``.gif`` (case-sensitive) become
	    ``![alt](url)``; all others become ``[alt](url)``.
	    """
	    if url.endswith((".png", ".jpg", ".gif")):
	        return f"![{alt}]({url})"

	    return f"[{alt}]({url})"

	def render_post_md(url, alt):
	    """Return ``[<alt>]({{< ref "<url>" >}})``, a link to another post."""
	    return f"[{alt}]({{{{< ref \"{url}\" >}}}})"

	def render_image(content, url, alt, tag_start, tag_end, date, images):
	    """Replace ``content[tag_start:tag_end]`` with the Markdown form of ``url``.

	    * ``.png``/``.jpg``/``.gif`` below ``/blog/posts/files/``: the image is
	      given a new location, a record (``name``, ``org_name``, ``alt``) is
	      appended to ``images`` and a ``media`` shortcode image is emitted.
	    * URL containing ``/blog/post/``: emitted as a ``ref`` link to
	      ``<rest of url>.md``.
	    * Anything else: emitted as a plain Markdown link or image.

	    Returns the new content string; ``images`` is mutated in place.
	    """
	    if "/blog/posts/files/" in url and url.endswith((".png", ".jpg", ".gif")):
	        marker = "/blog/posts/files/"
	        start_at = url.index(marker) + len(marker)
	        image_name = url[start_at:]
	        updated_image_name = get_image_new_location(date, image_name, alt)
	        images.append({"name": updated_image_name, "org_name": image_name, "alt": alt})

	        return content[:tag_start] + render_image_md(updated_image_name, alt) + content[tag_end:]

	    if "/blog/post/" in url:
	        marker = "/blog/post/"
	        start_at = url.index(marker) + len(marker)
	        post_name = url[start_at:] + ".md"
	        return content[:tag_start] + render_post_md(post_name, alt) + content[tag_end:]

	    return content[:tag_start] + render_url_md(url, alt) + content[tag_end:]

	def update_a_href(content, date, images):
	    """Convert one ``<a href="...">...</a>`` element of ``content`` to Markdown.

	    The first anchor with a double-quoted ``href`` and a closing ``</a>``
	    (or ``</ a>``) tag is rewritten through ``render_image``. Candidates
	    that cannot be converted (no ``href``, unterminated attribute value, or
	    a tag like ``<abbr>`` that only starts with ``<a``) are skipped rather
	    than ending the scan, so later links are still processed.

	    When the link wraps an ``<img ...>`` element, the image's ``alt`` text
	    becomes the link text.

	    Returns:
	        ``(new_content, True)`` when an anchor was replaced, otherwise
	        ``(content, False)`` with ``content`` unchanged.
	    """
	    search_from = 0
	    while True:
	        a_start = content.find("<a", search_from)
	        if a_start < 0:
	            return content, False
	        # The next scan always resumes behind the current candidate.
	        search_from = a_start + 2

	        # Reject tags that merely begin with "<a" (<abbr>, <address>, ...).
	        following = content[a_start + 2:a_start + 3]
	        if following != ">" and not following.isspace():
	            continue

	        a_marker = "</a>"
	        a_end = content.find(a_marker, a_start)
	        if a_end < 0:
	            a_marker = "</ a>"
	            a_end = content.find(a_marker, a_start)
	        if a_end < 0:
	            # Without a closing tag from here on, no later anchor is complete.
	            return content, False

	        href_start = content.find("href=\"", a_start, a_end)
	        if href_start < 0:
	            continue
	        href_start += 6
	        href_end = content.find("\"", href_start, a_end)
	        if href_end < 0:
	            continue
	        text_start = content.find(">", href_end, a_end)
	        if text_start < 0:
	            continue
	        text_start += 1

	        uri = content[href_start:href_end]
	        alt = content[text_start:a_end]
	        a_end += len(a_marker)

	        if "<img " in alt:
	            img_alt_start = alt.find("alt=\"")
	            if img_alt_start >= 0:
	                img_alt_start += 5
	                img_alt_end = alt.find("\"", img_alt_start)
	                if img_alt_end >= 0:
	                    alt = alt[img_alt_start:img_alt_end]

	        return render_image(content, uri, alt, a_start, a_end, date, images), True

	def update_img_src(content, date, images):
	    """Convert one ``<img ...>`` element of ``content`` to Markdown.

	    The first ``<img `` tag carrying both a double-quoted ``alt`` and a
	    double-quoted ``src`` is rewritten through ``render_image``. Tags
	    lacking either attribute, or with an unterminated attribute value, are
	    skipped so they do not block the conversion of later images.

	    Returns:
	        ``(new_content, True)`` when an image tag was replaced, otherwise
	        ``(content, False)`` with ``content`` unchanged.
	    """
	    search_from = 0
	    while True:
	        img_start = content.find("<img ", search_from)
	        if img_start < 0:
	            return content, False
	        search_from = img_start + 5

	        # The first ">" also ends a self-closing "/>" tag.
	        img_end = content.find(">", img_start)
	        if img_end < 0:
	            return content, False
	        img_end += 1

	        alt_start = content.find("alt=\"", img_start, img_end)
	        src_start = content.find("src=\"", img_start, img_end)
	        if alt_start < 0 or src_start < 0:
	            continue
	        alt_start += 5
	        src_start += 5

	        alt_end = content.find("\"", alt_start, img_end)
	        src_end = content.find("\"", src_start, img_end)
	        if alt_end < 0 or src_end < 0:
	            continue

	        alt = content[alt_start:alt_end]
	        uri = content[src_start:src_end]
	        return render_image(content, uri, alt, img_start, img_end, date, images), True

	def convert_content(content, date):
	    """Translate a post's HTML body into Markdown.

	    Simple tags are replaced literally; anchors and images are rewritten
	    one at a time by ``update_a_href`` and ``update_img_src`` until neither
	    reports a change.

	    Returns ``(markdown, images)``: ``markdown`` uses CRLF line endings with
	    trailing whitespace stripped per line, ``images`` is the list filled in
	    by the link/image rewriting.
	    """
	    content = content.strip()

	    # remove paragraphs:
	    content = content.replace("<p>", "").replace("</p>", "\n\n")

	    # formatting:
	    # Bold maps to Markdown "**"; an empty replacement would silently drop the emphasis.
	    content = content.replace("<strong>", "**").replace("</strong>", "**")
	    content = content.replace("“", "\"").replace("”", "\"")
	    content = content.replace("<em>", "_").replace("</em>", "_")
	    content = content.replace("<pre class=\"brush: shell\">", "```shell\n").replace("<pre class=\"brush:shell\">", "```shell\n")
	    content = content.replace("<pre class=\"brush: bash\">", "```shell\n").replace("<pre class=\"brush:bash;\">", "```shell\n")
	    content = content.replace("<pre class=\"brush: csharp\">", "```csharp\n").replace("<pre class=\"brush:csharp\">", "```csharp\n").replace("<pre class=\"brush: csharp;\">", "```csharp\n")
	    content = content.replace("<pre class=\"brush: cpp\">", "```cpp\n")
	    content = content.replace("</pre>", "\n```\n")
	    content = content.replace("<h1>", "\n# ").replace("</h1>", "\n\n")
	    content = content.replace("<h2>", "\n## ").replace("</h2>", "\n\n")
	    # Only the HTML entity is removed; replacing a plain blank would glue all words together.
	    content = content.replace("&nbsp;", "")

	    # list items:
	    content = content.replace("<ul>", "").replace("<ol>", "").replace("</ul>", "").replace("</ol>", "").replace("</li>", "")
	    content = content.replace("<li>", "\n* ")

	    # unify line endings:
	    content = content.replace("<br>", "\n").replace("<br/>", "\n").replace("<br />", "\n").replace("\r", "").strip()

	    # links:
	    images = []
	    while True:
	        c, updated = update_a_href(content, date, images)
	        if updated:
	            content = c
	        else:
	            break

	    content = content.replace("</img>", "")
	    while True:
	        c, updated = update_img_src(content, date, images)
	        if updated:
	            content = c
	        else:
	            break

	    lines = [l.rstrip() for l in content.split("\n")]
	    return ("\r\n".join(lines), images)

	def convert_date(date):
	    """Replace every space in ``date`` with ``T`` and append the fixed offset ``+02:00``."""
	    return date.replace(" ", "T") + "+02:00"

	def copy_images(output_path, files_path, images):
	    """Copy every referenced image to ``<output_path>/images<name>``.

	    Each entry of ``images`` is a dict with at least ``org_name`` (file name
	    relative to ``files_path``) and ``name`` (new site-relative path that
	    starts with ``/``). Missing directories are created; an existing
	    destination file is reported on stdout and left untouched.

	    Raises:
	        OSError: propagated from ``makedirs``/``shutil.copy``, e.g. when the
	            source image does not exist.
	    """
	    for image in images:
	        src_path = path.join(files_path, image["org_name"])
	        dst_path = path.join(output_path, "images" + image["name"])

	        # exist_ok replaces the check-then-create pair and avoids a local
	        # variable named after the builtin ``dir``.
	        makedirs(path.dirname(dst_path), exist_ok=True)

	        if path.exists(dst_path):
	            print(f"File exists: {dst_path}, ignoring!")
	        else:
	            shutil.copy(src_path, dst_path)

	def save_result(output_path, files_path, slug, title, date, images, tags, content):
	    """Write one converted post to ``<output_path>/<slug>.md``.

	    Images are copied first (``copy_images``); then a YAML front matter
	    block (``title``, ``slug``, ``date``, ``draft: false`` and, when
	    non-empty, ``tags``) followed by ``content`` is written. All line
	    endings are CRLF.
	    """
	    def quoted(value):
	        # Inside a YAML double-quoted scalar, backslash and double quote must be escaped.
	        return str(value).replace("\\", "\\\\").replace("\"", "\\\"")

	    makedirs(output_path, exist_ok=True)
	    copy_images(output_path, files_path, images)

	    lines = [
	        "---",
	        f"title: \"{quoted(title)}\"",
	        f"slug: \"{quoted(slug)}\"",
	        f"date: {date}",
	        "draft: false",
	    ]
	    if tags:
	        lines += ["", f"tags: {tags}"]
	    lines += ["---", "", content, ""]

	    # newline="" keeps the explicit CRLFs verbatim on every platform; UTF-8
	    # avoids locale-dependent encoding; ``with`` closes the file on errors.
	    with open(path.join(output_path, slug + ".md"), "w", encoding="utf-8", newline="") as file:
	        file.write("\r\n".join(lines))

	def process_post(post_path, post_file_name, output_path, files_path):
	    """Convert a single XML post file and store the result.

	    Reads ``title``, ``slug``, ``pubDate``, ``content`` and the
	    ``categories/category`` elements from the document at ``post_path``,
	    converts date and content, prints a summary and calls ``save_result``.
	    ``post_file_name`` is accepted but not used.
	    """
	    post = xmltree.parse(post_path).getroot()
	    title = post.find("title").text
	    slug = post.find("slug").text
	    date = convert_date(post.find("pubDate").text)
	    (content, images) = convert_content(post.find("content").text, date)
	    tags = [c.text for c in post.findall("./categories/category")]

	    print(f" * {post_path}")
	    print(f" title: {title}")
	    print(f" slug: {slug}")
	    print(f" date: {date}")
	    print(f" tags: {tags}")

	    save_result(output_path, files_path, slug, title, date, images, tags, content)

	def main():
	    """Convert every file in ``posts/`` next to this script into ``output/``.

	    ``files/``, ``posts/`` and ``output/`` are resolved relative to the
	    directory of this script; only regular files directly inside ``posts/``
	    are processed.
	    """
	    current_path = path.dirname(path.realpath(__file__))
	    files_path = path.join(current_path, "files")
	    posts_path = path.join(current_path, "posts")
	    output_path = path.join(current_path, "output")
	    print(f"Converting old posts into Hugo format (source: '{posts_path}')...")

	    # listdir() order is arbitrary; sorting keeps runs reproducible.
	    posts = sorted(f for f in listdir(posts_path) if path.isfile(path.join(posts_path, f)))
	    for p in posts:
	        process_post(path.join(posts_path, p), p, output_path, files_path)
	    print(f"[DONE:{len(posts)}]")

	# Run the conversion only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()