Skip to content

Instantly share code, notes, and snippets.

@alexandruc
Created December 18, 2022 17:07
Show Gist options
  • Save alexandruc/5f30ac3be02ae8464a4060b47f9edb40 to your computer and use it in GitHub Desktop.
Save alexandruc/5f30ac3be02ae8464a4060b47f9edb40 to your computer and use it in GitHub Desktop.
# Small script to transform hugo markdown files with html to pure markdown
import requests
import os
import dateparser
# Endpoint the script POSTs each post body to; judging by its path it is an
# HTML-to-Markdown conversion tool.
url = "https://tools.atatus.com/tools/html-to-markdown"
# Directory that holds the Hugo posts. The value below is a placeholder and
# must be replaced with a real path before the script is run.
working_dir = "location of the hugo blog posts"
# Names (not full paths) of every entry found directly inside working_dir.
posts = os.listdir(working_dir)
def get_post_date(post):
    """Parse the date carried by the leading fields of a post file name.

    The name is split on "-" and its first three fields are joined back
    together with "-" (e.g. "2016-03-01-title.md" -> "2016-03-01"); that
    prefix is handed to ``dateparser.parse``.

    Args:
        post: File name of the post.

    Returns:
        Whatever ``dateparser.parse`` returns for the prefix - presumably a
        ``datetime``, or ``None`` when the prefix is not a recognisable
        date (confirm against the dateparser documentation).
    """
    leading_fields = post.split("-")[:3]
    return dateparser.parse("-".join(leading_fields))
def _unescape_markdown(text):
    """Remove the backslash escapes in front of Markdown punctuation.

    Each of ``\\*  \\_  \\[  \\]  \\{  \\}`` in ``text`` is replaced by the
    bare character, in that order.
    """
    for char in "*_[]{}":
        text = text.replace("\\" + char, char)
    return text


def _split_front_matter(content, delimiter="+++"):
    """Split ``content`` into ``(header, body)`` at the closing delimiter.

    The header runs from the start of ``content`` up to and including the
    second occurrence of ``delimiter``; the body is everything after it.

    Returns:
        A ``(header, body)`` tuple, or ``None`` when ``content`` does not
        contain two delimiters (so the caller can skip the file instead of
        slicing with ``-1`` indices).
    """
    start = content.find(delimiter)
    if start == -1:
        return None
    end = content.find(delimiter, start + len(delimiter))
    if end == -1:
        return None
    cut = end + len(delimiter)
    return content[:cut], content[cut:]


# some filter that i needed at some point, can be removed
# The lower bound is constant, so it is parsed once rather than per post.
oldest_allowed = dateparser.parse("2015-09-27")
posts_to_transform = []
for post in sorted(posts):
    post_date = get_post_date(post)
    # A name without a parsable date yields no date object; skip it rather
    # than crash on ``None.year``.
    if post_date is None:
        continue
    if post_date.year <= 2017 and post_date >= oldest_allowed:
        posts_to_transform.append(post)

for fname in posts_to_transform:
    file_path = os.path.join(working_dir, fname)
    # NOTE(review): assumes the posts are UTF-8 (Hugo's expected content
    # encoding); an explicit encoding avoids locale-dependent failures.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    parts = _split_front_matter(content)
    if parts is None:
        # Without a complete +++ header the split point is unknown; leave
        # the file untouched.
        print(f"skipped (no +++ front matter): {fname}")
        continue
    header, post_content = parts

    # The timeout keeps a stalled connection from hanging the whole batch.
    res = requests.post(url, {"html": post_content.strip()}, timeout=30)
    # Abort before the file is rewritten: otherwise an HTTP error page would
    # overwrite the original post.
    res.raise_for_status()

    transformed_post = header + "\n\n" + _unescape_markdown(res.text)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(transformed_post)
    print(f"transformed: {fname}")
print("Done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment