Skip to content

Instantly share code, notes, and snippets.

@pvgenuchten
Last active August 31, 2022 12:42
Show Gist options
  • Save pvgenuchten/3cf227a5bb409e5d91f459b55738566d to your computer and use it in GitHub Desktop.
Save pvgenuchten/3cf227a5bb409e5d91f459b55738566d to your computer and use it in GitHub Desktop.
# The goal of this script is to export an existing site to Hugo markdown.
# Use httrack or some other tool to download the full website,
# then run this script to convert the local copy to markdown.
# Verify that the header and footer are properly removed (by setting the split points).
# Finally, copy the markdown files into the Hugo content section.
import os
from markdownify import markdownify as md
# Markers that delimit the real page content: everything up to CONTENT_START
# (the site header) and everything from CONTENT_END onwards (the site footer)
# is dropped. Adjust both to the template of the site being converted.
CONTENT_START = '<!-- /#content-header -->'
CONTENT_END = '<!-- /#content -->'

# Hugo front matter written at the top of every generated markdown file.
FRONT_MATTER = "---\ntitle: '{0}'\ndate: 2022-06-01\nicon: ''\ndraft: false\n---\n\n\n"


def is_html_file(name):
    """Return True when the file name *name* has an ``.html`` extension.

    Only the last extension is considered, so ``a.b.html`` matches while
    ``page.html.bak`` and names without any extension do not.
    """
    return os.path.splitext(name)[1].lower() == '.html'


def extract_main(page):
    """Return the part of *page* between the two content markers.

    Returns None when CONTENT_START does not occur in *page*. When
    CONTENT_END is missing, everything after CONTENT_START is returned.
    """
    _, start, rest = page.partition(CONTENT_START)
    if not start:
        return None
    return rest.partition(CONTENT_END)[0]


def extract_title_html(main):
    """Return the inner HTML of the first ``<h1>...</h1>`` in *main*.

    Returns None when there is no complete ``<h1>`` element.
    """
    _, opening, rest = main.partition('<h1>')
    if not opening:
        return None
    inner, closing, _ = rest.partition('</h1>')
    return inner if closing else None


def front_matter(title):
    """Return the front matter block for a page titled *title*."""
    # Keep the title on a single line, and double any single quote: that is
    # how a quote is escaped inside a single-quoted YAML scalar.
    one_line = ' '.join(title.split())
    return FRONT_MATTER.format(one_line.replace("'", "''"))


def convert_file(root, name):
    """Convert the HTML file *name* in directory *root* to markdown.

    The result is written next to the source as ``<stem>.md``. Returns the
    path of the written file, or None when the content markers were not
    found (in which case nothing is written).
    """
    stem = os.path.splitext(name)[0]
    with open(os.path.join(root, name), encoding="utf-8") as src:
        page = src.read()

    # strip header / footer
    main = extract_main(page)
    if main is None:
        return None

    # fetch title from the first <h1>; fall back to the file name
    title_html = extract_title_html(main)
    title = md(title_html).strip() if title_html is not None else ''
    if not title:
        title = stem

    # markdownify https://github.com/matthewwithanm/python-markdownify
    out = md(main)

    # save as file.md
    pth = os.path.join(root, stem + ".md")
    with open(pth, "w", encoding="utf-8") as dst:
        dst.write(front_matter(title))
        dst.write(out)
    return pth


def convert_site(start='.'):
    """Walk *start* and convert every ``.html`` file found to markdown.

    Prints the directory tree while walking. A file that cannot be
    converted is reported and skipped; the remaining files are still
    processed.
    """
    # loop through all folders and files
    for root, _dirs, files in os.walk(start):
        rel = os.path.relpath(root, start)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 4 * level
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            if not is_html_file(f):
                continue
            print('{}{}'.format(subindent, f))
            try:
                if convert_file(root, f) is None:
                    # Skip only this file, not the rest of the directory.
                    print('fail split: content marker not found in '
                          + os.path.join(root, f))
            except Exception as ex:
                # Best effort: report the failure and carry on with the
                # next file.
                print('Error: ' + str(ex))


if __name__ == "__main__":
    convert_site()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment