# pvgenuchten/html-to-hugo.py

## html-to-hugo.py
# Goal of the script is to export an existing site to hugo markdown
# use httrack or some other tool to download the full website
# then run this script to convert the local copy to markdown
# verify that the header and footer are properly removed (by setting the split point)
# copy the markdowns into the hugo content section


import os
from markdownify import markdownify as md

# Markers that delimit the real page content in the downloaded HTML.
# Adjust them to the template of the site being converted so that the
# header and footer are stripped.
CONTENT_START = '<!-- /#content-header -->'
CONTENT_END = '<!-- /#content -->'


def extract_main(page):
    """Return the HTML found between CONTENT_START and CONTENT_END.

    Raises IndexError when CONTENT_START does not occur in *page*. A missing
    CONTENT_END is tolerated: everything after CONTENT_START is returned.
    """
    return page.split(CONTENT_START)[1].split(CONTENT_END)[0]


def extract_title_html(main):
    """Return the inner HTML of the first literal ``<h1>`` element in *main*.

    Raises IndexError when *main* contains no ``<h1>`` tag (an ``<h1>`` that
    carries attributes is not matched).
    """
    return main.split('<h1>')[1].split('</h1>')[0]


def front_matter(title):
    """Return the Hugo front matter block for a page titled *title*.

    Whitespace runs (including newlines) are collapsed to single spaces and
    single quotes are doubled, which is how a quote is escaped inside a YAML
    single-quoted scalar; otherwise such a title would break the front matter.
    """
    safe = ' '.join(title.split()).replace("'", "''")
    return "---\ntitle: '{0}'\ndate: 2022-06-01\nicon: ''\ndraft: false\n---\n\n\n".format(safe)


# loop through all folders and files
for root, dirs, files in os.walk('.'):
    # print a tree-style listing of what is being visited
    level = root.replace('.', '').count(os.sep)
    indent = ' ' * 4 * level
    print('{}{}/'.format(indent, os.path.basename(root)))
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        # splitext looks at the last extension only, so names without a dot
        # are skipped quietly and dotted names such as "a.b.html" are handled
        stem, ext = os.path.splitext(f)
        if ext != '.html':
            continue
        try:
            print('{}{}'.format(subindent, f))
            # read-only access is enough; the context manager closes the
            # file even when decoding fails
            with open(os.path.join(root, f), "r", encoding="utf-8") as src:
                page = src.read()

            # strip header / footer
            try:
                main = extract_main(page)
            except IndexError as ex:
                print('fail split: ' + str(ex))
                # skip only this page, not the rest of the directory
                continue

            # fetch title; fall back to the file name when there is no
            # <h1> or it converts to an empty string
            try:
                ttl = md(extract_title_html(main)).strip() or stem
            except IndexError:
                ttl = stem

            # markdownify https://github.com/matthewwithanm/python-markdownify
            out = md(main)

            # save as file.md next to the source html
            with open(os.path.join(root, stem + ".md"), "w", encoding="utf-8") as dst:
                dst.write(front_matter(ttl))
                dst.write(out)
        except Exception as ex:
            # per-file boundary: report the problem and carry on with the next file
            print('Error: ' + str(ex))
	# Goal of the script is to export an existing site to hugo markdown
	# use httrack or some other tool to download the full website
	# then run this script to convert the local copy to markdown
	# verify that the header and footer are properly removed (by setting the split point)
	# copy the markdowns into the hugo content section


import os
from markdownify import markdownify as md

# NOTE(review): this section looks like a second copy of the conversion script
# that appears earlier in this file; its indentation had been lost and is
# restored here. If both copies are kept, every page is converted twice (the
# second pass rewrites the same .md files) - confirm whether one can be dropped.

# loop through all folders and files
for root, dirs, files in os.walk('.'):
    # print a tree-style listing of what is being visited
    level = root.replace('.', '').count(os.sep)
    indent = ' ' * 4 * level
    print('{}{}/'.format(indent, os.path.basename(root)))
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        # splitext looks at the last extension only, so names without a dot
        # are skipped quietly and dotted names such as "a.b.html" are handled
        stem, ext = os.path.splitext(f)
        if ext != '.html':
            continue
        try:
            # if file is html
            print('{}{}'.format(subindent, f))
            # open file read-only; the context manager closes it even when
            # decoding fails
            with open(os.path.join(root, f), "r", encoding="utf-8") as src:
                page = src.read()

            # strip header / footer
            try:
                main = page.split('<!-- /#content-header -->')[1].split('<!-- /#content -->')[0]
            except IndexError as ex:
                print('fail split: ' + str(ex))
                # skip only this page, not the rest of the directory
                continue

            # fetch title; fall back to the file name when there is no <h1>
            try:
                ttl = md(main.split('<h1>')[1].split('</h1>')[0])
            except IndexError:
                ttl = stem
            # keep the title on one line, then double single quotes, which is
            # how a quote is escaped inside a YAML single-quoted scalar
            ttl = ' '.join(ttl.split()) or stem
            ttl = ttl.replace("'", "''")

            # markdownify https://github.com/matthewwithanm/python-markdownify
            out = md(main)

            # save as file.md next to the source html
            with open(os.path.join(root, stem + ".md"), "w", encoding="utf-8") as dst:
                dst.write("---\ntitle: '{0}'\ndate: 2022-06-01\nicon: ''\ndraft: false\n---\n\n\n".format(ttl))
                dst.write(out)
        except Exception as ex:
            # per-file boundary: report the problem and carry on with the next file
            print('Error: ' + str(ex))