Skip to content

Instantly share code, notes, and snippets.

Last active June 30, 2022 02:21
Show Gist options
  • Save stecman/18b8d74fddedaf9d93d7f944cac04fc2 to your computer and use it in GitHub Desktop.
Description: hackaday.io article to markdown converter for migrating or backing up projects/posts
#!/usr/bin/env python3
# Convert hackaday posts to markdown with images stored nearby
# This needs the following modules to run:
# - beautifulsoup4
# - markdownify
# - requests
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
import datetime
import logging
import os
import re
import requests
import sys
class HackadayMarkdownConverter(MarkdownConverter):
    """Modified markdown converter to handle specifics in Hackaday articles."""

    def process_tag(self, node, children_only=False):
        # Remove spaces added at the start of some paragraphs.
        # NOTE(review): the replacement argument was lost in this copy of the
        # file; r'\1\2' restores the intent of the pattern (drop the leading
        # whitespace while keeping the newline and first visible character).
        return re.sub(
            r'(^|\n)[ \t]+([^\s])',
            r'\1\2',
            super().process_tag(node, children_only)
        )

    def convert_br(self, el, text):
        # Don't honour forced breaks
        return ''

    def convert_figcaption(self, el, text):
        """Wrap figcaption text in a <caption> element to differentiate from body text."""
        if text:
            return '<caption>' + text + '</caption>\n\n'

        return ''

    def convert_table(self, el, text):
        """Dump tables as HTML in the source.

        (This markdown converter doesn't support tables.)
        """
        return el.prettify() + '\n'

    def convert_figure(self, el, text):
        """Handle <figure> elements as block images with a possible caption."""
        md = ''

        if el.find('img'):
            md += self.convert_img(el.find('img'), None) + '\n\n'

        if el.find('figcaption'):
            captionEl = el.find('figcaption')
            md += self.convert_figcaption(captionEl, captionEl.get_text())

        return md
def htmlToMd(html):
    """Convert an HTML string to markdown using the Hackaday-aware converter."""
    converter = HackadayMarkdownConverter(heading_style='atx', bullets='-')
    return converter.convert(html)
def getLastPathSegment(url):
    """Grab the last path segment from an https URL.

    Query strings and a trailing slash are excluded from the returned segment.

    Raises ValueError if no path segment can be found.
    """
    # re.search is clearer than next(re.finditer(...)) and avoids raising a
    # confusing StopIteration when the URL has no path segment
    match = re.search(r'https://.*/([^/?]+)', url)

    if match is None:
        raise ValueError('No path segment found in URL: ' + url)

    return match.group(1)
def fetchArticle(url):
    """Get the HTML content from a post with other primary fields.

    Returns a dict with:
      'metadata': date, slug, original_url and title of the post
      'content':  a BeautifulSoup node for the post body
    """
    req = requests.get(url)
    rawHtml = req.content.decode('utf-8')

    # BeautifulSoup doesn't handle all HTML entities correctly - replace them manually before reading
    rawHtml = rawHtml.replace('&apos;', "'")

    html = BeautifulSoup(rawHtml, 'html.parser')

    title = html.select_one('.headline h1')
    content = html.select_one('.post-content')

    # Extract publish date/time (always a US format date)
    # NOTE(review): the element the date text is read from was lost in this
    # copy of the file; '.time-card' matches hackaday.io's post markup -
    # TODO confirm against a live page.
    publishDate = datetime.datetime.strptime(
        html.select_one('.time-card').get_text().strip(),
        '%m/%d/%Y at %H:%M'
    )

    return {
        'metadata': {
            'date': publishDate,
            'slug': getLastPathSegment(url),
            'original_url': url,
            'title': title.get_text(),
        },
        'content': content,
    }
def findImageUrl(htmlNode):
    """Return the best content URL for an HTML image node.

    Hackaday images are usually lazy loaded using the URL from data-src.
    """
    nodeAttrs = htmlNode.attrs

    for candidate in ('src', 'data-src'):
        if candidate in nodeAttrs:
            return nodeAttrs[candidate]

    raise Exception('Failed to find src attribute for image node: ' + str(htmlNode))
def downloadFile(url):
    """Download a file to disk using the filename from the URL.

    Returns the filename of the downloaded file.
    """
    outputFile = getLastPathSegment(url)

    # NOTE(review): this log call was merged into the line above in this copy
    # of the file; restored as a plain info message
    logging.info('Saving file %s as %s', url, outputFile)

    req = requests.get(url, allow_redirects=True)

    # Use a context manager so the file handle is always closed
    with open(outputFile, 'wb') as handle:
        handle.write(req.content)

    return outputFile
def writeFrontMattter(handle, data):
    """Write each metadata entry as a "key: value" line to an open text handle."""
    for key, value in data.items():
        handle.write('%s: %s\n' % (key, value))
def savePost(url, force=False, keepHtml=True):
    """Download a post and all of its images.

    Saves files to the current working directory:
      <slug>.md             - front-matter plus markdown body
      _<slug>.original.html - original HTML with image paths rewritten (keepHtml)

    Existing output is not overwritten unless force is True.
    """
    source = fetchArticle(url)

    # Strip article ID for markdown filename
    outputName = re.sub(r'^\d+-', '', source['metadata']['slug'])
    articlePath = outputName + '.md'
    htmlPath = '_' + outputName + '.original.html'

    if not force and os.path.exists(articlePath):
        # NOTE(review): these two log calls were fused onto one line in this
        # copy of the file; the early return restores the "refuse to
        # overwrite" behaviour the messages describe
        logging.warning('Output file "%s" for url %s already exists!', articlePath, url)
        logging.warning('Refusing to overwrite existing file without --force')
        return

    content = source['content']

    # Find and download images in the content
    # Once downloaded the URL is replaced with a relative path to the file on disk
    for image in content.find_all('img'):
        image.attrs['src'] = downloadFile(findImageUrl(image))

    title = '# %s\n\n' % source['metadata']['title']
    htmlStr = content.encode(formatter='html5').decode('utf-8')
    markdown = htmlToMd(htmlStr)

    # NOTE(review): the write calls below were lost in this copy of the file;
    # front matter, a heading built from the title, then the converted body
    # is the layout the surviving variables imply - TODO confirm
    with open(articlePath, mode='w', encoding='utf-8') as handle:
        writeFrontMattter(handle, source['metadata'])
        handle.write('\n')
        handle.write(title)
        handle.write(markdown)

    # Output original HTML with image paths changed
    if keepHtml:
        with open(htmlPath, mode='w', encoding='utf-8') as handle:
            handle.write(htmlStr)
if __name__ == '__main__':
    import argparse

    # Enable logging output
    logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

    parser = argparse.ArgumentParser(description='Convert a post to markdown')
    parser.add_argument('url', nargs='+', help='Post URL')
    parser.add_argument('--force', '-f', action='store_true', help='Overwrite if the file already exists')
    parser.add_argument('--auto-dir', '-d', action='store_true', help='Place in a directory based on the URL')
    args = parser.parse_args()

    for url in args.url:
        # Sanity check URL we've been given
        # NOTE(review): the string being tested was lost in this copy of the
        # file; checking for the scheme matches the warning text - TODO confirm
        if 'https://' not in url:
            logging.warning('This does not look like a URL: ' + url)
            continue

        if args.auto_dir:
            # Get slug without article ID
            slug = re.sub(r'^\d+-', '', getLastPathSegment(url))

            # Find current highest local dir number
            localId = 0
            prefix = r'^(\d+)-'
            for name in os.listdir():
                match = re.match(prefix, name)
                if match:
                    localId = max(localId, int(match.group(1)))

            # Increment to get the next local dir number
            localId += 1

            # Create a dir in sequence from slug and work inside it
            # NOTE(review): the makedirs/chdir calls were lost in this copy of
            # the file; without them the printed dir would never be used
            dirname = '%03d-%s' % (localId, slug)
            print("Downloading to dir: " + dirname)
            os.makedirs(dirname, exist_ok=True)
            os.chdir(dirname)

        savePost(url, force=args.force)

        # Step back out of the per-post directory before the next URL
        if args.auto_dir:
            os.chdir('..')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment