Skip to content

Instantly share code, notes, and snippets.

Last active June 30, 2022 02:21
Show Gist options
  • Save stecman/18b8d74fddedaf9d93d7f944cac04fc2 to your computer and use it in GitHub Desktop.
Description: hackaday.io article to markdown converter for migrating or backing up projects/posts
#!/usr/bin/env python3
# Convert hackaday posts to markdown with images stored nearby
# This needs the following modules to run:
# - beautifulsoup4
# - markdownify
# - requests
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
import datetime
import logging
import os
import re
import requests
import sys
class HackadayMarkdownConverter(MarkdownConverter):
    """Modified markdown converter to handle specifics in Hackaday articles."""

    def process_tag(self, node, children_only=False):
        # Remove spaces added at the start of some paragraphs.
        # NOTE(review): the replacement argument was lost in this copy of the
        # file; r'\1\2' restores the intent of the pattern (drop the leading
        # whitespace while keeping the newline and first visible character).
        return re.sub(
            r'(^|\n)[ \t]+([^\s])',
            r'\1\2',
            super().process_tag(node, children_only)
        )

    def convert_br(self, el, text):
        # Don't honour forced breaks
        return ''

    def convert_figcaption(self, el, text):
        """Wrap figcaption text in a <caption> element to differentiate from body text."""
        if text:
            return '<caption>' + text + '</caption>\n\n'

        return ''

    def convert_table(self, el, text):
        """Dump tables as HTML in the source.

        (This markdown converter doesn't support tables.)
        """
        return el.prettify() + '\n'

    def convert_figure(self, el, text):
        """Handle <figure> elements as block images with a possible caption."""
        md = ''

        if el.find('img'):
            md += self.convert_img(el.find('img'), None) + '\n\n'

        if el.find('figcaption'):
            captionEl = el.find('figcaption')
            md += self.convert_figcaption(captionEl, captionEl.get_text())

        return md
def htmlToMd(html):
    """Convert an HTML string to markdown using the Hackaday-aware converter."""
    converter = HackadayMarkdownConverter(heading_style='atx', bullets='-')
    return converter.convert(html)
def getLastPathSegment(url):
    """Grab the last path segment from an https URL.

    Query strings and a trailing slash are excluded from the returned segment.

    Raises ValueError if no path segment can be found.
    """
    # re.search is clearer than next(re.finditer(...)) and avoids raising a
    # confusing StopIteration when the URL has no path segment
    match = re.search(r'https://.*/([^/?]+)', url)

    if match is None:
        raise ValueError('No path segment found in URL: ' + url)

    return match.group(1)
def fetchArticle(url):
    """Get the HTML content from a post with other primary fields.

    Returns a dict with:
      'metadata': date, slug, original_url and title of the post
      'content':  a BeautifulSoup node for the post body
    """
    req = requests.get(url)
    rawHtml = req.content.decode('utf-8')

    # BeautifulSoup doesn't handle all HTML entities correctly - replace them manually before reading
    rawHtml = rawHtml.replace('&apos;', "'")

    html = BeautifulSoup(rawHtml, 'html.parser')

    title = html.select_one('.headline h1')
    content = html.select_one('.post-content')

    # Extract publish date/time (always a US format date)
    # NOTE(review): the element the date text is read from was lost in this
    # copy of the file; '.time-card' matches hackaday.io's post markup -
    # TODO confirm against a live page.
    publishDate = datetime.datetime.strptime(
        html.select_one('.time-card').get_text().strip(),
        '%m/%d/%Y at %H:%M'
    )

    return {
        'metadata': {
            'date': publishDate,
            'slug': getLastPathSegment(url),
            'original_url': url,
            'title': title.get_text(),
        },
        'content': content,
    }
def findImageUrl(htmlNode):
    """Return the best content URL for an HTML image node.

    Hackaday images are usually lazy loaded using the URL from data-src.
    """
    nodeAttrs = htmlNode.attrs

    for candidate in ('src', 'data-src'):
        if candidate in nodeAttrs:
            return nodeAttrs[candidate]

    raise Exception('Failed to find src attribute for image node: ' + str(htmlNode))
def downloadFile(url):
    """Download a file to disk using the filename from the URL.

    Returns the filename of the downloaded file.
    """
    outputFile = getLastPathSegment(url)

    # NOTE(review): this log call was merged into the line above in this copy
    # of the file; restored as a plain info message
    logging.info('Saving file %s as %s', url, outputFile)

    req = requests.get(url, allow_redirects=True)

    # Use a context manager so the file handle is always closed
    with open(outputFile, 'wb') as handle:
        handle.write(req.content)

    return outputFile
def writeFrontMattter(handle, data):
    """Write each metadata entry as a "key: value" line to an open text handle."""
    for key, value in data.items():
        handle.write('%s: %s\n' % (key, value))
def savePost(url, force=False, keepHtml=True):
    """Download a post and all of its images.

    Saves files to the current working directory:
      <slug>.md             - front-matter plus markdown body
      _<slug>.original.html - original HTML with image paths rewritten (keepHtml)

    Existing output is not overwritten unless force is True.
    """
    source = fetchArticle(url)

    # Strip article ID for markdown filename
    outputName = re.sub(r'^\d+-', '', source['metadata']['slug'])
    articlePath = outputName + '.md'
    htmlPath = '_' + outputName + '.original.html'

    if not force and os.path.exists(articlePath):
        # NOTE(review): these two log calls were fused onto one line in this
        # copy of the file; the early return restores the "refuse to
        # overwrite" behaviour the messages describe
        logging.warning('Output file "%s" for url %s already exists!', articlePath, url)
        logging.warning('Refusing to overwrite existing file without --force')
        return

    content = source['content']

    # Find and download images in the content
    # Once downloaded the URL is replaced with a relative path to the file on disk
    for image in content.find_all('img'):
        image.attrs['src'] = downloadFile(findImageUrl(image))

    title = '# %s\n\n' % source['metadata']['title']
    htmlStr = content.encode(formatter='html5').decode('utf-8')
    markdown = htmlToMd(htmlStr)

    # NOTE(review): the write calls below were lost in this copy of the file;
    # front matter, a heading built from the title, then the converted body
    # is the layout the surviving variables imply - TODO confirm
    with open(articlePath, mode='w', encoding='utf-8') as handle:
        writeFrontMattter(handle, source['metadata'])
        handle.write('\n')
        handle.write(title)
        handle.write(markdown)

    # Output original HTML with image paths changed
    if keepHtml:
        with open(htmlPath, mode='w', encoding='utf-8') as handle:
            handle.write(htmlStr)
if __name__ == '__main__':
    import argparse

    # Enable logging output
    logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

    parser = argparse.ArgumentParser(description='Convert a post to markdown')
    parser.add_argument('url', nargs='+', help='Post URL')
    parser.add_argument('--force', '-f', action='store_true', help='Overwrite if the file already exists')
    parser.add_argument('--auto-dir', '-d', action='store_true', help='Place in a directory based on the URL')
    args = parser.parse_args()

    for url in args.url:
        # Sanity check URL we've been given
        # NOTE(review): the string being tested was lost in this copy of the
        # file; checking for the scheme matches the warning text - TODO confirm
        if 'https://' not in url:
            logging.warning('This does not look like a URL: ' + url)
            continue

        if args.auto_dir:
            # Get slug without article ID
            slug = re.sub(r'^\d+-', '', getLastPathSegment(url))

            # Find current highest local dir number
            localId = 0
            prefix = r'^(\d+)-'
            for name in os.listdir():
                match = re.match(prefix, name)
                if match:
                    localId = max(localId, int(match.group(1)))

            # Increment to get the next local dir number
            localId += 1

            # Create a dir in sequence from slug and work inside it
            # NOTE(review): the makedirs/chdir calls were lost in this copy of
            # the file; without them the printed dir would never be used
            dirname = '%03d-%s' % (localId, slug)
            print("Downloading to dir: " + dirname)
            os.makedirs(dirname, exist_ok=True)
            os.chdir(dirname)

        savePost(url, force=args.force)

        # Step back out of the per-post directory before the next URL
        if args.auto_dir:
            os.chdir('..')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment