Skip to content

Instantly share code, notes, and snippets.

@leonoverweel
Last active September 11, 2019 20:28
Show Gist options
  • Save leonoverweel/6500c304d018b4805af0a1d880408c5c to your computer and use it in GitHub Desktop.
Save leonoverweel/6500c304d018b4805af0a1d880408c5c to your computer and use it in GitHub Desktop.
Convert a Revue newsletter issue to markdown; currently supported content types: title, headers, paragraphs, lists, blockquotes, and images.
import sys
from urllib import request
from bs4 import BeautifulSoup
import html2text
# CSS class names used to recognise each supported kind of content element.
# transform_element() dispatches on these values.
CLS_BLOCKQUOTE = "revue-blockquote"  # block quotes
CLS_H2 = "header-text"               # section headers, rendered as '## ...'
CLS_IMG = "img"                      # images; revue_to_md() assigns this class to <img> tags
CLS_P = "revue-p"                    # paragraphs
CLS_UL = "ul"                        # lists; revue_to_md() assigns this class to <ul> tags
def _split_sentences(text, line_prefix=''):
    """Put each sentence of *text* on its own line.

    A sentence end is a '.', '!' or '?' that is followed by a space; that
    space is replaced by a newline plus *line_prefix*.
    """
    for mark in ('.', '!', '?'):
        text = text.replace(f'{mark} ', f'{mark}\n{line_prefix}')
    return text


def _dashify_list_markers(text):
    """Turn indented '* ' bullets into '- ' bullets and drop the base indent.

    Only a '* ' at the start of a line (after optional spaces) is treated as a
    bullet, so asterisks inside the item text (e.g. the closing '**' of bold
    text) are left alone. Up to two leading spaces are removed per bullet, so
    deeper bullets keep their indentation relative to the top level.
    """
    lines = []
    for line in text.split('\n'):
        body = line.lstrip(' ')
        if body.startswith('* '):
            indent = len(line) - len(body)
            line = ' ' * max(indent - 2, 0) + '- ' + body[2:]
        lines.append(line)
    return '\n'.join(lines)


def transform_element(html_element):
    """Transform an HTML element from Revue into markdown text and return it.

    Args:
        html_element: A parsed tag whose ``class`` attribute contains one of
            the ``CLS_*`` class names; that class selects the conversion.

    Returns:
        The markdown for the element followed by a blank line, or ``''`` for
        an element without text (images are exempt from that check).

    Raises:
        ValueError: If none of the element's classes is supported.
        KeyError: If the element has no ``class`` attribute, or an image has
            no ``src`` attribute.
    """
    supported = (CLS_BLOCKQUOTE, CLS_H2, CLS_IMG, CLS_P, CLS_UL)
    classes = html_element['class']
    # Use the first *supported* class rather than blindly the first class, so
    # an element like class="foo revue-p" is still converted. Fall back to the
    # first class so unsupported elements are handled exactly as before.
    cls = next((name for name in classes if name in supported), classes[0])

    # Skip empty elements (images never have text, so they are exempt)
    if html_element.text == '' and cls != CLS_IMG:
        return ''

    converter = html2text.HTML2Text()
    converter.body_width = 0  # 0 turns off html2text's line wrapping

    # Blockquotes
    if cls == CLS_BLOCKQUOTE:
        text = converter.handle(str(html_element)).strip()
        # One sentence per line, each continuation line quoted with '> '
        text = _split_sentences(text, line_prefix='> ')

    # Headers
    elif cls == CLS_H2:
        text = f'## {html_element.text.strip()}'

    # Images
    elif cls == CLS_IMG:
        url = html_element.attrs['src']
        alt = html_element.attrs.get('alt', '')
        # The alt text doubles as an italic caption; without alt text there is
        # nothing to caption, so only the image itself is emitted.
        text = f'![{alt}]({url})\n_{alt}_' if alt else f'![]({url})'

    # Paragraphs
    elif cls == CLS_P:
        text = converter.handle(str(html_element))
        # One sentence per line. Sentences that end inside bold text are
        # handled first so the closing '**' stays attached to its sentence.
        for mark in ('.', '!', '?'):
            text = text.replace(f'{mark}** ', f'{mark}**\n')
        text = _split_sentences(text)

    # Lists
    elif cls == CLS_UL:
        text = converter.handle(str(html_element))
        # Remove indent and use -s instead of *s
        text = _dashify_list_markers(text)

    else:
        raise ValueError('Unimplemented class')

    return f'{text.strip()}\n\n'
def load_issue(issue_id, base_url='https://dynamicallytyped.com'):
    """Download an issue and return its HTML contents.

    Args:
        issue_id: Issue identifier; it is appended to ``{base_url}/issues/0-``.
        base_url: Root URL of the newsletter site, without a trailing slash.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for HTTP
            error statuses).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    url = f'{base_url}/issues/0-{issue_id}'
    # NOTE(review): a custom User-Agent is sent instead of urllib's default,
    # presumably because the site rejects the default one - confirm.
    req = request.Request(url, headers={'User-Agent': 'Totally a real browser and not a bot, yep'})
    # The context manager closes the HTTP response (and its socket) even if
    # reading or decoding fails.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')
def revue_to_md(issue_id):
    """Download a Revue issue and return it converted to markdown.

    The page is fetched with ``load_issue``, the elements of interest are
    tagged with the ``CLS_*`` classes, and each one is converted by
    ``transform_element`` in document order.

    Args:
        issue_id: Issue identifier, passed through to ``load_issue``.

    Returns:
        The markdown document: a ``# title`` heading (omitted when the page
        has no usable title) followed by the converted content.
    """
    soup = BeautifulSoup(load_issue(issue_id), 'html.parser')

    # Clean content to make it ready for transformer
    # html2text needs semantic HTML for blockquotes
    for tag in soup.find_all(class_=CLS_BLOCKQUOTE):
        tag.name = 'blockquote'

    # Content is selected by class only, so add class="ul" to <ul>s
    for tag in soup.find_all(name='ul'):
        tag.attrs['class'] = [CLS_UL]

    # NOTE(review): width="600" presumably identifies the content images, and
    # alt="Dynamically Typed" the newsletter's own banner - confirm.
    for tag in soup.find_all('img', width='600'):
        alt = tag.attrs.get('alt', '')  # tolerate images without alt text
        if alt != 'Dynamically Typed':
            # Make sure the attribute exists for the transformer to read.
            tag.attrs['alt'] = alt
            tag.attrs['class'] = [CLS_IMG]

    # Extract relevant content
    wanted = (CLS_BLOCKQUOTE, CLS_H2, CLS_IMG, CLS_P, CLS_UL)
    content = soup.find_all(class_=lambda cls: cls in wanted)

    # Transform content
    # The heading is the part of <title> before the first '|', trimmed so the
    # heading line has no trailing whitespace.
    title = soup.title.text.split('|')[0].strip() if soup.title else ''
    heading = f'# {title}\n\n' if title else ''
    body = ''.join(transform_element(tag) for tag in content).strip()
    return heading + body
if __name__ == '__main__':
    # Expect exactly one argument after the script name: the issue ID.
    if len(sys.argv) != 2:
        # sys.exit() with a string writes it to stderr and exits with status 1,
        # so a usage error is not reported as success and does not end up in
        # redirected markdown output. Unlike the bare exit() helper, it does
        # not depend on the site module being loaded.
        sys.exit('Usage: `python3 revue_to_md.py issue_id`, where the latter is the 6-digit ID in the URL.')

    print(revue_to_md(sys.argv[1]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment