Skip to content

Instantly share code, notes, and snippets.

@jasongzy
Last active February 22, 2019 16:35
Show Gist options
  • Save jasongzy/9b6ae9a1732cfadf97672d7c55cbbb98 to your computer and use it in GitHub Desktop.
Save jasongzy/9b6ae9a1732cfadf97672d7c55cbbb98 to your computer and use it in GitHub Desktop.
This uses the Mercury parser (https://mercury.postlight.com/web-parser/) and other tools to convert an article from a URL into an HTML file. Image URLs are converted into inline Base64-encoded images. You can run the command as follows: ./mercury_parse.py --url=<URL> --htmlfile=<htmlfile>. <URL> has to exist, and <htmlfile> has to end in .html.
#!/usr/bin/env python
import os, sys, requests, base64, urllib.request, codecs
from bs4 import BeautifulSoup
from optparse import OptionParser
# NOTE(review): an API key committed to source is a leaked credential. The
# MERCURY_API_KEY environment variable, when set, takes precedence; the
# literal is kept only as a backward-compatible fallback and should be
# rotated/removed.
_apiKey = os.environ.get(
    'MERCURY_API_KEY', 'nGc0ya2J7z2aalFrGa8Gx3Q1o8grGFsn3cz58EJy')

# Seconds to wait on any single network call before giving up, so a stalled
# server cannot hang the script forever.
_NETWORK_TIMEOUT = 30

# File extension -> MIME type for the image formats that get inlined.
_INLINE_IMAGE_TYPES = {
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
}


def _image_data_uri(img_url):
    """Download *img_url* and return its bytes as a base64 ``data:`` URI.

    The image type is chosen from the extension of the URL's path component
    (case-insensitive), so a query string or fragment after the file name
    does not prevent a match.

    Returns:
        The ``data:<mime>;base64,<payload>`` string, or ``None`` when the
        extension is not one of ``_INLINE_IMAGE_TYPES`` (nothing is
        downloaded in that case).

    Raises:
        Whatever ``urllib.request.urlopen`` raises when the download fails.
    """
    from urllib.parse import urlsplit

    extension = os.path.splitext(urlsplit(img_url).path)[1].lower()
    mime_type = _INLINE_IMAGE_TYPES.get(extension)
    if mime_type is None:
        return None
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(img_url, timeout=_NETWORK_TIMEOUT) as resp:
        payload = resp.read()
    encoded = base64.b64encode(payload).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)


def get_content_to_file(url, htmlfilename):
    """Fetch the parsed article for *url* and write it to *htmlfilename*.

    The article is requested from the Mercury parser endpoint. Every
    ``<img>`` whose source is a PNG or JPEG is downloaded and replaced by an
    inline base64 ``data:`` URI, a ``<meta charset="utf-8">`` is added to the
    document head, and the prettified HTML is written as UTF-8.

    Args:
        url: Address of the article to convert.
        htmlfilename: Path of the HTML file to create or overwrite.

    Returns:
        ``None`` on success, or an error message string when the parser
        endpoint does not answer with HTTP 200 or returns no content (no
        file is written in that case).

    Raises:
        Network errors from ``requests`` / ``urllib`` (including timeouts)
        and ``OSError`` from writing the output file are not caught here.
    """
    from urllib.parse import urljoin

    with requests.Session() as s:
        s.headers = {'Content-Type': 'application/json',
                     'x-api-key': _apiKey}
        response = s.get('https://mercury.postlight.com/parser',
                         params={'url': url},
                         timeout=_NETWORK_TIMEOUT)
    if response.status_code != 200:
        return 'Error, no data from %s' % url
    data = response.json()
    content = data.get('content')
    if not content:
        # A null/empty/missing body would make BeautifulSoup fail; report it
        # the same way as a failed request.
        return 'Error, no data from %s' % url
    html = BeautifulSoup(content, 'lxml')

    # Relative and protocol-relative image sources are resolved against the
    # article's URL as reported by the parser (falling back to the input).
    base_url = data.get('url') or url
    for img in html.find_all('img'):
        src = img.get('src')
        if not src or src.startswith('data:'):
            # No source to fetch, or the image is already inline.
            continue
        data_uri = _image_data_uri(urljoin(base_url, src))
        if data_uri is not None:
            img['src'] = data_uri

    # Declare the encoding inside the document: <head> must be a child of
    # <html>, not a sibling placed in front of it.
    head_tag = html.head
    if head_tag is None:
        head_tag = html.new_tag('head')
        root = html.html
        (root if root is not None else html).insert(0, head_tag)
    meta_tag = html.new_tag('meta')
    meta_tag['charset'] = 'utf-8'
    head_tag.insert(0, meta_tag)

    with codecs.open(htmlfilename, 'w', 'utf-8') as openfile:
        openfile.write('%s\n' % html.prettify())
if __name__ == '__main__':
    # Command-line entry point: convert the article at --url into the HTML
    # file named by --htmlfile.
    parser = OptionParser()
    parser.add_option('--url', dest='url', type=str, action='store',
                      help="Name of the URL to output into an HTML file.")
    parser.add_option('--htmlfile', dest='htmlfile', type=str, action='store',
                      help='Name of the HTML file to store the underlying data.')
    opts, args = parser.parse_args()
    # Validate explicitly instead of with assert: assertions are stripped
    # under "python -O", and parser.error() prints the usage text and exits
    # with a non-zero status rather than dumping a traceback.
    if opts.url is None or opts.htmlfile is None:
        parser.error('both --url and --htmlfile are required')
    if not os.path.basename(opts.htmlfile).endswith('.html'):
        parser.error('--htmlfile must end in .html')
    # get_content_to_file returns an error message string on failure and
    # None on success; surface the failure instead of discarding it.
    status = get_content_to_file(opts.url, opts.htmlfile)
    if status is not None:
        sys.exit(status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment