Skip to content

Instantly share code, notes, and snippets.

@stringertheory
Last active September 17, 2016 05:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stringertheory/3019e4caa785e61b17bdf2b9f6d1837c to your computer and use it in GitHub Desktop.
hack to get markdown, dammit
"""Convert a google doc to markdown with all of the cruft removed.
"""
import hashlib
import imghdr
import shutil
import subprocess
import sys
import urllib
import urlparse
import bs4
import requests
# Pandoc output format used when the caller does not specify one.
DEFAULT_FLAVOR = 'markdown_github'
def download_image(url, filename):
    """Download the image at *url* to *filename* and return the final path.

    The image type is sniffed from the downloaded bytes with ``imghdr``;
    if the detected extension is not already the filename's suffix, the
    file is renamed to ``<filename>.<extension>``.

    :param url: URL of the image to fetch.
    :param filename: local path to save the image under.
    :returns: the (possibly extension-suffixed) local filename.
    """
    # Local import keeps the function working on both Python 3
    # (urllib.request) and Python 2 (urllib).
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    urlretrieve(url, filename)
    extension = imghdr.what(filename)
    new_filename = filename
    # imghdr.what returns None for unrecognized data; in that case keep
    # the name as-is instead of crashing on endswith(None).
    if extension and not filename.endswith(extension):
        new_filename = '{}.{}'.format(filename, extension)
        shutil.move(filename, new_filename)
    return new_filename
def get_html(url):
    """Fetch *url* over HTTP and return the response body as text."""
    return requests.get(url).text
def convert_to_markdown(html, flavor=DEFAULT_FLAVOR):
    """Convert an HTML string to markdown by piping it through pandoc.

    See http://pandoc.org/MANUAL.html for the flags used. Several pandoc
    extensions (raw_html, native_divs, native_spans, header_attributes,
    link_attributes) are disabled to strip google-doc cruft.

    :param html: HTML document as a text string.
    :param flavor: pandoc output format (e.g. ``markdown_github``).
    :returns: pandoc's stdout as bytes; pandoc's stderr, if any, is
        echoed to this process's stderr.
    """
    target = (
        '{}-raw_html-native_divs-'
        'native_spans-header_attributes-link_attributes'
    ).format(flavor)
    # An argument list (shell=False) avoids shell quoting/injection
    # issues with the interpolated flavor string.
    process = subprocess.Popen(
        ['pandoc', '--smart', '--wrap=none', '--atx-headers',
         '-f', 'html', '-t', target],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout_data, stderr_data = process.communicate(input=html.encode('utf-8'))
    if stderr_data:
        sys.stderr.write(stderr_data.decode('utf-8', 'replace'))
    return stdout_data
def degooglify_url(url):
    """Resolve a google redirect URL to its real destination.

    Google docs rewrite outbound links as ``https://www.google.com/url?q=<real>``;
    this extracts the ``q`` query parameter. URLs that do not contain
    ``google``, or that have no ``q`` parameter, are returned unchanged.

    :param url: the href value to clean.
    :returns: the de-googlified URL string.
    """
    # Local import works on both Python 3 (urllib.parse) and Python 2
    # (urlparse module).
    try:
        from urllib.parse import urlparse, parse_qs
    except ImportError:
        from urlparse import urlparse, parse_qs
    if 'google' not in url:
        return url
    query_parameters = parse_qs(urlparse(url).query)
    query = query_parameters.get('q')
    return query[0] if query else url
def clean_html(html):
    """Strip google-doc cruft from an HTML document and localize images.

    Removes the ``<div id="footer">``, rewrites every anchor href through
    :func:`degooglify_url`, and downloads every ``<img>`` to the current
    working directory (renaming its ``src`` to the local filename).

    :param html: raw HTML text.
    :returns: the cleaned HTML as a text string.
    """
    soup = bs4.BeautifulSoup(html, 'lxml')
    footer = soup.find('div', {'id': 'footer'})
    if footer:
        footer.decompose()
    # href=True skips anchors without an href (e.g. <a name=...>), which
    # would otherwise raise KeyError.
    for a_tag in soup.find_all('a', href=True):
        a_tag['href'] = degooglify_url(a_tag['href'])
    for img in soup.find_all('img'):
        # .get avoids KeyError when the alt attribute is missing.
        alt = img.get('alt')
        if alt:
            filename = 'image-{}'.format(alt)
        else:
            # md5 requires bytes on Python 3; encode the src first.
            digest = hashlib.md5(img['src'].encode('utf-8')).hexdigest()
            filename = 'image-{}'.format(digest)
        img['src'] = download_image(img['src'], filename)
    return str(soup)
def main(url):
    """Fetch a google doc, clean it, and print it as markdown on stdout.

    :param url: publicly readable google-doc (or any HTML page) URL.
    """
    html = get_html(url)
    cleaned = clean_html(html)
    markdown = convert_to_markdown(cleaned)
    # convert_to_markdown returns pandoc's stdout as bytes; decode so
    # print emits the text itself rather than a bytes repr.
    if isinstance(markdown, bytes):
        markdown = markdown.decode('utf-8', 'replace')
    print(markdown)
if __name__ == '__main__':
    # Fail with a usage message instead of an IndexError traceback when
    # the URL argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} GOOGLE_DOC_URL'.format(sys.argv[0]))
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment