Skip to content

Instantly share code, notes, and snippets.

@stringertheory
Last active September 17, 2016 05:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stringertheory/3019e4caa785e61b17bdf2b9f6d1837c to your computer and use it in GitHub Desktop.
hack to get markdown, dammit
"""Convert a google doc to markdown with all of the cruft removed.
"""
import hashlib
import imghdr
import shutil
import subprocess
import sys
import urllib
import urlparse
import bs4
import requests
# Pandoc output format used when the caller does not specify one.
DEFAULT_FLAVOR = 'markdown_github'
def download_image(url, filename):
    """Download the image at *url* to *filename* and return the final path.

    The image type is sniffed from the downloaded bytes with ``imghdr``;
    if the detected extension is not already the filename's suffix, the
    file is renamed to ``<filename>.<extension>``.

    :param url: URL of the image to fetch.
    :param filename: local path to save the image under.
    :returns: the (possibly extension-suffixed) local filename.
    """
    # Local import keeps the function working on both Python 3
    # (urllib.request) and Python 2 (urllib).
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    urlretrieve(url, filename)
    extension = imghdr.what(filename)
    new_filename = filename
    # imghdr.what returns None for unrecognized data; in that case keep
    # the name as-is instead of crashing on endswith(None).
    if extension and not filename.endswith(extension):
        new_filename = '{}.{}'.format(filename, extension)
        shutil.move(filename, new_filename)
    return new_filename
def get_html(url):
    """Fetch *url* over HTTP and return the response body as text."""
    return requests.get(url).text
def convert_to_markdown(html, flavor=DEFAULT_FLAVOR):
    """Convert an HTML string to markdown by piping it through pandoc.

    See http://pandoc.org/MANUAL.html for the flags used. Several pandoc
    extensions (raw_html, native_divs, native_spans, header_attributes,
    link_attributes) are disabled to strip google-doc cruft.

    :param html: HTML document as a text string.
    :param flavor: pandoc output format (e.g. ``markdown_github``).
    :returns: pandoc's stdout as bytes; pandoc's stderr, if any, is
        echoed to this process's stderr.
    """
    target = (
        '{}-raw_html-native_divs-'
        'native_spans-header_attributes-link_attributes'
    ).format(flavor)
    # An argument list (shell=False) avoids shell quoting/injection
    # issues with the interpolated flavor string.
    process = subprocess.Popen(
        ['pandoc', '--smart', '--wrap=none', '--atx-headers',
         '-f', 'html', '-t', target],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    stdout_data, stderr_data = process.communicate(input=html.encode('utf-8'))
    if stderr_data:
        sys.stderr.write(stderr_data.decode('utf-8', 'replace'))
    return stdout_data
def degooglify_url(url):
    """Resolve a google redirect URL to its real destination.

    Google docs rewrite outbound links as ``https://www.google.com/url?q=<real>``;
    this extracts the ``q`` query parameter. URLs that do not contain
    ``google``, or that have no ``q`` parameter, are returned unchanged.

    :param url: the href value to clean.
    :returns: the de-googlified URL string.
    """
    # Local import works on both Python 3 (urllib.parse) and Python 2
    # (urlparse module).
    try:
        from urllib.parse import urlparse, parse_qs
    except ImportError:
        from urlparse import urlparse, parse_qs
    if 'google' not in url:
        return url
    query_parameters = parse_qs(urlparse(url).query)
    query = query_parameters.get('q')
    return query[0] if query else url
def clean_html(html):
    """Strip google-doc cruft from an HTML document and localize images.

    Removes the ``<div id="footer">``, rewrites every anchor href through
    :func:`degooglify_url`, and downloads every ``<img>`` to the current
    working directory (renaming its ``src`` to the local filename).

    :param html: raw HTML text.
    :returns: the cleaned HTML as a text string.
    """
    soup = bs4.BeautifulSoup(html, 'lxml')
    footer = soup.find('div', {'id': 'footer'})
    if footer:
        footer.decompose()
    # href=True skips anchors without an href (e.g. <a name=...>), which
    # would otherwise raise KeyError.
    for a_tag in soup.find_all('a', href=True):
        a_tag['href'] = degooglify_url(a_tag['href'])
    for img in soup.find_all('img'):
        # .get avoids KeyError when the alt attribute is missing.
        alt = img.get('alt')
        if alt:
            filename = 'image-{}'.format(alt)
        else:
            # md5 requires bytes on Python 3; encode the src first.
            digest = hashlib.md5(img['src'].encode('utf-8')).hexdigest()
            filename = 'image-{}'.format(digest)
        img['src'] = download_image(img['src'], filename)
    return str(soup)
def main(url):
    """Fetch a google doc, clean it, and print it as markdown on stdout.

    :param url: publicly readable google-doc (or any HTML page) URL.
    """
    html = get_html(url)
    cleaned = clean_html(html)
    markdown = convert_to_markdown(cleaned)
    # convert_to_markdown returns pandoc's stdout as bytes; decode so
    # print emits the text itself rather than a bytes repr.
    if isinstance(markdown, bytes):
        markdown = markdown.decode('utf-8', 'replace')
    print(markdown)
if __name__ == '__main__':
    # Fail with a usage message instead of an IndexError traceback when
    # the URL argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} GOOGLE_DOC_URL'.format(sys.argv[0]))
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment