Skip to content

Instantly share code, notes, and snippets.

@trevormunoz
Created February 28, 2014 20:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save trevormunoz/9278681 to your computer and use it in GitHub Desktop.
Save trevormunoz/9278681 to your computer and use it in GitHub Desktop.
Documenting script used to batch convert HTML articles from DH Curation Guide to Markdown
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "DH Curation Guide - Content Conversion"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "import os",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": "!ls .",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "DH Curation Guide - Content Conversion.ipynb\r\n\u001b[1m\u001b[36mabout\u001b[m\u001b[m\r\nabout.1.html\r\n\u001b[1m\u001b[36mcollections\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mcontents\u001b[m\u001b[m\r\n\u001b[1m\u001b[36meditors\u001b[m\u001b[m\r\neditors.1.html\r\n\u001b[1m\u001b[36mfaq\u001b[m\u001b[m\r\nglossary.html\r\n\u001b[1m\u001b[36mimages\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mincludes\u001b[m\u001b[m\r\nindex.html\r\n\u001b[1m\u001b[36mintro\u001b[m\u001b[m\r\nintro.1.html\r\n\u001b[1m\u001b[36mlegal\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mmetadata\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mplanning\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mpreservation\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mprivacy\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mrepositories\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mrepresentation\u001b[m\u001b[m\r\nrepresentation.1.html\r\n\u001b[1m\u001b[36mresearch-practices\u001b[m\u001b[m\r\nrobots.txt\r\nsitemap.html\r\n\u001b[1m\u001b[36mstorage\u001b[m\u001b[m\r\n\u001b[1m\u001b[36msubmissions\u001b[m\u001b[m\r\n"
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": "downloaded_data = os.curdir",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Plain files sitting directly in the crawl root. Each name is joined onto\n# downloaded_data before the isfile() test so the check does not depend on\n# the crawl root also being the current working directory.\nfiles = [f for f in os.listdir(downloaded_data)\n         if os.path.isfile(os.path.join(downloaded_data, f))]",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": "files",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": "['.DS_Store',\n 'about.1.html',\n 'DH Curation Guide - Content Conversion.ipynb',\n 'editors.1.html',\n 'glossary.html',\n 'index.html',\n 'intro.1.html',\n 'representation.1.html',\n 'robots.txt',\n 'sitemap.html']"
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Sub-directories sitting directly in the crawl root. Each name is joined\n# onto downloaded_data before the isdir() test so the check does not depend\n# on the crawl root also being the current working directory.\nfolders = [fld for fld in os.listdir(downloaded_data)\n           if os.path.isdir(os.path.join(downloaded_data, fld))]",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": "folders",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 7,
"text": "['about',\n 'collections',\n 'contents',\n 'editors',\n 'faq',\n 'images',\n 'includes',\n 'intro',\n 'legal',\n 'metadata',\n 'planning',\n 'preservation',\n 'privacy',\n 'repositories',\n 'representation',\n 'research-practices',\n 'storage',\n 'submissions']"
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Collect candidate page paths from every top-level folder of the crawl.\n# NOTE: the outputs saved in later cells were produced before the path fix\n# below; re-run the notebook to refresh them.\ncrawl_root = os.path.abspath(downloaded_data)\n\npages = []\nfor folder in folders:\n    for dirpath, dirnames, filenames in os.walk(os.path.join(crawl_root, folder)):\n        if not dirnames:\n            # Leaf directory: keep every file it contains.\n            pages.extend(os.path.join(dirpath, f) for f in filenames)\n        elif len(dirnames) == 1 and len(filenames) == 1:\n            # The lone file is an entry of dirpath itself, so it is joined to\n            # dirpath only. Joining it under dirnames[0] pointed at a path that\n            # either does not exist or repeats a file that os.walk reports again\n            # when it descends into that sub-directory.\n            # Requiring exactly one file also avoids indexing an empty list.\n            pages.append(os.path.join(dirpath, filenames[0]))",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": "# The two pages that live in the crawl root itself are added by hand,\n# in this order: index first, then the glossary.\npages.extend(\n    os.path.join(os.path.abspath(downloaded_data), top_level_page)\n    for top_level_page in ('index.html', 'glossary.html')\n)",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": "len(pages)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": "44"
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": "pages",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"text": "['/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/about/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/collections/standards/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/collections/standards/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/contents/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/editors/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/faq/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/about.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/classics.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/collections.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/data-rep.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/data-sec-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/editors.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/faq.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/fedora-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/glossary.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/history-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/intro.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/metadata-stub.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/planning-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/policy.jpg',\n 
'/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/preservation-stub.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/repositories-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/research.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/standards.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/includes/jamieslib.js',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/includes/jquery.cookie.js',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/includes/print.css',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/includes/style.css',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/intro/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/legal/policy/policy.1.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/legal/policy/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/metadata/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/planning/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/preservation/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/privacy/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/repositories/fedora/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/repositories/fedora/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/representation/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/research-practices/classics/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/research-practices/history/index.html',\n 
'/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/storage/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/submissions/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/glossary.html']"
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": "def filter_for_html(obj):\n    \"\"\"Return True when the path string `obj` ends with '.html', else False.\"\"\"\n    return obj.endswith('.html')",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Keep only the HTML pages. list() makes the result a real list on Python 3\n# as well, where filter() is lazy, so len(content) and repeated iteration in\n# the following cells keep working.\ncontent = list(filter(filter_for_html, pages))",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": "len(content)",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 14,
"text": "22"
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": "content",
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 15,
"text": "['/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/about/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/collections/standards/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/collections/standards/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/contents/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/editors/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/faq/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/intro/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/legal/policy/policy.1.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/legal/policy/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/metadata/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/planning/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/preservation/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/privacy/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/repositories/fedora/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/repositories/fedora/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/representation/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/research-practices/classics/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/research-practices/history/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/storage/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/submissions/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/index.html',\n 
'/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/glossary.html']"
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": "output_dir = \"/Users/libraries/Downloads/dhcuration-guide-markdown\"",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": "import subprocess\n\n# Make sure the target directory exists before pandoc is asked to write to it.\nif not os.path.isdir(output_dir):\n    os.makedirs(output_dir)\n\nfailed = []\nfor c in content:\n    parent_dir, file_name = os.path.split(c)\n    # Name each output <parent folder>-<file stem>.md. Only the file name is\n    # cut at its first dot: cutting the joined name there reduced both\n    # top-level pages under 'guide.dhcuration.org' to 'guide.md', so the\n    # second conversion overwrote the first.\n    new_filename = os.path.basename(parent_dir) + '-' + file_name.split('.')[0] + '.md'\n    status = subprocess.call(['pandoc', '-f', 'html', '-t', 'markdown', c,\n                              '-o', os.path.join(output_dir, new_filename)])\n    if status != 0:\n        # Keep converting the remaining pages, but remember what failed.\n        failed.append(c)\n\n# Sources pandoc could not convert (an empty list means every page succeeded).\nfailed",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 17
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment