Created
February 28, 2014 20:00
-
-
Save trevormunoz/9278681 to your computer and use it in GitHub Desktop.
Documenting script used to batch convert HTML articles from DH Curation Guide to Markdown
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "DH Curation Guide - Content Conversion" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import os", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "!ls .", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "DH Curation Guide - Content Conversion.ipynb\r\n\u001b[1m\u001b[36mabout\u001b[m\u001b[m\r\nabout.1.html\r\n\u001b[1m\u001b[36mcollections\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mcontents\u001b[m\u001b[m\r\n\u001b[1m\u001b[36meditors\u001b[m\u001b[m\r\neditors.1.html\r\n\u001b[1m\u001b[36mfaq\u001b[m\u001b[m\r\nglossary.html\r\n\u001b[1m\u001b[36mimages\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mincludes\u001b[m\u001b[m\r\nindex.html\r\n\u001b[1m\u001b[36mintro\u001b[m\u001b[m\r\nintro.1.html\r\n\u001b[1m\u001b[36mlegal\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mmetadata\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mplanning\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mpreservation\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mprivacy\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mrepositories\u001b[m\u001b[m\r\n\u001b[1m\u001b[36mrepresentation\u001b[m\u001b[m\r\nrepresentation.1.html\r\n\u001b[1m\u001b[36mresearch-practices\u001b[m\u001b[m\r\nrobots.txt\r\nsitemap.html\r\n\u001b[1m\u001b[36mstorage\u001b[m\u001b[m\r\n\u001b[1m\u001b[36msubmissions\u001b[m\u001b[m\r\n" | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "downloaded_data = os.curdir", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# os.listdir returns bare names, so join each one onto downloaded_data before testing it;\n# a bare name would be resolved against the working directory instead.\nfiles = [f for f in os.listdir(downloaded_data)\n         if os.path.isfile(os.path.join(downloaded_data, f))]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "files", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 5, | |
"text": "['.DS_Store',\n 'about.1.html',\n 'DH Curation Guide - Content Conversion.ipynb',\n 'editors.1.html',\n 'glossary.html',\n 'index.html',\n 'intro.1.html',\n 'representation.1.html',\n 'robots.txt',\n 'sitemap.html']" | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# os.listdir returns bare names, so join each one onto downloaded_data before testing it;\n# a bare name would be resolved against the working directory instead.\nfolders = [fld for fld in os.listdir(downloaded_data)\n           if os.path.isdir(os.path.join(downloaded_data, fld))]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "folders", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 7, | |
"text": "['about',\n 'collections',\n 'contents',\n 'editors',\n 'faq',\n 'images',\n 'includes',\n 'intro',\n 'legal',\n 'metadata',\n 'planning',\n 'preservation',\n 'privacy',\n 'repositories',\n 'representation',\n 'research-practices',\n 'storage',\n 'submissions']" | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# Gather every file beneath each top-level folder, at any depth.\n# os.walk yields each directory together with the files it directly contains, so joining\n# dirpath with its own filenames covers the whole tree. The previous branch logic joined a\n# parent directory's file name onto its single child directory, which repeated paths such as\n# collections/standards/index.html, skipped files in any directory matching neither branch,\n# and raised IndexError when a directory held one subdirectory and no files.\ncrawl_root = os.path.abspath(downloaded_data)\npages = []\nfor folder in folders:\n    for dirpath, dirnames, filenames in os.walk(os.path.join(crawl_root, folder)):\n        pages.extend(os.path.join(dirpath, f) for f in filenames)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# index.html and glossary.html sit directly in downloaded_data rather than inside one of\n# the folders, so they are added by name. The membership check keeps a re-run of this cell\n# from appending them a second time.\ncrawl_root = os.path.abspath(downloaded_data)\nfor top_level_page in ('index.html', 'glossary.html'):\n    top_level_path = os.path.join(crawl_root, top_level_page)\n    if top_level_path not in pages:\n        pages.append(top_level_path)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "len(pages)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": "44" | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "pages", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": "['/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/about/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/collections/standards/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/collections/standards/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/contents/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/editors/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/faq/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/about.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/classics.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/collections.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/data-rep.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/data-sec-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/editors.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/faq.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/fedora-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/glossary.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/history-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/intro.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/metadata-stub.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/planning-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/policy.jpg',\n 
'/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/preservation-stub.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/repositories-coming-soon.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/research.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/images/grid/standards.jpg',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/includes/jamieslib.js',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/includes/jquery.cookie.js',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/includes/print.css',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/includes/style.css',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/intro/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/legal/policy/policy.1.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/legal/policy/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/metadata/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/planning/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/preservation/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/privacy/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/repositories/fedora/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/repositories/fedora/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/representation/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/research-practices/classics/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/research-practices/history/index.html',\n 
'/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/storage/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/submissions/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/glossary.html']" | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "def filter_for_html(obj):\n    \"\"\"Return True when obj ends with '.html', otherwise False.\"\"\"\n    return obj.endswith('.html')", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# Build a real list so len(content) and repeated iteration also work on Python 3,\n# where filter() returns a one-shot iterator instead of a list.\ncontent = [page for page in pages if filter_for_html(page)]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "len(content)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 14, | |
"text": "22" | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "content", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"metadata": {}, | |
"output_type": "pyout", | |
"prompt_number": 15, | |
"text": "['/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/about/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/collections/standards/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/collections/standards/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/contents/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/editors/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/faq/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/intro/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/legal/policy/policy.1.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/legal/policy/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/metadata/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/planning/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/preservation/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/privacy/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/repositories/fedora/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/repositories/fedora/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/representation/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/research-practices/classics/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/research-practices/history/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/storage/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/submissions/index.html',\n '/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/index.html',\n 
'/Users/libraries/Downloads/dhcuration-crawl/guide.dhcuration.org/glossary.html']" | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "output_dir = \"/Users/libraries/Downloads/dhcuration-guide-markdown\"", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import subprocess\n\n# Create the output directory up front so pandoc has somewhere to write.\nif not os.path.isdir(output_dir):\n    os.makedirs(output_dir)\n\nfor c in content:\n    # Name each output after the page's parent directory and file name,\n    # e.g. .../about/index.html -> about-index.md.\n    parent_name = os.path.basename(os.path.dirname(c))\n    page_name = os.path.basename(c)\n    # Strip only the final extension. Cutting at the first '.' turned both top-level pages\n    # under guide.dhcuration.org into 'guide.md', so one conversion overwrote the other.\n    stem = os.path.splitext(parent_name + '-' + page_name)[0]\n    target = os.path.join(output_dir, stem + '.md')\n    status = subprocess.call(['pandoc', '-f', 'html', '-t', 'markdown', c, '-o', target])\n    if status != 0:\n        # Report the failure instead of silently leaving the page unconverted.\n        print('pandoc exited with status %d for %s' % (status, c))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 17 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment