Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ColinTalbert/5ef8e36b4fdb3be713f649730a4c1858 to your computer and use it in GitHub Desktop.
Save ColinTalbert/5ef8e36b4fdb3be713f649730a4c1858 to your computer and use it in GitHub Desktop.
Convert FGDC XMLs to Formatted Word docx.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## An example of how to batch convert FGDC XML documents to a rendered Word format (docx)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from docx import Document\n",
" \n",
"from pymdwizard.core import review_utils\n",
"from docx.shared import Inches\n",
"from pymdwizard.core.review_utils import _get_longname, _add_child_content\n",
"from pymdwizard.core import xml_utils\n",
"\n",
"import glob"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def md_to_docx(xml_fname, docx_fname):\n",
"    \"\"\"Render one FGDC XML metadata record as a formatted Word document.\n",
"\n",
"    The document starts with a 'Metadata:' heading and a list of the long\n",
"    names of the record's top-level sections, followed by one titled\n",
"    block per section holding that section's content.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    xml_fname : str\n",
"        Path of the FGDC XML file; passed to xml_utils.XMLRecord.\n",
"    docx_fname : str\n",
"        Path the Word document is saved to.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"    \"\"\"\n",
"    document = Document()\n",
"\n",
"    # Flag spelling and grammar as 'dirty' in the document settings\n",
"    # (presumably so Word re-runs proofing on open -- TODO confirm).\n",
"    # find() returns None when the settings part has no proofState\n",
"    # element; skip the flags then instead of failing on None.attrib.\n",
"    DOCX = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'\n",
"    proof_state = document.settings.element.find(DOCX + 'proofState')\n",
"    if proof_state is not None:\n",
"        proof_state.attrib[DOCX + 'grammar'] = 'dirty'\n",
"        proof_state.attrib[DOCX + 'spelling'] = 'dirty'\n",
"\n",
"    review_utils._load_styles(document)\n",
"\n",
"    md = xml_utils.XMLRecord(xml_fname)\n",
"    title = document.add_heading('Metadata:', level=3)\n",
"    title.style = document.styles['fgdc heading 3']\n",
"\n",
"    # First pass: an indented list of the top-level section names.\n",
"    for section in md.metadata.children:\n",
"        long_name = _get_longname(section.tag)\n",
"        link = document.add_paragraph(text=long_name, style='fgdc link')\n",
"        link.paragraph_format.left_indent = Inches(0.5)\n",
"        link.paragraph_format.line_spacing = 1\n",
"\n",
"    # Second pass: separator bar, section heading, then the section content.\n",
"    for section in md.metadata.children:\n",
"        long_name = _get_longname(section.tag)\n",
"        document.add_paragraph('_'*72, style='fgdc bar')\n",
"\n",
"        section_title = document.add_heading(long_name + ':', level=3)\n",
"        section_title.style = document.styles['fgdc heading 3']\n",
"        section_title.paragraph_format.space_after = Inches(.15)\n",
"        _add_child_content(document, section)\n",
"\n",
"    document.save(docx_fname)\n",
"\n",
"# if you would like to test this function on a single file, uncomment and update these lines\n",
"# fname = r\"C:\\Users\\talbertc\\Downloads\\MD_AGES_and FREQUENCIES.csv.xml\"\n",
"# md_to_docx(fname, fname+'.docx')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Put in your directory name here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Folder that is searched, including all subfolders, for XML records.\n",
"dname = r\"C:\\temp\\WizardScratch\"\n",
"\n",
"# Keep every *.xml path under dname except those containing a '~'.\n",
"xml_pattern = '{}/**/*.xml'.format(dname)\n",
"md_fnames = [f for f in glob.iglob(xml_pattern, recursive=True) if '~' not in f]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"processing: C:\\temp\\WizardScratch\\Bathymetric_and_hydrodynamic_surveys_of_Lake_Michigan_nearshore_near_Ogden_Dunes_Burns_Harbor_Indiana.xml\n",
"\n",
"processing: C:\\temp\\WizardScratch\\Bathymetric_surveys_of_the_Neosho_River_Spring_River_and_Elk_River_northeastern_Oklahoma_southwestern_Missouri_2016-17.xml\n",
"\n",
"processing: C:\\temp\\WizardScratch\\Bathymetric_survey_of_Rock_Run_Rookery_Lake_Will_County_Illinois_FINAL.xml\n",
"\n",
"processing: C:\\temp\\WizardScratch\\Bathymetry_and_Capacity_of_Shawnee_Reservoir_Oklahoma_2016.xml\n",
"\n",
"processing: C:\\temp\\WizardScratch\\Bathymetry_on_the_East_Fork_White_River_at_Columbus_Indiana_March_29-30_and_April_13_2017.xml\n",
"\n",
"processing: C:\\temp\\WizardScratch\\Bathymetry_on_the_Rolling_Fork_and_Beech_Fork_near_Boston_Kentucky_April_4-5_2017.xml\n",
"\n"
]
}
],
"source": [
"for fname in md_fnames:\n",
" print(\"processing: {}\\n\".format(fname))\n",
" md_to_docx(fname, fname+'.docx')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"gist": {
"data": {
"description": "Convert FGDC XMLs to Formatted Word docx.ipynb",
"public": true
},
"id": ""
},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
},
"toc": {
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment