ColinTalbert/Convert FGDC XMLs to Formatted Word docx.ipynb

## Convert FGDC XMLs to Formatted Word docx.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## An example of how to batch convert FGDC XML documents to a rendered Word format (docx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from docx import Document\n",
    "    \n",
    "from pymdwizard.core import review_utils\n",
    "from docx.shared import Inches\n",
    "from pymdwizard.core.review_utils import _get_longname, _add_child_content\n",
    "from pymdwizard.core import xml_utils\n",
    "\n",
    "import glob"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def md_to_docx(xml_fname, docx_fname):\n",
    "    \"\"\"Convert one FGDC XML metadata file into a styled Word (docx) file.\n",
    "\n",
    "    The document starts with a 'Metadata:' heading and one 'fgdc link'\n",
    "    paragraph per child of the record's metadata node, followed by one\n",
    "    block per child holding a separator bar, a heading, and whatever\n",
    "    _add_child_content writes for that child.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    xml_fname : str\n",
    "        Path of the FGDC XML file; passed to xml_utils.XMLRecord.\n",
    "    docx_fname : str\n",
    "        Path the Word document is saved to.\n",
    "    \"\"\"\n",
    "    document = Document()\n",
    "\n",
    "    # Flag the proofing state as 'dirty' (presumably so Word re-checks\n",
    "    # spelling and grammar when the file is opened -- TODO confirm).\n",
    "    # find() returns None when the settings part has no proofState element,\n",
    "    # so guard against it instead of failing on .attrib.\n",
    "    DOCX = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'\n",
    "    proof_state = document.settings.element.find(DOCX + 'proofState')\n",
    "    if proof_state is not None:\n",
    "        proof_state.attrib[DOCX + 'grammar'] = 'dirty'\n",
    "        proof_state.attrib[DOCX + 'spelling'] = 'dirty'\n",
    "\n",
    "    # NOTE(review): the 'fgdc ...' styles used below are presumably the ones\n",
    "    # registered by _load_styles -- confirm against review_utils.\n",
    "    review_utils._load_styles(document)\n",
    "\n",
    "    md = xml_utils.XMLRecord(xml_fname)\n",
    "    title2 = document.add_heading('Metadata:', level=3)\n",
    "    title2.style = document.styles['fgdc heading 3']\n",
    "\n",
    "    # First pass: one indented 'fgdc link' paragraph per top-level child.\n",
    "    for child in md.metadata.children:\n",
    "        long_name = _get_longname(child.tag)\n",
    "        link = document.add_paragraph(text=long_name, style='fgdc link')\n",
    "        link.paragraph_format.left_indent = Inches(0.5)\n",
    "        link.paragraph_format.line_spacing = 1\n",
    "\n",
    "    # Second pass: separator bar, section heading, then the child's content.\n",
    "    for child in md.metadata.children:\n",
    "        long_name = _get_longname(child.tag)\n",
    "        document.add_paragraph('_' * 72, style='fgdc bar')\n",
    "\n",
    "        section_title = document.add_heading(long_name + ':', level=3)\n",
    "        section_title.style = document.styles['fgdc heading 3']\n",
    "        section_title.paragraph_format.space_after = Inches(.15)\n",
    "        _add_child_content(document, child)\n",
    "\n",
    "    document.save(docx_fname)\n",
    "\n",
    "# If you would like to test this function on a single file, uncomment and update these lines\n",
    "# fname = r\"C:\\Users\\talbertc\\Downloads\\MD_AGES_and FREQUENCIES.csv.xml\"\n",
    "# md_to_docx(fname, fname+'.docx')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Put in your directory name here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dname = r\"C:\\temp\\WizardScratch\"\n",
    "\n",
    "# Walk dname recursively for .xml files, keeping only paths without a '~'.\n",
    "md_fnames = [\n",
    "    xml_path\n",
    "    for xml_path in glob.iglob('{}/**/*.xml'.format(dname), recursive=True)\n",
    "    if '~' not in xml_path\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing: C:\\temp\\WizardScratch\\Bathymetric_and_hydrodynamic_surveys_of_Lake_Michigan_nearshore_near_Ogden_Dunes_Burns_Harbor_Indiana.xml\n",
      "\n",
      "processing: C:\\temp\\WizardScratch\\Bathymetric_surveys_of_the_Neosho_River_Spring_River_and_Elk_River_northeastern_Oklahoma_southwestern_Missouri_2016-17.xml\n",
      "\n",
      "processing: C:\\temp\\WizardScratch\\Bathymetric_survey_of_Rock_Run_Rookery_Lake_Will_County_Illinois_FINAL.xml\n",
      "\n",
      "processing: C:\\temp\\WizardScratch\\Bathymetry_and_Capacity_of_Shawnee_Reservoir_Oklahoma_2016.xml\n",
      "\n",
      "processing: C:\\temp\\WizardScratch\\Bathymetry_on_the_East_Fork_White_River_at_Columbus_Indiana_March_29-30_and_April_13_2017.xml\n",
      "\n",
      "processing: C:\\temp\\WizardScratch\\Bathymetry_on_the_Rolling_Fork_and_Beech_Fork_near_Boston_Kentucky_April_4-5_2017.xml\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Convert every collected record; each document is saved beside its\n",
    "# source file with '.docx' appended to the full file name.\n",
    "for fname in md_fnames:\n",
    "    docx_fname = fname + '.docx'\n",
    "    print(\"processing: {}\\n\".format(fname))\n",
    "    md_to_docx(fname, docx_fname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "gist": {
   "data": {
    "description": "Convert FGDC XMLs to Formatted Word docx.ipynb",
    "public": true
   },
   "id": ""
  },
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## An example of how to batch convert FGDC XML documents to a rendered Word format (docx)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"from docx import Document\n",
	" \n",
	"from pymdwizard.core import review_utils\n",
	"from docx.shared import Inches\n",
	"from pymdwizard.core.review_utils import _get_longname, _add_child_content\n",
	"from pymdwizard.core import xml_utils\n",
	"\n",
	"import glob"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [],
	"source": [
	"def md_to_docx(xml_fname, docx_fname):\n",
	"    \"\"\"Convert one FGDC XML metadata file into a styled Word (docx) file.\n",
	"\n",
	"    The document starts with a 'Metadata:' heading and one 'fgdc link'\n",
	"    paragraph per child of the record's metadata node, followed by one\n",
	"    block per child holding a separator bar, a heading, and whatever\n",
	"    _add_child_content writes for that child.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    xml_fname : str\n",
	"        Path of the FGDC XML file; passed to xml_utils.XMLRecord.\n",
	"    docx_fname : str\n",
	"        Path the Word document is saved to.\n",
	"    \"\"\"\n",
	"    document = Document()\n",
	"\n",
	"    # Flag the proofing state as 'dirty' (presumably so Word re-checks\n",
	"    # spelling and grammar when the file is opened -- TODO confirm).\n",
	"    # find() returns None when the settings part has no proofState element,\n",
	"    # so guard against it instead of failing on .attrib.\n",
	"    DOCX = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'\n",
	"    proof_state = document.settings.element.find(DOCX + 'proofState')\n",
	"    if proof_state is not None:\n",
	"        proof_state.attrib[DOCX + 'grammar'] = 'dirty'\n",
	"        proof_state.attrib[DOCX + 'spelling'] = 'dirty'\n",
	"\n",
	"    # NOTE(review): the 'fgdc ...' styles used below are presumably the ones\n",
	"    # registered by _load_styles -- confirm against review_utils.\n",
	"    review_utils._load_styles(document)\n",
	"\n",
	"    md = xml_utils.XMLRecord(xml_fname)\n",
	"    title2 = document.add_heading('Metadata:', level=3)\n",
	"    title2.style = document.styles['fgdc heading 3']\n",
	"\n",
	"    # First pass: one indented 'fgdc link' paragraph per top-level child.\n",
	"    for child in md.metadata.children:\n",
	"        long_name = _get_longname(child.tag)\n",
	"        link = document.add_paragraph(text=long_name, style='fgdc link')\n",
	"        link.paragraph_format.left_indent = Inches(0.5)\n",
	"        link.paragraph_format.line_spacing = 1\n",
	"\n",
	"    # Second pass: separator bar, section heading, then the child's content.\n",
	"    for child in md.metadata.children:\n",
	"        long_name = _get_longname(child.tag)\n",
	"        document.add_paragraph('_' * 72, style='fgdc bar')\n",
	"\n",
	"        section_title = document.add_heading(long_name + ':', level=3)\n",
	"        section_title.style = document.styles['fgdc heading 3']\n",
	"        section_title.paragraph_format.space_after = Inches(.15)\n",
	"        _add_child_content(document, child)\n",
	"\n",
	"    document.save(docx_fname)\n",
	"\n",
	"# If you would like to test this function on a single file, uncomment and update these lines\n",
	"# fname = r\"C:\\Users\\talbertc\\Downloads\\MD_AGES_and FREQUENCIES.csv.xml\"\n",
	"# md_to_docx(fname, fname+'.docx')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Put in your directory name here"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"dname = r\"C:\\temp\\WizardScratch\"\n",
	"\n",
	"# With recursive=True, '**' matches dname itself and every nested folder,\n",
	"# so '*.xml' files at any depth are collected.\n",
	"md_fnames = list(glob.iglob('{}/**/*.xml'.format(dname), recursive=True))\n",
	"# Drop any path that contains a '~'.\n",
	"md_fnames = [f for f in md_fnames if '~' not in f]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"processing: C:\\temp\\WizardScratch\\Bathymetric_and_hydrodynamic_surveys_of_Lake_Michigan_nearshore_near_Ogden_Dunes_Burns_Harbor_Indiana.xml\n",
	"\n",
	"processing: C:\\temp\\WizardScratch\\Bathymetric_surveys_of_the_Neosho_River_Spring_River_and_Elk_River_northeastern_Oklahoma_southwestern_Missouri_2016-17.xml\n",
	"\n",
	"processing: C:\\temp\\WizardScratch\\Bathymetric_survey_of_Rock_Run_Rookery_Lake_Will_County_Illinois_FINAL.xml\n",
	"\n",
	"processing: C:\\temp\\WizardScratch\\Bathymetry_and_Capacity_of_Shawnee_Reservoir_Oklahoma_2016.xml\n",
	"\n",
	"processing: C:\\temp\\WizardScratch\\Bathymetry_on_the_East_Fork_White_River_at_Columbus_Indiana_March_29-30_and_April_13_2017.xml\n",
	"\n",
	"processing: C:\\temp\\WizardScratch\\Bathymetry_on_the_Rolling_Fork_and_Beech_Fork_near_Boston_Kentucky_April_4-5_2017.xml\n",
	"\n"
	]
	}
	],
	"source": [
	"# Convert every collected record; each document is saved beside its\n",
	"# source file with '.docx' appended to the full file name.\n",
	"for fname in md_fnames:\n",
	"    docx_fname = fname + '.docx'\n",
	"    print(\"processing: {}\\n\".format(fname))\n",
	"    md_to_docx(fname, docx_fname)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"gist": {
	"data": {
	"description": "Convert FGDC XMLs to Formatted Word docx.ipynb",
	"public": true
	},
	"id": ""
	},
	"kernelspec": {
	"display_name": "Python [default]",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.3"
	},
	"toc": {
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}