Skip to content

Instantly share code, notes, and snippets.

@richstoner
Created March 6, 2013 07:13
Show Gist options
  • Save richstoner/5097346 to your computer and use it in GitHub Desktop.
Save richstoner/5097346 to your computer and use it in GitHub Desktop.
Guidelines parsing example
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "full_parse_nb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": "# resulting data structure (CHAPTERS)\n# list of chapters\n#   each chapter is a dict\n#     chapter name ('title')\n#     chapter number ('index')\n#     topic list ('topics') -> array of topics\n#       each topic is a dict\n#\n# topic\n#   topic name ('name')\n#   topic id ('id')\n#   section ('section')\n#   preformatted html string ('html')\n#   unformatted other content ('raw')",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 331
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Rich Stoner, 2013",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 332
},
{
"cell_type": "code",
"collapsed": false,
"input": "import os, textwrap\nimport csv\nimport re\nfrom IPython.core.display import HTML\nimport os",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 321
},
{
"cell_type": "markdown",
"metadata": {},
"source": "### previous sqlite table\n\n    td['tid'] = int(row[18])\n    td['chapter'] = int(row[13])\n    td['name'] = row[19]\n    td['synonyms'] = row[20]\n    td['def2'] = row[21]\n    td['depi'] = row[22]\n    td['deti'] = row[23]\n    td['ccsi'] = row[24]\n    td['ccf'] = row[25]\n    td['morph'] = row[26]\n    td['phe'] = row[27]\n    td['gen'] = row[28]\n    td['prog'] = row[29]\n    td['crit2'] = row[30]\n    td['critdd'] = row[31]\n    td['vari'] = row[32]\n    td['lab'] = row[33]\n    td['ref'] = row[34]\n    td['key'] = row[35]"
},
{
"cell_type": "markdown",
"metadata": {},
"source": "### build new tables "
},
{
"cell_type": "code",
"collapsed": false,
"input": "ff = '''\nDefinition,\nSynonym(s),\nEpidemiology,\nEtiology,\nSites of Involvement,\nClinical Features,\nMorphology,\nImmunophenotype,\nGenetics,\nPrognosis,\nCriteria'''.split(',')\nfullnames = []\nfor f in ff:\n fullnames.append(f.strip())\n\nt = '''definition_svg,\nsynonym_svg,\nepidemiology_svg,\netiology_svg,\nsites_svg,\nclinical_svg,\nmorphology_svg,\nimmmunophenotype_svg,\ngenetics_svg,\nprognosis_svg,\ncriteria_svg,\ndiagnosis_summary,\nkey_points,\nreference_page,\nreferences,\nurl,\nsynonym,\ndefinition,\nepidemiology,\netiology,\nsites,\nclinical,\nmorphology,\nimmunophenotype,\ngenetics,\nprognosis,\ncriteria,\nddx,\nlabtest,\nvariants,\ntags,\ncheck,\nfound,\nchapter,\npage,\nchapter_name,\nsection,\ntopic_number,\ntopic_name,\ntopic_summ,\ntopic_summ_template'''\n\n\nall_fields = []\nfor ss in t.split(','):\n# print ss.strip()\n all_fields.append(ss.strip())\n \nsvg_fields = []\nfor a in all_fields:\n if 'svg' in a:\n svg_fields.append(a)\n",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 330
},
{
"cell_type": "code",
"collapsed": false,
"input": "# header string for html formatting: doctype, <html>, <head> and the shared page CSS.\n# It stops right after the shared <style> block without closing <head>;\n# buildTopicDict appends the per-topic <style> rules and then the <body> markup.\nheaderstr = '''\n\n<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n<html lang=\"en\">\n <head>\n <meta name=\"generator\" content=\"HTML Tidy for Mac OS X (vers 31 October 2006 - Apple Inc. build 15.10), see www.w3.org\">\n <link rel=\"stylesheet\" type=\"text/css\" href=\"../bootstrap/css/bootstrap.css\">\n <link rel=\"stylesheet\" type=\"text/css\" href=\"../bootstrap/css/bootstrap-responsive.css\">\n\n<style type=\"text/css\">\n\n body {\n background-color: #eee;\n /* font-size: 80%;*/\n }\n\n .topic-container {\n/* margin: 20px auto;*/\n margin:0px;\n background-color: #eee;\n padding: 5px;\n padding-left:10px;\n padding-right:10px;\n border-radius: 0px;\n }\n\n .subtopic-container {\n margin: 5px;\n margin-bottom: 20px;\n padding: 2px 20px 20px 20px;\n background-color: #fff;\n border-radius: 5px;\n box-shadow: 0px 4px 10px #999;\n }\n\n h1,h2,h3,h4,h5\n {\n font-family: \"HelveticaNeue-CondensedBold\";\n }\n\n </style>'''",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 333
},
{
"cell_type": "code",
"collapsed": false,
"input": "from BeautifulSoup import BeautifulSoup\n\n# figured as much\n#In ASCII, &#09; is a tab.\n\ndef getParsedArrays(xmlString, _i, _d):\n soup = BeautifulSoup(xmlString)\n\n # parse style here\n styleList = soup.findAll('style')\n stylestring = ''\n if len(styleList) > 0:\n \n stylestring = '<style type=\"text/css\">\\n'\n for n,t in enumerate(styleList):\n cssClassName = '.svg-%s-%d-%d ' % (str(n), _i, _d)\n \n cssContent = t.contents[0].split(',')[0]\n cssContent = cssContent.replace('font-family: \\'Verdana\\';', '')\n cssContent = cssContent.replace('font-size: 16px;', '')\n cssContent = cssContent.replace('font-size: 8px;','')\n \n if len(cssContent) > 2:\n \n stylestring += cssClassName\n stylestring += '{'\n stylestring += cssContent\n stylestring += '}\\n'\n \n stylestring = stylestring.replace('\"', '')\n stylestring = stylestring + '</style>'\n\n # parse content here\n spanlist = soup.findAll('span')\n \n if len(spanlist) > 0:\n \n bodystr = ''\n bodystr += '<div class=\"subtopic-container\"><h3>%s</h3>' % fullnames[_i]\n \n \n for n,t in enumerate(spanlist):\n tempstr = ''\n for c in t.contents:\n \n tempstr += str(c)\n \n datastr = '<span class=\"svg-%s-%d-%d\">%s</span>' % (str(n), _i, _d, str(tempstr))\n bodystr += datastr\n \n bodystr += '</div>'\n bodystr = bodystr.replace('&#9;', '&nbsp;&nbsp;&nbsp;&nbsp;') \n\n return [stylestring, bodystr]\n else:\n return ['','']\n",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 325
},
{
"cell_type": "code",
"collapsed": false,
"input": "# build list of chapters, print chapters that don't have chapter name or number set\npv_file = 'fullexport.csv'\n\ndef getChapterList(csv_input):\n \n chapter_index = all_fields.index('chapter')\n chapter_name_index = all_fields.index('chapter_name')\n \n chapter_list = []\n \n csvReader = csv.reader(open(csv_input, 'r'), delimiter=',')\n \n for row in csvReader:\n chapter_list.append([row[chapter_index], row[chapter_name_index]])\n \n if row[33] == '':\n print row[38]\n \n \n return chapter_list\n\nchapterList = getChapterList(pv_file)\n ",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Chronic myeloid leukemia, BCR-ABL1-positive (CML)"
},
{
"output_type": "stream",
"stream": "stdout",
"text": "\nMyelodysplastic Syndrome (General)\nPrimary Cutaneous Marginal Zone Lymphoma (MZL)\nLangerhans cell histiocytosis (LCH)\nLangerhans Cell Sarcoma (LCS)\nInterdigitating Dendritic Cell Sarcoma (IDCS)\nFibroblastic Reticular Cell Tumor (FRCT)\nIndeterminate Dendritic Cell Tumor (IDCT)\n"
}
],
"prompt_number": 327
},
{
"cell_type": "code",
"collapsed": false,
"input": "# rebuild cleanly formatted chapter list\nchapter_index = all_fields.index('chapter')\nchapter_name_index = all_fields.index('chapter_name')\n\nchaps = ['02', '03', '04', '05', '06', '07', '09', '10.1', '10.2', '11', '12', '13', '14']\n\ntitles = []\ntitles += ['Myeloproliferative Neoplasms (MPN)']\ntitles += ['Myeloid and Lymphoid Neoplasms with Eosinophilia and Abnormalities of PDGFRA, PDGFRB or FGFR1']\ntitles += ['Myelodysplastic-Myeloproliferative neoplasms (MDS/MPN)']\ntitles += ['Myelodysplastic syndromes (MDS)']\ntitles += ['Acute myeloid leukemia (AML) and related precursor neoplasms']\ntitles += ['Acute leukemias of ambiguous lineage']\ntitles += ['Precursor Lymphoid Neoplasms']\ntitles += ['Mature B-cell Neoplasms']\ntitles += ['Plasma cell Neoplasms']\ntitles += ['Mature T- and NK-cell neoplasms']\ntitles += ['Hodgkin Lymphoma']\ntitles += ['Immunodeficiency-associated lymphoproliferative disorders']\ntitles += ['Histiocytic and dendritic cell neoplasms']\n\nCHAPTERS = []\n\n\ndef buildTopicDict(row, i):\n ''' this function generates all of the topic metadata from a CSV row'''\n\n td = {}\n td['name'] = row[all_fields.index('topic_name')]\n td['id'] = row[all_fields.index('topic_number')]\n td['section'] = row[all_fields.index('section')] \n td['html'] = 'test'\n \n raw_list = ['definition', 'synonym', 'topic_name', 'epidemiology', 'etiology', 'sites', 'clinical', 'immunophenotype', 'genetics', 'prognosis', 'morphology', 'criteria', 'ddx', 'variants', 'labtest'] \n raw_dict = {}\n for j,r in enumerate(raw_list):\n raw_dict[r] = (row[all_fields.index(r)])\n td['raw'] = raw_dict\n \n formatted_html = headerstr\n \n style_html = ''\n body_html = ''\n body_html += '<div class=\"topic-container\"><h1>%s</h1>' % (td['name'])\n \n for n,a in enumerate(svg_fields):\n [partial_style, partial_body] = getParsedArrays(row[all_fields.index(a)], n, i)\n \n style_html += partial_style\n body_html += partial_body\n \n body_html += '</div>'\n \n formatted_html 
+= style_html\n formatted_html += '<body><div class=\"container\"><div class=\"row\"><div class=\"span12\">'\n \n formatted_html += body_html.decode('utf-8')\n formatted_html += '</div></div></div></body>'\n \n td['html'] = formatted_html\n \n return td\n\n# compile chapters and topic into one large data object\nfor n,c in enumerate(chaps):\n \n chapter = {}\n chapter['title'] = titles[n]\n chapter['index'] = c\n \n csvReader = csv.reader(open(pv_file, 'r'), delimiter=',')\n \n chapter_info = []\n \n for row in csvReader:\n \n if c in row[chapter_name_index]:\n chapter_info.append(buildTopicDict(row, n))\n \n elif c == '02' and 'CML' in row[38]:\n chapter_info.append(buildTopicDict(row,n)) \n \n elif c == '05' and 'Myelodysplastic Syndrome (General)' == row[38]:\n chapter_info.append(buildTopicDict(row,n))\n \n # add additional special cases here \n \n chapter['topics'] = chapter_info\n CHAPTERS.append(chapter) ",
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 328
},
{
"cell_type": "code",
"collapsed": false,
"input": "# export to html for review\n!rm -rf html/*\n!unzip bootstrap.zip\n!mv bootstrap html/\n\nfor C in CHAPTERS: \n _c = C['index'].replace('.', '_')\n basedir = 'html/'\n os.mkdir(basedir + _c)\n for T in C['topics']:\n htmlname = basedir + '%s/%s.html' % (_c, T['id'])\n o = open(htmlname, 'w')\n o.write(str(T['html'].encode('utf-8')))\n o.close()",
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": "Archive: bootstrap.zip\r\n creating: bootstrap/\r\n creating: bootstrap/css/\r\n inflating: bootstrap/css/bootstrap-responsive.css \r\n inflating: bootstrap/css/bootstrap-responsive.min.css \r\n inflating: bootstrap/css/bootstrap.css \r\n inflating: bootstrap/css/bootstrap.min.css \r\n creating: bootstrap/img/\r\n inflating: bootstrap/img/glyphicons-halflings-white.png \r\n inflating: bootstrap/img/glyphicons-halflings.png \r\n creating: bootstrap/js/\r\n inflating: bootstrap/js/bootstrap.js \r\n inflating: bootstrap/js/bootstrap.min.js \r\n"
}
],
"prompt_number": 334
},
{
"cell_type": "code",
"collapsed": false,
"input": "",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment