Created
March 6, 2013 07:13
-
-
Save richstoner/5097346 to your computer and use it in GitHub Desktop.
Guidelines parsing example
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "full_parse_nb" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# resulting data structure\n# list of chapters\n# each chapter is a dict\n# chapter name\n# chapter number \n# topic list -> array of topics\n# each topic is dict\n# \n# topic\n# topic name\n# topic id\n# preformatted html string\n# unformatted other content", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 331 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# Rich Stoner, 2013", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 332 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import os, textwrap\nimport csv\nimport re\nfrom IPython.core.display import HTML\nimport os", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 321 | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "### previous sqlite table\n td['tid'] = int(row[18])\n td['chapter'] = int(row[13])\n td['name'] = row[19]\n td['synonyms'] = row[20]\n td['def2'] = row[21]\n td['depi'] = row[22]\n td['deti'] = row[23]\n td['ccsi'] = row[24]\n td['ccf'] = row[25]\n td['morph'] = row[26]\n td['phe'] = row[27]\n td['gen'] = row[28]\n td['prog'] = row[29]\n td['crit2'] = row[30]\n td['critdd'] = row[31]\n td['vari'] = row[32]\n td['lab'] = row[33]\n td['ref'] = row[34]\n td['key'] = row[35]" | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "### build new tables " | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
# Human-readable section headings; getParsedArrays indexes this list with the
# position of the matching *_svg column in svg_fields.
ff = '''
Definition,
Synonym(s),
Epidemiology,
Etiology,
Sites of Involvement,
Clinical Features,
Morphology,
Immunophenotype,
Genetics,
Prognosis,
Criteria'''.split(',')
fullnames = [heading.strip() for heading in ff]

# Column names of the CSV export, in column order (rows are indexed with
# all_fields.index(<name>) throughout the notebook).
t = '''definition_svg,
synonym_svg,
epidemiology_svg,
etiology_svg,
sites_svg,
clinical_svg,
morphology_svg,
immmunophenotype_svg,
genetics_svg,
prognosis_svg,
criteria_svg,
diagnosis_summary,
key_points,
reference_page,
references,
url,
synonym,
definition,
epidemiology,
etiology,
sites,
clinical,
morphology,
immunophenotype,
genetics,
prognosis,
criteria,
ddx,
labtest,
variants,
tags,
check,
found,
chapter,
page,
chapter_name,
section,
topic_number,
topic_name,
topic_summ,
topic_summ_template'''

all_fields = [column.strip() for column in t.split(',')]

# Every column whose name contains 'svg', in column order.
svg_fields = [column for column in all_fields if 'svg' in column]
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 330 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
# Header string for html formatting: the fixed opening of every generated page.
# It contains the doctype, the opening <html> and <head> tags, the two
# bootstrap stylesheet links (relative to the parent directory) and one inline
# <style> block for the body / .topic-container / .subtopic-container /
# heading rules.
# The string ends right after that </style>; <head> and <html> are not closed
# here, so whatever is concatenated after it follows the inline style block.
headerstr = '''

<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html lang="en">
 <head>
 <meta name="generator" content="HTML Tidy for Mac OS X (vers 31 October 2006 - Apple Inc. build 15.10), see www.w3.org">
 <link rel="stylesheet" type="text/css" href="../bootstrap/css/bootstrap.css">
 <link rel="stylesheet" type="text/css" href="../bootstrap/css/bootstrap-responsive.css">

<style type="text/css">

 body {
 background-color: #eee;
 /* font-size: 80%;*/
 }

 .topic-container {
/* margin: 20px auto;*/
 margin:0px;
 background-color: #eee;
 padding: 5px;
 padding-left:10px;
 padding-right:10px;
 border-radius: 0px;
 }

 .subtopic-container {
 margin: 5px;
 margin-bottom: 20px;
 padding: 2px 20px 20px 20px;
 background-color: #fff;
 border-radius: 5px;
 box-shadow: 0px 4px 10px #999;
 }

 h1,h2,h3,h4,h5
 {
 font-family: "HelveticaNeue-CondensedBold";
 }

 </style>'''
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 333 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
from BeautifulSoup import BeautifulSoup


def getParsedArrays(xmlString, _i, _d):
    '''Turn one markup fragment into a CSS block and an HTML block.

    Every <style> element found in xmlString becomes a CSS rule named
    .svg-<n>-<_i>-<_d>, where n is the position of that <style> element.
    Every <span> element is re-emitted as <span class="svg-<n>-<_i>-<_d>">,
    where n is the position of that <span>. The n-th rule therefore targets
    the n-th span.

    xmlString -- markup handed to BeautifulSoup
    _i -- index into fullnames selecting the <h3> heading; also the second
          number in the generated class names
    _d -- third number in the generated class names

    Returns [style_html, body_html]. Both are '' when the fragment contains
    no <span> element, even if it contained <style> elements.
    '''
    soup = BeautifulSoup(xmlString)

    # --- styles -----------------------------------------------------------
    stylestring = ''
    styleList = soup.findAll('style')
    if styleList:
        rules = ['<style type="text/css">\n']
        for n, styleTag in enumerate(styleList):
            # An empty <style></style> has no children; skip it rather than
            # letting contents[0] raise IndexError and abort the whole export.
            # enumerate still advances, so later rules keep their numbering.
            if not styleTag.contents:
                continue

            # Only the text before the first comma is kept.
            cssContent = styleTag.contents[0].split(',')[0]
            for unwanted in ('font-family: \'Verdana\';',
                             'font-size: 16px;',
                             'font-size: 8px;'):
                cssContent = cssContent.replace(unwanted, '')

            # Rules that are (nearly) empty after the clean-up are dropped.
            if len(cssContent) > 2:
                cssClassName = '.svg-%s-%d-%d ' % (str(n), _i, _d)
                rules.append(cssClassName + '{' + cssContent + '}\n')

        # The double quotes are stripped from everything accumulated so far,
        # which includes the opening tag (it is emitted as
        # <style type=text/css>), exactly as before.
        stylestring = ''.join(rules).replace('"', '') + '</style>'

    # --- content ----------------------------------------------------------
    spanlist = soup.findAll('span')
    if not spanlist:
        return ['', '']

    pieces = ['<div class="subtopic-container"><h3>%s</h3>' % fullnames[_i]]
    for n, spanTag in enumerate(spanlist):
        inner = ''.join(str(child) for child in spanTag.contents)
        pieces.append('<span class="svg-%s-%d-%d">%s</span>' % (str(n), _i, _d, inner))
    pieces.append('</div>')

    # Tabs become single spaces.
    # NOTE(review): the original literal is a raw tab character sitting next to
    # a comment reading "In ASCII, <tab> is a tab", which suggests the author
    # may have typed a character reference for the tab that was decoded before
    # it reached this file -- confirm against the original notebook.
    bodystr = ''.join(pieces).replace('\t', ' ')

    return [stylestring, bodystr]
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 325 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
# Build the list of [chapter, chapter_name] pairs, one per CSV row, and print
# the topic name of every row whose chapter column is empty.
pv_file = 'fullexport.csv'

def getChapterList(csv_input):
    '''Read csv_input and return one [chapter, chapter_name] pair per row.

    csv_input -- path of a comma-delimited file whose columns follow all_fields

    As a side effect, the topic_name of every row with an empty chapter
    column is printed, so rows that lack a chapter can be spotted.
    '''
    # Named lookups instead of the literal positions 33 and 38, which are
    # exactly where 'chapter' and 'topic_name' sit in all_fields.
    chapter_index = all_fields.index('chapter')
    chapter_name_index = all_fields.index('chapter_name')
    topic_name_index = all_fields.index('topic_name')

    chapter_list = []

    # `with` closes the file handle even if a row is malformed.
    with open(csv_input, 'r') as csv_file:
        for row in csv.reader(csv_file, delimiter=','):
            chapter_list.append([row[chapter_index], row[chapter_name_index]])

            if row[chapter_index] == '':
                # Single-argument call form prints the same text under the
                # Python 2 print statement and the Python 3 print function.
                print(row[topic_name_index])

    return chapter_list

chapterList = getChapterList(pv_file)
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Chronic myeloid leukemia, BCR-ABL1-positive (CML)" | |
}, | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "\nMyelodysplastic Syndrome (General)\nPrimary Cutaneous Marginal Zone Lymphoma (MZL)\nLangerhans cell histiocytosis (LCH)\nLangerhans Cell Sarcoma (LCS)\nInterdigitating Dendritic Cell Sarcoma (IDCS)\nFibroblastic Reticular Cell Tumor (FRCT)\nIndeterminate Dendritic Cell Tumor (IDCT)\n" | |
} | |
], | |
"prompt_number": 327 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
# rebuild cleanly formatted chapter list
chapter_index = all_fields.index('chapter')
chapter_name_index = all_fields.index('chapter_name')
topic_name_index = all_fields.index('topic_name')

# Chapter codes and their display titles, paired by position.
chaps = ['02', '03', '04', '05', '06', '07', '09', '10.1', '10.2', '11', '12', '13', '14']

titles = [
    'Myeloproliferative Neoplasms (MPN)',
    'Myeloid and Lymphoid Neoplasms with Eosinophilia and Abnormalities of PDGFRA, PDGFRB or FGFR1',
    'Myelodysplastic-Myeloproliferative neoplasms (MDS/MPN)',
    'Myelodysplastic syndromes (MDS)',
    'Acute myeloid leukemia (AML) and related precursor neoplasms',
    'Acute leukemias of ambiguous lineage',
    'Precursor Lymphoid Neoplasms',
    'Mature B-cell Neoplasms',
    'Plasma cell Neoplasms',
    'Mature T- and NK-cell neoplasms',
    'Hodgkin Lymphoma',
    'Immunodeficiency-associated lymphoproliferative disorders',
    'Histiocytic and dendritic cell neoplasms',
]

CHAPTERS = []


def buildTopicDict(row, i):
    '''Generate all of the topic metadata from a CSV row.

    row -- one row of the CSV export, indexed by position in all_fields
    i -- integer forwarded to getParsedArrays, which puts it into the
         generated CSS class names (the loop below passes the chapter's
         position in chaps)

    Returns a dict with the keys 'name', 'id', 'section', 'raw' (the plain
    text columns keyed by field name) and 'html' (headerstr, the collected
    style blocks and the <body> markup concatenated into one page).
    '''
    td = {}
    td['name'] = row[all_fields.index('topic_name')]
    td['id'] = row[all_fields.index('topic_number')]
    td['section'] = row[all_fields.index('section')]

    raw_list = ['definition', 'synonym', 'topic_name', 'epidemiology', 'etiology', 'sites', 'clinical', 'immunophenotype', 'genetics', 'prognosis', 'morphology', 'criteria', 'ddx', 'variants', 'labtest']
    td['raw'] = dict((field, row[all_fields.index(field)]) for field in raw_list)

    # One subtopic container per *_svg column; n selects the heading in
    # getParsedArrays and keeps the generated class names distinct per column.
    style_html = ''
    body_html = '<div class="topic-container"><h1>%s</h1>' % (td['name'])
    for n, svg_field in enumerate(svg_fields):
        partial_style, partial_body = getParsedArrays(row[all_fields.index(svg_field)], n, i)
        style_html += partial_style
        body_html += partial_body
    body_html += '</div>'

    formatted_html = headerstr
    formatted_html += style_html
    formatted_html += '<body><div class="container"><div class="row"><div class="span12">'
    # NOTE(review): presumably body_html is a UTF-8 byte string (Python 2 str)
    # at this point, hence the decode before it joins the page -- confirm.
    formatted_html += body_html.decode('utf-8')
    formatted_html += '</div></div></div></body>'

    td['html'] = formatted_html

    return td


# Read the export once and close the handle, instead of reopening the file
# for every chapter and never closing it.
with open(pv_file, 'r') as csv_file:
    all_rows = list(csv.reader(csv_file, delimiter=','))

# compile chapters and topic into one large data object
for n, c in enumerate(chaps):

    chapter = {}
    chapter['title'] = titles[n]
    chapter['index'] = c

    chapter_info = []

    for row in all_rows:

        # NOTE(review): this is a substring test of the chapter code against
        # the chapter_name column -- confirm a code can never match inside an
        # unrelated chapter name.
        if c in row[chapter_name_index]:
            chapter_info.append(buildTopicDict(row, n))

        # Special cases, matched by topic name and assigned to a chapter by hand.
        elif c == '02' and 'CML' in row[topic_name_index]:
            chapter_info.append(buildTopicDict(row, n))

        elif c == '05' and 'Myelodysplastic Syndrome (General)' == row[topic_name_index]:
            chapter_info.append(buildTopicDict(row, n))

        # add additional special cases here

    chapter['topics'] = chapter_info
    CHAPTERS.append(chapter)
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 328 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "# export to html for review\n!rm -rf html/*\n!unzip bootstrap.zip\n!mv bootstrap html/\n\nfor C in CHAPTERS: \n _c = C['index'].replace('.', '_')\n basedir = 'html/'\n os.mkdir(basedir + _c)\n for T in C['topics']:\n htmlname = basedir + '%s/%s.html' % (_c, T['id'])\n o = open(htmlname, 'w')\n o.write(str(T['html'].encode('utf-8')))\n o.close()", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Archive: bootstrap.zip\r\n creating: bootstrap/\r\n creating: bootstrap/css/\r\n inflating: bootstrap/css/bootstrap-responsive.css \r\n inflating: bootstrap/css/bootstrap-responsive.min.css \r\n inflating: bootstrap/css/bootstrap.css \r\n inflating: bootstrap/css/bootstrap.min.css \r\n creating: bootstrap/img/\r\n inflating: bootstrap/img/glyphicons-halflings-white.png \r\n inflating: bootstrap/img/glyphicons-halflings.png \r\n creating: bootstrap/js/\r\n inflating: bootstrap/js/bootstrap.js \r\n inflating: bootstrap/js/bootstrap.min.js \r\n" | |
} | |
], | |
"prompt_number": 334 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment