Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jerielizabeth/c1ccd516681bf311630533be2bdb23d8 to your computer and use it in GitHub Desktop.
Save jerielizabeth/c1ccd516681bf311630533be2bdb23d8 to your computer and use it in GitHub Desktop.
drafts/code/ocr_evaluation_and_correction/2017-01-08-Visualize-Corpus-OCR-Statistics.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Code: Generate Visualizations"
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-09T13:29:06.775376",
"start_time": "2017-01-09T13:29:05.640315"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "import pandas as pd\nfrom bokeh.charts import Histogram, Bar, Scatter\nfrom bokeh.plotting import figure, output_file, save\nfrom bokeh.models import HoverTool, Range1d, NumeralTickFormatter\nfrom bokeh.io import output_notebook\nfrom bokeh.layouts import gridplot\nfrom datetime import date\n\n# Default output target; save() calls that pass their own filename override it.\noutput_file(\"{}-plot.html\".format(str(date.today())), title='Bokeh Plot', mode='cdn')\n\n# Column labels for the per-document statistics DataFrame.\nSTATS_COLUMNS = [\"doc_id\", \"error_rate\", \"num_tokens\", \"num_errors\"]\n\n# Axis range shared by both charts (error rates are presumably fractions in [0, 1]);\n# the slightly negative lower bound leaves a margin below zero.\nERROR_RATE_RANGE = (-0.01, 1)\n\n\ndef stats_to_df(corpus_statistics):\n    \"\"\"Build the per-document statistics DataFrame.\n\n    Args:\n        corpus_statistics: Records in any form accepted by ``pandas.DataFrame``.\n\n    Returns:\n        pandas.DataFrame with the columns named in ``STATS_COLUMNS``.\n    \"\"\"\n    return pd.DataFrame(corpus_statistics, columns=STATS_COLUMNS)\n\n\ndef chart_error_rate_distribution(df, title, color='lime'):\n    \"\"\"Chart the distribution of error rates as a histogram.\n\n    Args:\n        df: DataFrame with an ``error_rate`` column; null rates are dropped before plotting.\n        title: Label interpolated into the chart title.\n        color: Bar color handed to ``Histogram``.\n\n    Returns:\n        The bokeh chart, with its x-range set to ``ERROR_RATE_RANGE``.\n    \"\"\"\n    rated = df[pd.notnull(df['error_rate'])]\n\n    p = Histogram(rated,\n                  values='error_rate',\n                  color=color,\n                  title=\"Distribution of error rates for {}\".format(title))\n    p.x_range = Range1d(*ERROR_RATE_RANGE)\n    return p\n\n\ndef chart_error_rate_per_doc(df, title, plot_width=900, plot_height=1000):\n    \"\"\"Chart each document's error rate as a scatter plot ordered by ``doc_id``.\n\n    Args:\n        df: DataFrame with ``doc_id`` and ``error_rate`` columns.\n        title: Label interpolated into the chart title.\n        plot_width: Chart width handed to ``Scatter``.\n        plot_height: Chart height handed to ``Scatter``.\n\n    Returns:\n        The bokeh chart, with its y-range set to ``ERROR_RATE_RANGE``, three-decimal\n        y tick labels and a hidden x axis.\n    \"\"\"\n    ordered = df.sort_values(by='doc_id')\n\n    tooltips = [\n        (\"doc_id\", \"@doc_id\"),\n        (\"error_rate\", \"@error_rate\"),\n    ]\n    p = Scatter(ordered, x=\"doc_id\", y=\"error_rate\",\n                ylabel=\"Error Rate\",\n                plot_width=plot_width, plot_height=plot_height,\n                tooltips=tooltips,\n                title=\"Error rate per document for {}\".format(title))\n    p.y_range = Range1d(*ERROR_RATE_RANGE)\n    p.yaxis[0].formatter = NumeralTickFormatter(format=\"0.000\")\n    # The x axis is hidden; document ids stay reachable through the hover tooltips.\n    p.xaxis.visible = False\n    return p",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-09T13:29:06.787950",
"start_time": "2017-01-09T13:29:06.777036"
},
"trusted": true,
"collapsed": true
},
"cell_type": "code",
"source": "import os\nfrom os import listdir\nfrom os.path import isfile, join\nimport ijson\nimport json\n\n# Directory holding the corpus statistics JSON files. Defaults to the original\n# author's local path; set CORPUS_STATS_DIR to run the notebook on another machine.\ninput_dir = os.environ.get(\n    \"CORPUS_STATS_DIR\",\n    \"/Users/jeriwieringa/Dissertation/drafts/data/corpus-statistics/2017-01-08\",\n)",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-09T13:29:06.801897",
"start_time": "2017-01-09T13:29:06.793415"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# Materialize as a list (not a one-shot generator) so the cell that iterates over it\n# can be re-run without re-running this one, and sort case-insensitively so the\n# processing order does not depend on the filesystem's directory listing order.\ncorpus = sorted(\n    (f for f in listdir(input_dir)\n     if not f.startswith('.') and isfile(join(input_dir, f))),\n    key=str.lower,\n)",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-09T13:31:38.720815",
"start_time": "2017-01-09T13:29:06.805681"
},
"scrolled": false,
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "from os import makedirs\n\n# save() writes straight to the given path, so the target directory must exist first.\nmakedirs('visualizations', exist_ok=True)\n\n# Resolve the date stamp once so every file written by this run shares it.\nrun_date = str(date.today())\n\nfor document in corpus:\n    # The title code is the part of the file name before the first \"-\".\n    title = document.split(\"-\")[0]\n    print(\"\\nStatistics for {}\".format(title))\n\n    with open(join(input_dir, document), encoding=\"utf-8\") as f:\n        data = json.load(f)\n\n    df = stats_to_df(data[\"reports\"])\n\n    hist = chart_error_rate_distribution(df, title)\n    scatter = chart_error_rate_per_doc(df, title)\n\n    save(hist,\n         filename='visualizations/{}-{}-error-rate-distribution.html'.format(run_date, title),\n         title=\"Distribution of error rates for {}\".format(title))\n    save(scatter,\n         filename='visualizations/{}-{}-error-rate-per-document.html'.format(run_date, title),\n         title=\"Error rate per doc for {}\".format(title))",
"execution_count": 4,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "\nStatistics for ADV\n\nStatistics for AmSn\n\nStatistics for ARAI\n\nStatistics for CE\n\nStatistics for CUV\n\nStatistics for EDU\n\nStatistics for GCB\n\nStatistics for GH\n\nStatistics for GOH\n\nStatistics for GS\n\nStatistics for HM\n\nStatistics for HR\n\nStatistics for IR\n\nStatistics for LB\n\nStatistics for LH\n\nStatistics for LibM\n\nStatistics for LUH\n\nStatistics for NMN\n\nStatistics for PHJ\n\nStatistics for PTAR\n\nStatistics for PUR\n\nStatistics for RH\n\nStatistics for Sligo\n\nStatistics for SOL\n\nStatistics for ST\n\nStatistics for SUW\n\nStatistics for TCOG\n\nStatistics for TMM\n\nStatistics for WMH\n\nStatistics for YI\n"
}
]
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2017-01-09T13:31:40.010586",
"start_time": "2017-01-09T13:31:38.722777"
},
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "# %load ../shared_elements/system_info.py\nimport sys\n\nimport IPython\n\nprint(IPython.sys_info())\n# Run pip through the kernel's own interpreter so the listing describes this\n# environment rather than whichever pip is first on PATH.\n!\"{sys.executable}\" -m pip freeze",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "{'commit_hash': '5c9c918',\n 'commit_source': 'installation',\n 'default_encoding': 'UTF-8',\n 'ipython_path': '/Users/jeriwieringa/miniconda3/envs/dissertation2/lib/python3.5/site-packages/IPython',\n 'ipython_version': '5.1.0',\n 'os_name': 'posix',\n 'platform': 'Darwin-16.3.0-x86_64-i386-64bit',\n 'sys_executable': '/Users/jeriwieringa/miniconda3/envs/dissertation2/bin/python',\n 'sys_platform': 'darwin',\n 'sys_version': '3.5.2 |Continuum Analytics, Inc.| (default, Jul 2 2016, '\n '17:52:12) \\n'\n '[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]'}\nanaconda-client==1.5.5\nappnope==0.1.0\nargh==0.26.1\nblinker==1.4\nbokeh==0.12.4\nboto==2.43.0\nbz2file==0.98\nchest==0.2.3\ncloudpickle==0.2.1\nclyent==1.2.2\ndask==0.12.0\ndatashader==0.4.0\ndatashape==0.5.2\ndecorator==4.0.10\ndocutils==0.12\ndoit==0.29.0\ngensim==0.12.4\nGhost.py==0.2.3\nghp-import2==1.0.1\ngspread==0.4.1\nHeapDict==1.0.0\nhttplib2==0.9.2\nhusl==4.0.3\nijson==2.3\nipykernel==4.5.2\nipython==5.1.0\nipython-genutils==0.1.0\nipywidgets==5.2.2\nJinja2==2.8\njsonschema==2.5.1\njupyter==1.0.0\njupyter-client==4.4.0\njupyter-console==5.0.0\njupyter-contrib-core==0.3.0\njupyter-contrib-nbextensions==0.2.2\njupyter-core==4.2.1\njupyter-highlight-selected-word==0.0.5\njupyter-latex-envs==1.3.5.4\njupyter-nbextensions-configurator==0.2.3\nllvmlite==0.14.0\nlocket==0.2.0\nLogbook==1.0.0\nlxml==3.5.0\nMacFSEvents==0.7\nMako==1.0.4\nMarkdown==2.6.7\nMarkupSafe==0.23\nmemory-profiler==0.41\nmistune==0.7.3\nmultipledispatch==0.4.9\nnatsort==4.0.4\nnb-anacondacloud==1.2.0\nnb-conda==2.0.0\nnb-conda-kernels==2.0.0\nnb-config-manager==0.1.3\nnbbrowserpdf==0.2.1\nnbconvert==4.2.0\nnbformat==4.2.0\nnbpresent==3.0.2\nnetworkx==1.11\nNikola==7.7.7\nnltk==3.2.1\nnotebook==4.2.3\nnumba==0.29.0\nnumpy==1.11.2\noauth2client==4.0.0\nodo==0.5.0\npandas==0.19.2\npartd==0.3.6\npath.py==0.0.0\npathtools==0.1.2\npexpect==4.0.1\npickleshare==0.7.4\nPillow==3.4.2\nprompt-toolkit==1.0.9\npsutil==4.3.0\nptyprocess=
=0.5.1\npyasn1==0.1.9\npyasn1-modules==0.0.8\npycrypto==2.6.1\nPygments==2.1.3\nPyPDF2==1.25.1\nPyRSS2Gen==1.1\npyshp==1.2.10\npython-dateutil==2.6.0\npytz==2016.10\nPyYAML==3.12\npyzmq==16.0.2\nqtconsole==4.2.1\nrequests==2.12.3\nrsa==3.4.2\nscipy==0.18.1\nsimplegeneric==0.8.1\nsix==1.10.0\nsmart-open==1.3.5\nterminado==0.6\ntextblob==0.11.1\ntoolz==0.8.1\ntornado==4.4.2\ntraitlets==4.3.1\nUnidecode==0.4.19\nwatchdog==0.8.3\nwcwidth==0.1.7\nwebassets==0.11.1\nwidgetsnbextension==1.2.6\nws4py==0.3.4\nxarray==0.8.2\nYapsy==1.11.223\n"
}
]
}
],
"metadata": {
"language_info": {
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2",
"file_extension": ".py",
"name": "python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"mimetype": "text/x-python"
},
"kernelspec": {
"name": "conda-env-dissertation2-py",
"display_name": "Python [conda env:dissertation2]",
"language": "python"
},
"anaconda-cloud": {},
"gist": {
"id": "",
"data": {
"description": "drafts/code/ocr_evaluation_and_correction/2017-01-08-Visualize-Corpus-OCR-Statistics.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment