{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Code: Compute Corpus Statistics"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:58.363395",
"end_time": "2017-01-13T08:55:59.885535"
},
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "from collections import Counter\nfrom os import listdir\nfrom os.path import isfile, join\nimport json\nfrom datetime import date\nimport nltk\nfrom nltk.corpus import words\nfrom nltk import word_tokenize\nimport re\nimport numpy as np\n\ndef readfile(input_dir, filename):\n    with open(join(input_dir, filename)) as f:\n        return(f.read())\n\n# Function requires wordlists to be list, even if only using one file.\ndef create_spelling_dictionary(wordlists, directory):\n    spelling_dictionary = []\n    for wordlist in wordlists:\n        words = readfile(directory, wordlist).splitlines()\n        word_list = [w.lower() for w in words]\n        for each in word_list:\n            spelling_dictionary.append(each)\n    return(list(set(spelling_dictionary)))\n\ndef strip_punct(text):\n    text_cleaned = re.sub(r\"[0-9,.!?$:;&]\", \" \", text)\n    return(text_cleaned)\n\ndef tokenize_text(text):\n    return(word_tokenize(text))\n\ndef to_lower(tokens):\n    return([w.lower() for w in tokens])\n\ndef identify_errors(tokens, dictionary):\n    return(set(tokens)-set(dictionary))\n\ndef get_error_stats(errors):\n    freq_distribution = nltk.FreqDist(errors)\n\n    error_report = {}\n    for error in list(errors):\n        error_count = freq_distribution[error]\n        error_report.update({error:error_count})\n\n    return(error_report)\n\ndef total_errors(error_report):\n    return(sum(error_report.values()))\n\ndef error_rate(error_total, tokens):\n    if len(tokens) > 0:\n        return(float(\"{0:.3f}\".format(error_total/len(tokens))))\n    else:\n        return(np.nan)\n\ndef generate_doc_report(text, spelling_dictionary):\n    text = strip_punct(text)\n    tokens = tokenize_text(text)\n    tokens = to_lower(tokens)\n    errors = identify_errors(tokens, spelling_dictionary)\n    error_report = get_error_stats(errors)\n    error_total = total_errors(error_report)\n    rate = error_rate(error_total, tokens)\n    return({'num_tokens': len(tokens),\n            'num_unique_tokens': len(set(tokens)),\n            'num_errors': error_total,\n            'error_rate': rate,\n            'errors': error_report})\n\n# Function for taking a directory and a wordlist and reporting back errors and general statistics for each doc.\ndef process_directory(directory, wordlists, wordlist_dir):\n\n    # load up a list of the corpus documents\n    corpus = (f for f in listdir(directory) if not f.startswith('.') and isfile(join(directory, f)))\n\n    # Compile the spelling dictionary from word lists\n    spelling_dictionary = create_spelling_dictionary(wordlists, wordlist_dir)\n\n    statistics = []\n    for document in corpus:\n        content = readfile(directory, document)\n        stats = generate_doc_report(content, spelling_dictionary)\n        stats.update({\"doc_id\": document})\n        statistics.append(stats)\n\n    return(statistics)\n\ndef get_corpus_data( input_dir, title ):\n    corpus_data = process_directory( join(input_dir, title), wordlists, wordlist_dir )\n    return( corpus_data )",
"execution_count": 1,
"outputs": []
},
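{
"metadata": {},
"cell_type": "markdown",
"source": "A minimal sketch of how `generate_doc_report` behaves: the toy word list and sample string in the next cell are hypothetical and are used only to illustrate the shape of the report (token counts, tokens not found in the dictionary, and the resulting error rate)."
},
{
"metadata": {},
"cell_type": "code",
"source": "# Minimal sketch, assuming the functions above have been run and the NLTK punkt\n# tokenizer is available. The toy word list and sample text are hypothetical.\nsample_dictionary = [\"the\", \"quick\", \"brown\", \"fox\", \"jumps\", \"over\", \"lazy\", \"dog\"]\nsample_text = \"The qu1ck brown fox jumps ovr the lazy dog.\"\n# Expected shape: 10 tokens, 3 tokens flagged as errors ('qu', 'ck', 'ovr'), error_rate 0.3.\ngenerate_doc_report(sample_text, sample_dictionary)",
"execution_count": null,
"outputs": []
},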
{
"metadata": {},
"cell_type": "markdown",
"source": "# Setup"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:59.887109",
"end_time": "2017-01-13T08:55:59.891084"
},
"collapsed": true,
"trusted": true
},
"cell_type": "code",
"source": "titles = [\"ADV\", \"AmSn\", \"ARAI\", \"CE\",\n          \"CUV\", \"EDU\", \"GCB\", \"GH\",\n          \"GOH\", \"GS\", \"HM\", \"HR\",\n          \"IR\", \"LB\", \"LH\", \"LibM\",\n          \"LUH\", \"NMN\", \"PHJ\", \"PTAR\",\n          \"PUR\", \"RH\", \"Sligo\", \"SOL\",\n          \"ST\", \"SUW\", \"TCOG\", \"TMM\",\n          \"WMH\", \"YI\"]",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:59.893151",
"end_time": "2017-01-13T08:55:59.897615"
},
"collapsed": true,
"trusted": true
},
"cell_type": "code",
"source": "input_dir = \"/Users/jeriwieringa/Dissertation/text/text/2017-01-06-corpus-with-utf8-split-into-titles\"\nwordlists = [\"2016-12-07-SDA-last-names.txt\",\n             \"2016-12-07-SDA-place-names.txt\",\n             \"2016-12-08-SDA-Vocabulary.txt\",\n             \"2017-01-03-place-names.txt\",\n             \"2016-12-06-First-Word-List.txt\"]\nwordlist_dir = \"/Users/jeriwieringa/Dissertation/drafts/data/word-lists\"\nout_dir = \"/Users/jeriwieringa/Dissertation/drafts/data/corpus-statistics/2017-01-08\"",
"execution_count": 3,
"outputs": []
},
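{
"metadata": {},
"cell_type": "markdown",
"source": "A quick sanity-check sketch: the word lists configured above can be compiled once with `create_spelling_dictionary` and the size of the combined dictionary inspected before processing the full corpus. This assumes the `wordlist_dir` path and the word-list files exist on this machine."
},
{
"metadata": {},
"cell_type": "code",
"source": "# Sketch (assumes wordlist_dir and the word-list files above are present):\n# compile the spelling dictionary once and report how many unique lower-cased\n# words it contains, to confirm the lists loaded as expected.\nspelling_dictionary = create_spelling_dictionary(wordlists, wordlist_dir)\nprint(\"Words in combined spelling dictionary: {}\".format(len(spelling_dictionary)))",
"execution_count": null,
"outputs": []
},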
{
"metadata": {},
"cell_type": "markdown",
"source": "# Generate Data"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:59.900116",
"end_time": "2017-01-13T08:55:59.908780"
},
"scrolled": false,
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "def process_title(title, input_dir):\n    corpus_data = {}\n\n    print(\"\\nProcessing: {}\".format(title))\n    data = get_corpus_data(input_dir, title)\n\n    corpus_data.update({\"reports\": data})\n    yield corpus_data",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:59.910887",
"end_time": "2017-01-13T10:59:07.409688"
},
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "import time\n\nfor title in titles:\n    outfile = \"{}-all-corpora-data.json\".format(title)\n    with open(join(out_dir, outfile), \"w\", encoding='utf-8') as o:\n        start = time.time()\n        all_data = process_title(title, input_dir)\n        for data in all_data:\n            json.dump(data, o)\n        end = time.time()\n        print((end-start)/60)\n        o.close()",
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": "\nProcessing: ADV\n1.4443191488583882\n\nProcessing: AmSn\n4.08760058482488\n\nProcessing: ARAI\n0.07371393044789633\n\nProcessing: CE\n2.1874577522277834\n\nProcessing: CUV\n3.9534137805302936\n\nProcessing: EDU\n0.2939378341039022\n\nProcessing: GCB\n2.774425919850667\n\nProcessing: GH\n1.2899338324864706\n\nProcessing: GOH\n0.41394771734873453\n\nProcessing: GS\n0.39861186742782595\n\nProcessing: HM\n1.5764355142911275\n\nProcessing: HR\n12.324441564083099\n\nProcessing: IR\n0.8139488180478414\n\nProcessing: LB\n5.083624283472697\n\nProcessing: LH\n5.361722048123678\n\nProcessing: LibM\n1.7180917501449584\n\nProcessing: LUH\n3.9271430810292562\n\nProcessing: NMN\n0.24592998425165813\n\nProcessing: PHJ\n2.608540717760722\n\nProcessing: PTAR\n0.14221384922663372\n\nProcessing: PUR\n3.9020716190338134\n\nProcessing: RH\n35.2963091691335\n\nProcessing: Sligo\n0.7368343830108642\n\nProcessing: SOL\n1.4018923163414\n\nProcessing: ST\n13.384035801887512\n\nProcessing: SUW\n3.6700123151143393\n\nProcessing: TCOG\n0.9790529688199361\n\nProcessing: TMM\n1.3713114182154338\n\nProcessing: WMH\n0.6506821990013123\n\nProcessing: YI\n11.000955033302308\n",
"name": "stdout"
}
]
},
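{
"metadata": {},
"cell_type": "markdown",
"source": "The per-title report files written above can be read back and summarized to compare OCR error rates across titles. The next cell is only a sketch of one way to do that with pandas (installed in this environment); it assumes the `*-all-corpora-data.json` files from the previous cell are present in `out_dir` and relies on the report keys produced by `generate_doc_report`."
},
{
"metadata": {},
"cell_type": "code",
"source": "# Sketch: load each title's report file and summarize per-document error rates.\n# Assumes the files written in the cell above exist in out_dir.\nimport pandas as pd\n\nframes = []\nfor title in titles:\n    with open(join(out_dir, \"{}-all-corpora-data.json\".format(title)), encoding='utf-8') as f:\n        reports = json.load(f)[\"reports\"]\n    frame = pd.DataFrame(reports)[[\"doc_id\", \"num_tokens\", \"num_errors\", \"error_rate\"]]\n    frame[\"title\"] = title\n    frames.append(frame)\n\nerror_rates = pd.concat(frames)\n# Distribution of per-document error rates for each periodical title.\nerror_rates.groupby(\"title\")[\"error_rate\"].describe()",
"execution_count": null,
"outputs": []
},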
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T10:59:07.411479",
"end_time": "2017-01-13T10:59:08.888726"
},
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# %load /Users/jeriwieringa/Dissertation/drafts/code/shared_elements/system_info.py\nimport IPython\nprint (IPython.sys_info())\n!pip freeze",
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": "{'commit_hash': '5c9c918',\n 'commit_source': 'installation',\n 'default_encoding': 'UTF-8',\n 'ipython_path': '/Users/jeriwieringa/miniconda3/envs/dissertation2/lib/python3.5/site-packages/IPython',\n 'ipython_version': '5.1.0',\n 'os_name': 'posix',\n 'platform': 'Darwin-16.3.0-x86_64-i386-64bit',\n 'sys_executable': '/Users/jeriwieringa/miniconda3/envs/dissertation2/bin/python',\n 'sys_platform': 'darwin',\n 'sys_version': '3.5.2 |Continuum Analytics, Inc.| (default, Jul 2 2016, '\n                '17:52:12) \\n'\n                '[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]'}\nanaconda-client==1.5.5\nappnope==0.1.0\nargh==0.26.1\nblinker==1.4\nbokeh==0.12.4\nboto==2.43.0\nbz2file==0.98\nchest==0.2.3\ncloudpickle==0.2.1\nclyent==1.2.2\ndask==0.12.0\ndatashader==0.4.0\ndatashape==0.5.2\ndecorator==4.0.10\ndocutils==0.12\ndoit==0.29.0\ngensim==0.12.4\nGhost.py==0.2.3\nghp-import2==1.0.1\ngspread==0.4.1\nHeapDict==1.0.0\nhttplib2==0.9.2\nhusl==4.0.3\nijson==2.3\nipykernel==4.5.2\nipython==5.1.0\nipython-genutils==0.1.0\nipywidgets==5.2.2\nJinja2==2.8\njsonschema==2.5.1\njupyter==1.0.0\njupyter-client==4.4.0\njupyter-console==5.0.0\njupyter-contrib-core==0.3.0\njupyter-contrib-nbextensions==0.2.2\njupyter-core==4.2.1\njupyter-highlight-selected-word==0.0.5\njupyter-latex-envs==1.3.5.4\njupyter-nbextensions-configurator==0.2.3\nllvmlite==0.14.0\nlocket==0.2.0\nLogbook==1.0.0\nlxml==3.5.0\nMacFSEvents==0.7\nMako==1.0.4\nMarkdown==2.6.7\nMarkupSafe==0.23\nmemory-profiler==0.43\nmistune==0.7.3\nmultipledispatch==0.4.9\nnatsort==4.0.4\nnb-anacondacloud==1.2.0\nnb-conda==2.0.0\nnb-conda-kernels==2.0.0\nnb-config-manager==0.1.3\nnbbrowserpdf==0.2.1\nnbconvert==4.2.0\nnbformat==4.2.0\nnbpresent==3.0.2\nnetworkx==1.11\nNikola==7.7.7\nnltk==3.2.2\nnotebook==4.2.3\nnumba==0.29.0\nnumpy==1.11.3\noauth2client==4.0.0\nodo==0.5.0\npandas==0.19.2\npartd==0.3.6\npath.py==0.0.0\npathtools==0.1.2\npexpect==4.0.1\npickleshare==0.7.4\nPillow==3.4.2\nprompt-toolkit==1.0.9\npsutil==4.3.0\nptyprocess==0.5.1\npyasn1==0.1.9\npyasn1-modules==0.0.8\npycrypto==2.6.1\nPygments==2.1.3\nPyPDF2==1.25.1\nPyRSS2Gen==1.1\npyshp==1.2.10\npython-dateutil==2.6.0\npytz==2016.10\nPyYAML==3.12\npyzmq==16.0.2\nqtconsole==4.2.1\nrequests==2.12.3\nrsa==3.4.2\nscipy==0.18.1\nsimplegeneric==0.8.1\nsix==1.10.0\nsmart-open==1.3.5\nterminado==0.6\ntextblob==0.11.1\ntoolz==0.8.1\ntornado==4.4.2\ntraitlets==4.3.1\nUnidecode==0.4.19\nverifyOCR==0.1\nwatchdog==0.8.3\nwcwidth==0.1.7\nwebassets==0.11.1\nwidgetsnbextension==1.2.6\nws4py==0.3.4\nxarray==0.8.2\nYapsy==1.11.223\n",
"name": "stdout"
}
]
},
{
"metadata": {
"collapsed": true,
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"toc": {
"sideBar": false,
"toc_cell": true,
"nav_menu": {
"width": "252px",
"height": "102px"
},
"number_sections": true,
"threshold": "3",
"navigate_menu": true,
"toc_section_display": "block",
"toc_window_display": true
},
"anaconda-cloud": {},
"_draft": {
"nbviewer_url": "https://gist.github.com/97eeac0bf83365af7fd00bc6a0151554"
},
"gist": {
"id": "97eeac0bf83365af7fd00bc6a0151554",
"data": {
"description": "drafts/code/ocr_evaluation_and_correction/2017-01-06-corpus-OCR-overview-data.ipynb update",
"public": true
}
},
"language_info": {
"pygments_lexer": "ipython3",
"file_extension": ".py",
"version": "3.5.2",
"mimetype": "text/x-python",
"name": "python",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"nbconvert_exporter": "python"
},
"kernelspec": {
"language": "python",
"display_name": "Python [conda env:dissertation2]",
"name": "conda-env-dissertation2-py"
}
},
"nbformat": 4,
"nbformat_minor": 2
}