{
"cells": [
{
"metadata": {},
"cell_type": "markdown",
"source": "# Code: Compute Corpus Statistics"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:58.363395",
"end_time": "2017-01-13T08:55:59.885535"
},
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "from collections import Counter\nfrom os import listdir\nfrom os.path import isfile, join\nimport json\nfrom datetime import date\nimport nltk\nfrom nltk.corpus import words\nfrom nltk import word_tokenize\nimport re\nimport numpy as np\n\ndef readfile(input_dir, filename):\n with open(join(input_dir, filename)) as f:\n return(f.read())\n\n# Function requires wordlists to be list, even if only using one file.\ndef create_spelling_dictionary(wordlists, directory):\n spelling_dictionary = []\n for wordlist in wordlists:\n words = readfile(directory, wordlist).splitlines()\n word_list = [w.lower() for w in words]\n for each in word_list:\n spelling_dictionary.append(each)\n return(list(set(spelling_dictionary)))\n\ndef strip_punct(text):\n text_cleaned = re.sub(r\"[0-9,.!?$:;&]\", \" \", text)\n return(text_cleaned)\n\ndef tokenize_text(text):\n return(word_tokenize(text))\n\ndef to_lower(tokens):\n return([w.lower() for w in tokens])\n\ndef identify_errors(tokens, dictionary):\n return(set(tokens)-set(dictionary))\n\ndef get_error_stats(errors):\n freq_distribution = nltk.FreqDist(errors) \n \n error_report = {}\n for error in list(errors):\n error_count = freq_distribution[error]\n error_report.update({error:error_count})\n \n return(error_report) \n\ndef total_errors(error_report):\n return(sum(error_report.values()))\n\ndef error_rate(error_total, tokens):\n if len(tokens) > 0:\n return(float(\"{0:.3f}\".format(error_total/len(tokens))))\n else:\n return(np.nan)\n \ndef generate_doc_report(text, spelling_dictionary):\n text = strip_punct(text)\n tokens = tokenize_text(text)\n tokens = to_lower(tokens)\n errors = identify_errors(tokens, spelling_dictionary)\n error_report = get_error_stats(errors)\n error_total = total_errors(error_report)\n rate = error_rate(error_total, tokens)\n return({'num_tokens': len(tokens),\n 'num_unique_tokens': len(set(tokens)),\n 'num_errors': error_total,\n 'error_rate': rate,\n 'errors': error_report})\n\n# Function for taking a directory and a wordlist and reporting back errors and general statistics for each doc.\ndef process_directory(directory, wordlists, wordlist_dir):\n\n # load up a list of the corpus documents\n corpus = (f for f in listdir(directory) if not f.startswith('.') and isfile(join(directory, f)))\n \n # Compile the spelling dictionary from word lists\n spelling_dictionary = create_spelling_dictionary(wordlists, wordlist_dir)\n \n statistics = []\n for document in corpus:\n content = readfile(directory, document)\n stats = generate_doc_report(content, spelling_dictionary)\n stats.update({\"doc_id\": document})\n statistics.append(stats)\n \n return(statistics) \n\ndef get_corpus_data( input_dir, title ):\n corpus_data = process_directory( join(input_dir, title), wordlists, wordlist_dir )\n return( corpus_data )",
"execution_count": 1,
"outputs": []
},
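{
"metadata": {},
"cell_type": "markdown",
"source": "*Added sketch:* a quick sanity check of `generate_doc_report` on a toy sentence and a small hypothetical dictionary, showing the shape of the report before running the full corpus. The sample text and word list are illustrative only and are not part of the original analysis."
},
{
"metadata": {},
"cell_type": "code",
"source": "# Illustrative only: a toy document and a hypothetical spelling dictionary.\n# nltk.download('punkt')  # uncomment if the punkt tokenizer data is missing\nsample_text = \"Tho cat sat on the mat. The cat!\"\nsample_dictionary = [\"the\", \"cat\", \"sat\", \"on\", \"mat\"]\ngenerate_doc_report(sample_text, sample_dictionary)\n# Expected: 8 tokens, one error ('tho'), error_rate 0.125",
"execution_count": null,
"outputs": []
},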
{
"metadata": {},
"cell_type": "markdown",
"source": "# Setup"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:59.887109",
"end_time": "2017-01-13T08:55:59.891084"
},
"collapsed": true,
"trusted": true
},
"cell_type": "code",
"source": "titles = [\"ADV\", \"AmSn\", \"ARAI\", \"CE\", \n \"CUV\", \"EDU\", \"GCB\", \"GH\", \n \"GOH\", \"GS\", \"HM\", \"HR\", \n \"IR\", \"LB\", \"LH\", \"LibM\", \n \"LUH\", \"NMN\",\"PHJ\",\"PTAR\",\n \"PUR\",\"RH\",\"Sligo\",\"SOL\",\n \"ST\",\"SUW\",\"TCOG\",\"TMM\",\n \"WMH\",\"YI\"]",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:59.893151",
"end_time": "2017-01-13T08:55:59.897615"
},
"collapsed": true,
"trusted": true
},
"cell_type": "code",
"source": "input_dir = \"/Users/jeriwieringa/Dissertation/text/text/2017-01-06-corpus-with-utf8-split-into-titles\"\nwordlists = [\"2016-12-07-SDA-last-names.txt\", \n \"2016-12-07-SDA-place-names.txt\", \n \"2016-12-08-SDA-Vocabulary.txt\", \n \"2017-01-03-place-names.txt\", \n \"2016-12-06-First-Word-List.txt\"]\nwordlist_dir = \"/Users/jeriwieringa/Dissertation/drafts/data/word-lists\"\nout_dir = \"/Users/jeriwieringa/Dissertation/drafts/data/corpus-statistics/2017-01-08\"",
"execution_count": 3,
"outputs": []
},
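{
"metadata": {},
"cell_type": "markdown",
"source": "*Added sketch:* an optional check that the word lists above combine into a single deduplicated spelling dictionary; it assumes the files listed in `wordlists` exist under `wordlist_dir`."
},
{
"metadata": {},
"cell_type": "code",
"source": "# Optional check; assumes the word-list files above exist locally.\nspelling_dictionary = create_spelling_dictionary(wordlists, wordlist_dir)\nprint(\"Dictionary size: {} unique words\".format(len(spelling_dictionary)))",
"execution_count": null,
"outputs": []
},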
{
"metadata": {},
"cell_type": "markdown",
"source": "# Generate Data"
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:59.900116",
"end_time": "2017-01-13T08:55:59.908780"
},
"scrolled": false,
"trusted": true,
"collapsed": false
},
"cell_type": "code",
"source": "def process_title(title, input_dir):\n corpus_data = {}\n\n print(\"\\nProcessing: {}\".format(title))\n data = get_corpus_data(input_dir, title)\n\n corpus_data.update({\"reports\": data})\n yield corpus_data",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T08:55:59.910887",
"end_time": "2017-01-13T10:59:07.409688"
},
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "import time\n\nfor title in titles:\n outfile = \"{}-all-corpora-data.json\".format(title)\n with open(join(out_dir, outfile), \"w\", encoding='utf-8') as o: \n start = time.time()\n all_data = process_title(title, input_dir)\n for data in all_data:\n json.dump(data, o)\n end = time.time()\n print((end-start)/60)\n o.close()",
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": "\nProcessing: ADV\n1.4443191488583882\n\nProcessing: AmSn\n4.08760058482488\n\nProcessing: ARAI\n0.07371393044789633\n\nProcessing: CE\n2.1874577522277834\n\nProcessing: CUV\n3.9534137805302936\n\nProcessing: EDU\n0.2939378341039022\n\nProcessing: GCB\n2.774425919850667\n\nProcessing: GH\n1.2899338324864706\n\nProcessing: GOH\n0.41394771734873453\n\nProcessing: GS\n0.39861186742782595\n\nProcessing: HM\n1.5764355142911275\n\nProcessing: HR\n12.324441564083099\n\nProcessing: IR\n0.8139488180478414\n\nProcessing: LB\n5.083624283472697\n\nProcessing: LH\n5.361722048123678\n\nProcessing: LibM\n1.7180917501449584\n\nProcessing: LUH\n3.9271430810292562\n\nProcessing: NMN\n0.24592998425165813\n\nProcessing: PHJ\n2.608540717760722\n\nProcessing: PTAR\n0.14221384922663372\n\nProcessing: PUR\n3.9020716190338134\n\nProcessing: RH\n35.2963091691335\n\nProcessing: Sligo\n0.7368343830108642\n\nProcessing: SOL\n1.4018923163414\n\nProcessing: ST\n13.384035801887512\n\nProcessing: SUW\n3.6700123151143393\n\nProcessing: TCOG\n0.9790529688199361\n\nProcessing: TMM\n1.3713114182154338\n\nProcessing: WMH\n0.6506821990013123\n\nProcessing: YI\n11.000955033302308\n",
"name": "stdout"
}
]
},
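{
"metadata": {},
"cell_type": "markdown",
"source": "*Added sketch:* each output file holds one JSON object with a `reports` list, one entry per document. The cell below shows how a file could be reloaded and summarized; the title `ADV` is used only as an example."
},
{
"metadata": {},
"cell_type": "code",
"source": "# Sketch: reload one title's report and compute a mean error rate.\nwith open(join(out_dir, \"ADV-all-corpora-data.json\"), encoding='utf-8') as f:\n    title_data = json.load(f)\n\nrates = [doc['error_rate'] for doc in title_data['reports']\n         if not np.isnan(doc['error_rate'])]\nprint(\"Documents: {}\".format(len(title_data['reports'])))\nprint(\"Mean error rate: {0:.3f}\".format(sum(rates)/len(rates)))",
"execution_count": null,
"outputs": []
},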
{
"metadata": {
"ExecuteTime": {
"start_time": "2017-01-13T10:59:07.411479",
"end_time": "2017-01-13T10:59:08.888726"
},
"collapsed": false,
"trusted": true
},
"cell_type": "code",
"source": "# %load /Users/jeriwieringa/Dissertation/drafts/code/shared_elements/system_info.py\nimport IPython\nprint (IPython.sys_info())\n!pip freeze",
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": "{'commit_hash': '5c9c918',\n 'commit_source': 'installation',\n 'default_encoding': 'UTF-8',\n 'ipython_path': '/Users/jeriwieringa/miniconda3/envs/dissertation2/lib/python3.5/site-packages/IPython',\n 'ipython_version': '5.1.0',\n 'os_name': 'posix',\n 'platform': 'Darwin-16.3.0-x86_64-i386-64bit',\n 'sys_executable': '/Users/jeriwieringa/miniconda3/envs/dissertation2/bin/python',\n 'sys_platform': 'darwin',\n 'sys_version': '3.5.2 |Continuum Analytics, Inc.| (default, Jul 2 2016, '\n '17:52:12) \\n'\n '[GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]'}\nanaconda-client==1.5.5\nappnope==0.1.0\nargh==0.26.1\nblinker==1.4\nbokeh==0.12.4\nboto==2.43.0\nbz2file==0.98\nchest==0.2.3\ncloudpickle==0.2.1\nclyent==1.2.2\ndask==0.12.0\ndatashader==0.4.0\ndatashape==0.5.2\ndecorator==4.0.10\ndocutils==0.12\ndoit==0.29.0\ngensim==0.12.4\nGhost.py==0.2.3\nghp-import2==1.0.1\ngspread==0.4.1\nHeapDict==1.0.0\nhttplib2==0.9.2\nhusl==4.0.3\nijson==2.3\nipykernel==4.5.2\nipython==5.1.0\nipython-genutils==0.1.0\nipywidgets==5.2.2\nJinja2==2.8\njsonschema==2.5.1\njupyter==1.0.0\njupyter-client==4.4.0\njupyter-console==5.0.0\njupyter-contrib-core==0.3.0\njupyter-contrib-nbextensions==0.2.2\njupyter-core==4.2.1\njupyter-highlight-selected-word==0.0.5\njupyter-latex-envs==1.3.5.4\njupyter-nbextensions-configurator==0.2.3\nllvmlite==0.14.0\nlocket==0.2.0\nLogbook==1.0.0\nlxml==3.5.0\nMacFSEvents==0.7\nMako==1.0.4\nMarkdown==2.6.7\nMarkupSafe==0.23\nmemory-profiler==0.43\nmistune==0.7.3\nmultipledispatch==0.4.9\nnatsort==4.0.4\nnb-anacondacloud==1.2.0\nnb-conda==2.0.0\nnb-conda-kernels==2.0.0\nnb-config-manager==0.1.3\nnbbrowserpdf==0.2.1\nnbconvert==4.2.0\nnbformat==4.2.0\nnbpresent==3.0.2\nnetworkx==1.11\nNikola==7.7.7\nnltk==3.2.2\nnotebook==4.2.3\nnumba==0.29.0\nnumpy==1.11.3\noauth2client==4.0.0\nodo==0.5.0\npandas==0.19.2\npartd==0.3.6\npath.py==0.0.0\npathtools==0.1.2\npexpect==4.0.1\npickleshare==0.7.4\nPillow==3.4.2\nprompt-toolkit==1.0.9\npsutil==4.3.0\nptyprocess==0.5.1\npyasn1==0.1.9\npyasn1-modules==0.0.8\npycrypto==2.6.1\nPygments==2.1.3\nPyPDF2==1.25.1\nPyRSS2Gen==1.1\npyshp==1.2.10\npython-dateutil==2.6.0\npytz==2016.10\nPyYAML==3.12\npyzmq==16.0.2\nqtconsole==4.2.1\nrequests==2.12.3\nrsa==3.4.2\nscipy==0.18.1\nsimplegeneric==0.8.1\nsix==1.10.0\nsmart-open==1.3.5\nterminado==0.6\ntextblob==0.11.1\ntoolz==0.8.1\ntornado==4.4.2\ntraitlets==4.3.1\nUnidecode==0.4.19\nverifyOCR==0.1\nwatchdog==0.8.3\nwcwidth==0.1.7\nwebassets==0.11.1\nwidgetsnbextension==1.2.6\nws4py==0.3.4\nxarray==0.8.2\nYapsy==1.11.223\n",
"name": "stdout"
}
]
},
{
"metadata": {
"collapsed": true,
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"toc": {
"sideBar": false,
"toc_cell": true,
"nav_menu": {
"width": "252px",
"height": "102px"
},
"number_sections": true,
"threshold": "3",
"navigate_menu": true,
"toc_section_display": "block",
"toc_window_display": true
},
"anaconda-cloud": {},
"_draft": {
"nbviewer_url": "https://gist.github.com/97eeac0bf83365af7fd00bc6a0151554"
},
"gist": {
"id": "97eeac0bf83365af7fd00bc6a0151554",
"data": {
"description": "drafts/code/ocr_evaluation_and_correction/2017-01-06-corpus-OCR-overview-data.ipynb update",
"public": true
}
},
"language_info": {
"pygments_lexer": "ipython3",
"file_extension": ".py",
"version": "3.5.2",
"mimetype": "text/x-python",
"name": "python",
"codemirror_mode": {
"version": 3,
"name": "ipython"
},
"nbconvert_exporter": "python"
},
"kernelspec": {
"language": "python",
"display_name": "Python [conda env:dissertation2]",
"name": "conda-env-dissertation2-py"
}
},
"nbformat": 4,
"nbformat_minor": 2
}