Created
December 12, 2022 12:58
-
-
Save x1001000/3961575e890712e053619f492956ca36 to your computer and use it in GitHub Desktop.
EyeCanSee 0 to OCRmyPDF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"name": "EyeCanSee 0 to OCRmyPDF", | |
"authorship_tag": "ABX9TyPMa0B55Ei18CNEzBu2DJvA", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/x1001000/3961575e890712e053619f492956ca36/eyecansee-0-to-ocrmypdf.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"!apt update\n", | |
"!apt install tesseract-ocr\n", | |
"!apt install ghostscript\n", | |
"%pip install --quiet ocrmypdf\n", | |
"\n", | |
"# The apt packages above are older than the versions OCRmyPDF asks for, so one\n", | |
"# statement of ocrmypdf/subprocess/__init__.py is replaced by `pass`.\n", | |
"# NOTE(review): PATCH_LINE is presumably the `raise` of the version check in the\n", | |
"# ocrmypdf release this notebook was written against -- confirm, and ideally pin\n", | |
"# that release in the %pip line above.\n", | |
"import importlib\n", | |
"import importlib.util\n", | |
"from pathlib import Path\n", | |
"\n", | |
"PATCH_LINE = 346  # 1-based line number inside __init__.py\n", | |
"\n", | |
"# Locate the installed package instead of hard-coding a pythonX.Y site directory;\n", | |
"# find_spec on a top-level name does not import the package.\n", | |
"importlib.invalidate_caches()  # the package was installed after this kernel started\n", | |
"spec = importlib.util.find_spec('ocrmypdf')\n", | |
"if spec is None or not spec.submodule_search_locations:\n", | |
"    raise ModuleNotFoundError('ocrmypdf is not importable from this kernel')\n", | |
"target = Path(list(spec.submodule_search_locations)[0]) / 'subprocess' / '__init__.py'\n", | |
"\n", | |
"lines = target.read_text(encoding='utf-8').splitlines(keepends=True)\n", | |
"if len(lines) < PATCH_LINE:\n", | |
"    raise RuntimeError(f'{target} has only {len(lines)} lines; refusing to patch')\n", | |
"original = lines[PATCH_LINE - 1]\n", | |
"statement = original.strip()\n", | |
"if statement != 'pass':  # already patched: re-running the cell is a no-op\n", | |
"    if not statement.startswith('raise'):\n", | |
"        raise RuntimeError(f'line {PATCH_LINE} of {target} is {statement!r}, not a raise; refusing to patch')\n", | |
"    # Reuse the indentation of the replaced line so the enclosing block stays valid.\n", | |
"    indent = original[:len(original) - len(original.lstrip())]\n", | |
"    lines[PATCH_LINE - 1] = indent + 'pass\\n'\n", | |
"    target.write_text(''.join(lines), encoding='utf-8')\n", | |
"exit() # to restart runtime" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "etffOoGx5kZf", | |
"outputId": "7625fff3-0815-4d25-f7f8-9d097ef7ac62" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"\u001b[33m\r0% [Working]\u001b[0m\r \rIgn:1 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n", | |
"\u001b[33m\r0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [W\u001b[0m\r \rGet:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]\n", | |
"\u001b[33m\r0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [2\u001b[0m\u001b[33m\r0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [W\u001b[0m\r \rGet:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease [1,581 B]\n", | |
"\u001b[33m\r0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [C\u001b[0m\u001b[33m\r0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.39\u001b[0m\r \rHit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n", | |
"Get:5 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n", | |
"Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease\n", | |
"Get:7 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n", | |
"Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease\n", | |
"Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Packages [1,073 kB]\n", | |
"Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease\n", | |
"Get:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]\n", | |
"Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease\n", | |
"Hit:14 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n", | |
"Get:15 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [1,567 kB]\n", | |
"Get:16 http://security.ubuntu.com/ubuntu bionic-security/restricted amd64 Packages [1,311 kB]\n", | |
"Get:17 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [3,099 kB]\n", | |
"Get:18 http://archive.ubuntu.com/ubuntu bionic-updates/restricted amd64 Packages [1,352 kB]\n", | |
"Get:19 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [2,342 kB]\n", | |
"Get:20 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [3,524 kB]\n", | |
"Fetched 14.5 MB in 6s (2,272 kB/s)\n", | |
"Reading package lists... Done\n", | |
"Building dependency tree \n", | |
"Reading state information... Done\n", | |
"34 packages can be upgraded. Run 'apt list --upgradable' to see them.\n", | |
"Reading package lists... Done\n", | |
"Building dependency tree \n", | |
"Reading state information... Done\n", | |
"The following package was automatically installed and is no longer required:\n", | |
" libnvidia-common-460\n", | |
"Use 'apt autoremove' to remove it.\n", | |
"The following additional packages will be installed:\n", | |
" tesseract-ocr-eng tesseract-ocr-osd\n", | |
"The following NEW packages will be installed:\n", | |
" tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n", | |
"0 upgraded, 3 newly installed, 0 to remove and 34 not upgraded.\n", | |
"Need to get 4,795 kB of archives.\n", | |
"After this operation, 15.8 MB of additional disk space will be used.\n", | |
"Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]\n", | |
"Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]\n", | |
"Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]\n", | |
"Fetched 4,795 kB in 2s (2,974 kB/s)\n", | |
"Selecting previously unselected package tesseract-ocr-eng.\n", | |
"(Reading database ... 124013 files and directories currently installed.)\n", | |
"Preparing to unpack .../tesseract-ocr-eng_4.00~git24-0e00fe6-1.2_all.deb ...\n", | |
"Unpacking tesseract-ocr-eng (4.00~git24-0e00fe6-1.2) ...\n", | |
"Selecting previously unselected package tesseract-ocr-osd.\n", | |
"Preparing to unpack .../tesseract-ocr-osd_4.00~git24-0e00fe6-1.2_all.deb ...\n", | |
"Unpacking tesseract-ocr-osd (4.00~git24-0e00fe6-1.2) ...\n", | |
"Selecting previously unselected package tesseract-ocr.\n", | |
"Preparing to unpack .../tesseract-ocr_4.00~git2288-10f4998a-2_amd64.deb ...\n", | |
"Unpacking tesseract-ocr (4.00~git2288-10f4998a-2) ...\n", | |
"Setting up tesseract-ocr-osd (4.00~git24-0e00fe6-1.2) ...\n", | |
"Setting up tesseract-ocr-eng (4.00~git24-0e00fe6-1.2) ...\n", | |
"Setting up tesseract-ocr (4.00~git2288-10f4998a-2) ...\n", | |
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", | |
"Reading package lists... Done\n", | |
"Building dependency tree \n", | |
"Reading state information... Done\n", | |
"The following package was automatically installed and is no longer required:\n", | |
" libnvidia-common-460\n", | |
"Use 'apt autoremove' to remove it.\n", | |
"The following additional packages will be installed:\n", | |
" fonts-droid-fallback fonts-noto-mono gsfonts libcupsfilters1 libcupsimage2\n", | |
" libgs9 libgs9-common libijs-0.35 libjbig2dec0 poppler-data\n", | |
"Suggested packages:\n", | |
" fonts-noto ghostscript-x poppler-utils fonts-japanese-mincho\n", | |
" | fonts-ipafont-mincho fonts-japanese-gothic | fonts-ipafont-gothic\n", | |
" fonts-arphic-ukai fonts-arphic-uming fonts-nanum\n", | |
"The following NEW packages will be installed:\n", | |
" fonts-droid-fallback fonts-noto-mono ghostscript gsfonts libcupsfilters1\n", | |
" libcupsimage2 libgs9 libgs9-common libijs-0.35 libjbig2dec0 poppler-data\n", | |
"0 upgraded, 11 newly installed, 0 to remove and 34 not upgraded.\n", | |
"Need to get 14.1 MB of archives.\n", | |
"After this operation, 49.9 MB of additional disk space will be used.\n", | |
"Get:1 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-droid-fallback all 1:6.0.1r16-1.1 [1,805 kB]\n", | |
"Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 poppler-data all 0.4.8-2 [1,479 kB]\n", | |
"Get:3 http://archive.ubuntu.com/ubuntu bionic/main amd64 fonts-noto-mono all 20171026-2 [75.5 kB]\n", | |
"Get:4 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libcupsimage2 amd64 2.2.7-1ubuntu2.9 [18.6 kB]\n", | |
"Get:5 http://archive.ubuntu.com/ubuntu bionic/main amd64 libijs-0.35 amd64 0.35-13 [15.5 kB]\n", | |
"Get:6 http://archive.ubuntu.com/ubuntu bionic/main amd64 libjbig2dec0 amd64 0.13-6 [55.9 kB]\n", | |
"Get:7 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libgs9-common all 9.26~dfsg+0-0ubuntu0.18.04.17 [5,092 kB]\n", | |
"Get:8 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libgs9 amd64 9.26~dfsg+0-0ubuntu0.18.04.17 [2,267 kB]\n", | |
"Get:9 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 ghostscript amd64 9.26~dfsg+0-0ubuntu0.18.04.17 [51.3 kB]\n", | |
"Get:10 http://archive.ubuntu.com/ubuntu bionic/main amd64 gsfonts all 1:8.11+urwcyr1.0.7~pre44-4.4 [3,120 kB]\n", | |
"Get:11 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libcupsfilters1 amd64 1.20.2-0ubuntu3.1 [108 kB]\n", | |
"Fetched 14.1 MB in 3s (4,349 kB/s)\n", | |
"Selecting previously unselected package fonts-droid-fallback.\n", | |
"(Reading database ... 124060 files and directories currently installed.)\n", | |
"Preparing to unpack .../00-fonts-droid-fallback_1%3a6.0.1r16-1.1_all.deb ...\n", | |
"Unpacking fonts-droid-fallback (1:6.0.1r16-1.1) ...\n", | |
"Selecting previously unselected package poppler-data.\n", | |
"Preparing to unpack .../01-poppler-data_0.4.8-2_all.deb ...\n", | |
"Unpacking poppler-data (0.4.8-2) ...\n", | |
"Selecting previously unselected package fonts-noto-mono.\n", | |
"Preparing to unpack .../02-fonts-noto-mono_20171026-2_all.deb ...\n", | |
"Unpacking fonts-noto-mono (20171026-2) ...\n", | |
"Selecting previously unselected package libcupsimage2:amd64.\n", | |
"Preparing to unpack .../03-libcupsimage2_2.2.7-1ubuntu2.9_amd64.deb ...\n", | |
"Unpacking libcupsimage2:amd64 (2.2.7-1ubuntu2.9) ...\n", | |
"Selecting previously unselected package libijs-0.35:amd64.\n", | |
"Preparing to unpack .../04-libijs-0.35_0.35-13_amd64.deb ...\n", | |
"Unpacking libijs-0.35:amd64 (0.35-13) ...\n", | |
"Selecting previously unselected package libjbig2dec0:amd64.\n", | |
"Preparing to unpack .../05-libjbig2dec0_0.13-6_amd64.deb ...\n", | |
"Unpacking libjbig2dec0:amd64 (0.13-6) ...\n", | |
"Selecting previously unselected package libgs9-common.\n", | |
"Preparing to unpack .../06-libgs9-common_9.26~dfsg+0-0ubuntu0.18.04.17_all.deb ...\n", | |
"Unpacking libgs9-common (9.26~dfsg+0-0ubuntu0.18.04.17) ...\n", | |
"Selecting previously unselected package libgs9:amd64.\n", | |
"Preparing to unpack .../07-libgs9_9.26~dfsg+0-0ubuntu0.18.04.17_amd64.deb ...\n", | |
"Unpacking libgs9:amd64 (9.26~dfsg+0-0ubuntu0.18.04.17) ...\n", | |
"Selecting previously unselected package ghostscript.\n", | |
"Preparing to unpack .../08-ghostscript_9.26~dfsg+0-0ubuntu0.18.04.17_amd64.deb ...\n", | |
"Unpacking ghostscript (9.26~dfsg+0-0ubuntu0.18.04.17) ...\n", | |
"Selecting previously unselected package gsfonts.\n", | |
"Preparing to unpack .../09-gsfonts_1%3a8.11+urwcyr1.0.7~pre44-4.4_all.deb ...\n", | |
"Unpacking gsfonts (1:8.11+urwcyr1.0.7~pre44-4.4) ...\n", | |
"Selecting previously unselected package libcupsfilters1:amd64.\n", | |
"Preparing to unpack .../10-libcupsfilters1_1.20.2-0ubuntu3.1_amd64.deb ...\n", | |
"Unpacking libcupsfilters1:amd64 (1.20.2-0ubuntu3.1) ...\n", | |
"Setting up libgs9-common (9.26~dfsg+0-0ubuntu0.18.04.17) ...\n", | |
"Setting up fonts-droid-fallback (1:6.0.1r16-1.1) ...\n", | |
"Setting up gsfonts (1:8.11+urwcyr1.0.7~pre44-4.4) ...\n", | |
"Setting up poppler-data (0.4.8-2) ...\n", | |
"Setting up fonts-noto-mono (20171026-2) ...\n", | |
"Setting up libcupsfilters1:amd64 (1.20.2-0ubuntu3.1) ...\n", | |
"Setting up libcupsimage2:amd64 (2.2.7-1ubuntu2.9) ...\n", | |
"Setting up libjbig2dec0:amd64 (0.13-6) ...\n", | |
"Setting up libijs-0.35:amd64 (0.35-13) ...\n", | |
"Setting up libgs9:amd64 (9.26~dfsg+0-0ubuntu0.18.04.17) ...\n", | |
"Setting up ghostscript (9.26~dfsg+0-0ubuntu0.18.04.17) ...\n", | |
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n", | |
"Processing triggers for fontconfig (2.12.6-0ubuntu2) ...\n", | |
"Processing triggers for libc-bin (2.27-3ubuntu1.6) ...\n", | |
"\u001b[K |████████████████████████████████| 122 kB 8.6 MB/s \n", | |
"\u001b[K |████████████████████████████████| 97 kB 7.7 MB/s \n", | |
"\u001b[K |████████████████████████████████| 5.6 MB 26.8 MB/s \n", | |
"\u001b[K |████████████████████████████████| 46 kB 3.4 MB/s \n", | |
"\u001b[K |████████████████████████████████| 2.6 MB 57.2 MB/s \n", | |
"\u001b[K |████████████████████████████████| 2.8 MB 53.8 MB/s \n", | |
"\u001b[K |████████████████████████████████| 3.2 MB 54.6 MB/s \n", | |
"\u001b[K |████████████████████████████████| 86 kB 6.1 MB/s \n", | |
"\u001b[K |████████████████████████████████| 4.0 MB 53.7 MB/s \n", | |
"\u001b[?25h Building wheel for img2pdf (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", | |
"pytest 3.6.4 requires pluggy<0.8,>=0.5, but you have pluggy 1.0.0 which is incompatible.\u001b[0m\n", | |
"/usr/local/lib/python3.8/dist-packages/ocrmypdf/subprocess\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# %pip (rather than !pip) installs into the interpreter this kernel is running.\n", | |
"%pip install --quiet PyPDF2" | |
], | |
"metadata": { | |
"id": "LzRTcX0LQ_Su", | |
"outputId": "cd70c7ee-be96-4ad5-a2af-759e1e90bdc7", | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
} | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"\u001b[?25l\r\u001b[K |█▌ | 10 kB 22.4 MB/s eta 0:00:01\r\u001b[K |███ | 20 kB 13.3 MB/s eta 0:00:01\r\u001b[K |████▍ | 30 kB 17.8 MB/s eta 0:00:01\r\u001b[K |█████▉ | 40 kB 6.6 MB/s eta 0:00:01\r\u001b[K |███████▍ | 51 kB 5.5 MB/s eta 0:00:01\r\u001b[K |████████▉ | 61 kB 6.5 MB/s eta 0:00:01\r\u001b[K |██████████▎ | 71 kB 7.2 MB/s eta 0:00:01\r\u001b[K |███████████▊ | 81 kB 5.8 MB/s eta 0:00:01\r\u001b[K |█████████████▎ | 92 kB 6.5 MB/s eta 0:00:01\r\u001b[K |██████████████▊ | 102 kB 5.9 MB/s eta 0:00:01\r\u001b[K |████████████████▏ | 112 kB 5.9 MB/s eta 0:00:01\r\u001b[K |█████████████████▋ | 122 kB 5.9 MB/s eta 0:00:01\r\u001b[K |███████████████████▏ | 133 kB 5.9 MB/s eta 0:00:01\r\u001b[K |████████████████████▋ | 143 kB 5.9 MB/s eta 0:00:01\r\u001b[K |██████████████████████ | 153 kB 5.9 MB/s eta 0:00:01\r\u001b[K |███████████████████████▌ | 163 kB 5.9 MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 174 kB 5.9 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▌ | 184 kB 5.9 MB/s eta 0:00:01\r\u001b[K |████████████████████████████ | 194 kB 5.9 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▍ | 204 kB 5.9 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▉ | 215 kB 5.9 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 222 kB 5.9 MB/s \n", | |
"\u001b[?25h" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from google.colab import files\n", | |
"\n", | |
"# The merge cell opens 131.pdf and EyeCanSee.pdf by name, so stop here with a\n", | |
"# readable message when the upload yields nothing.\n", | |
"uploaded = files.upload()\n", | |
"if not uploaded:\n", | |
"    raise RuntimeError('Nothing was uploaded; run this cell again and pick 131.pdf and EyeCanSee.pdf.')" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 107 | |
}, | |
"id": "R4PXpI3gs2H0", | |
"outputId": "56ba8234-1366-4143-8254-191a34e016a4" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "display_data", | |
"data": { | |
"text/plain": [ | |
"<IPython.core.display.HTML object>" | |
], | |
"text/html": [ | |
"\n", | |
" <input type=\"file\" id=\"files-3e90de50-0006-4174-87e8-8d0974b8a533\" name=\"files[]\" multiple disabled\n", | |
" style=\"border:none\" />\n", | |
" <output id=\"result-3e90de50-0006-4174-87e8-8d0974b8a533\">\n", | |
" Upload widget is only available when the cell has been executed in the\n", | |
" current browser session. Please rerun this cell to enable.\n", | |
" </output>\n", | |
" <script>// Copyright 2017 Google LLC\n", | |
"//\n", | |
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n", | |
"// you may not use this file except in compliance with the License.\n", | |
"// You may obtain a copy of the License at\n", | |
"//\n", | |
"// http://www.apache.org/licenses/LICENSE-2.0\n", | |
"//\n", | |
"// Unless required by applicable law or agreed to in writing, software\n", | |
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n", | |
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", | |
"// See the License for the specific language governing permissions and\n", | |
"// limitations under the License.\n", | |
"\n", | |
"/**\n", | |
" * @fileoverview Helpers for google.colab Python module.\n", | |
" */\n", | |
"(function(scope) {\n", | |
"function span(text, styleAttributes = {}) {\n", | |
" const element = document.createElement('span');\n", | |
" element.textContent = text;\n", | |
" for (const key of Object.keys(styleAttributes)) {\n", | |
" element.style[key] = styleAttributes[key];\n", | |
" }\n", | |
" return element;\n", | |
"}\n", | |
"\n", | |
"// Max number of bytes which will be uploaded at a time.\n", | |
"const MAX_PAYLOAD_SIZE = 100 * 1024;\n", | |
"\n", | |
"function _uploadFiles(inputId, outputId) {\n", | |
" const steps = uploadFilesStep(inputId, outputId);\n", | |
" const outputElement = document.getElementById(outputId);\n", | |
" // Cache steps on the outputElement to make it available for the next call\n", | |
" // to uploadFilesContinue from Python.\n", | |
" outputElement.steps = steps;\n", | |
"\n", | |
" return _uploadFilesContinue(outputId);\n", | |
"}\n", | |
"\n", | |
"// This is roughly an async generator (not supported in the browser yet),\n", | |
"// where there are multiple asynchronous steps and the Python side is going\n", | |
"// to poll for completion of each step.\n", | |
"// This uses a Promise to block the python side on completion of each step,\n", | |
"// then passes the result of the previous step as the input to the next step.\n", | |
"function _uploadFilesContinue(outputId) {\n", | |
" const outputElement = document.getElementById(outputId);\n", | |
" const steps = outputElement.steps;\n", | |
"\n", | |
" const next = steps.next(outputElement.lastPromiseValue);\n", | |
" return Promise.resolve(next.value.promise).then((value) => {\n", | |
" // Cache the last promise value to make it available to the next\n", | |
" // step of the generator.\n", | |
" outputElement.lastPromiseValue = value;\n", | |
" return next.value.response;\n", | |
" });\n", | |
"}\n", | |
"\n", | |
"/**\n", | |
" * Generator function which is called between each async step of the upload\n", | |
" * process.\n", | |
" * @param {string} inputId Element ID of the input file picker element.\n", | |
" * @param {string} outputId Element ID of the output display.\n", | |
" * @return {!Iterable<!Object>} Iterable of next steps.\n", | |
" */\n", | |
"function* uploadFilesStep(inputId, outputId) {\n", | |
" const inputElement = document.getElementById(inputId);\n", | |
" inputElement.disabled = false;\n", | |
"\n", | |
" const outputElement = document.getElementById(outputId);\n", | |
" outputElement.innerHTML = '';\n", | |
"\n", | |
" const pickedPromise = new Promise((resolve) => {\n", | |
" inputElement.addEventListener('change', (e) => {\n", | |
" resolve(e.target.files);\n", | |
" });\n", | |
" });\n", | |
"\n", | |
" const cancel = document.createElement('button');\n", | |
" inputElement.parentElement.appendChild(cancel);\n", | |
" cancel.textContent = 'Cancel upload';\n", | |
" const cancelPromise = new Promise((resolve) => {\n", | |
" cancel.onclick = () => {\n", | |
" resolve(null);\n", | |
" };\n", | |
" });\n", | |
"\n", | |
" // Wait for the user to pick the files.\n", | |
" const files = yield {\n", | |
" promise: Promise.race([pickedPromise, cancelPromise]),\n", | |
" response: {\n", | |
" action: 'starting',\n", | |
" }\n", | |
" };\n", | |
"\n", | |
" cancel.remove();\n", | |
"\n", | |
" // Disable the input element since further picks are not allowed.\n", | |
" inputElement.disabled = true;\n", | |
"\n", | |
" if (!files) {\n", | |
" return {\n", | |
" response: {\n", | |
" action: 'complete',\n", | |
" }\n", | |
" };\n", | |
" }\n", | |
"\n", | |
" for (const file of files) {\n", | |
" const li = document.createElement('li');\n", | |
" li.append(span(file.name, {fontWeight: 'bold'}));\n", | |
" li.append(span(\n", | |
" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n", | |
" `last modified: ${\n", | |
" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n", | |
" 'n/a'} - `));\n", | |
" const percent = span('0% done');\n", | |
" li.appendChild(percent);\n", | |
"\n", | |
" outputElement.appendChild(li);\n", | |
"\n", | |
" const fileDataPromise = new Promise((resolve) => {\n", | |
" const reader = new FileReader();\n", | |
" reader.onload = (e) => {\n", | |
" resolve(e.target.result);\n", | |
" };\n", | |
" reader.readAsArrayBuffer(file);\n", | |
" });\n", | |
" // Wait for the data to be ready.\n", | |
" let fileData = yield {\n", | |
" promise: fileDataPromise,\n", | |
" response: {\n", | |
" action: 'continue',\n", | |
" }\n", | |
" };\n", | |
"\n", | |
" // Use a chunked sending to avoid message size limits. See b/62115660.\n", | |
" let position = 0;\n", | |
" do {\n", | |
" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n", | |
" const chunk = new Uint8Array(fileData, position, length);\n", | |
" position += length;\n", | |
"\n", | |
" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n", | |
" yield {\n", | |
" response: {\n", | |
" action: 'append',\n", | |
" file: file.name,\n", | |
" data: base64,\n", | |
" },\n", | |
" };\n", | |
"\n", | |
" let percentDone = fileData.byteLength === 0 ?\n", | |
" 100 :\n", | |
" Math.round((position / fileData.byteLength) * 100);\n", | |
" percent.textContent = `${percentDone}% done`;\n", | |
"\n", | |
" } while (position < fileData.byteLength);\n", | |
" }\n", | |
"\n", | |
" // All done.\n", | |
" yield {\n", | |
" response: {\n", | |
" action: 'complete',\n", | |
" }\n", | |
" };\n", | |
"}\n", | |
"\n", | |
"scope.google = scope.google || {};\n", | |
"scope.google.colab = scope.google.colab || {};\n", | |
"scope.google.colab._files = {\n", | |
" _uploadFiles,\n", | |
" _uploadFilesContinue,\n", | |
"};\n", | |
"})(self);\n", | |
"</script> " | |
] | |
}, | |
"metadata": {} | |
}, | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Saving 131.pdf to 131.pdf\n", | |
"Saving EyeCanSee.pdf to EyeCanSee.pdf\n" | |
] | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from PyPDF2 import PdfReader, PdfWriter\n", | |
"\n", | |
"reader = PdfReader('131.pdf')\n", | |
"# Only the first page of EyeCanSee.pdf is used; it is the base of output page 1.\n", | |
"EyeCanSee = PdfReader('EyeCanSee.pdf').pages[0]\n", | |
"writer = PdfWriter()\n", | |
"for index, page in enumerate(reader.pages):\n", | |
"    if index == 0:\n", | |
"        # Merge page 1 of 131.pdf into the EyeCanSee page and adopt its media box.\n", | |
"        EyeCanSee.merge_page(page)\n", | |
"        EyeCanSee.mediabox = page.mediabox\n", | |
"        writer.add_page(EyeCanSee)\n", | |
"    else:\n", | |
"        # Every later page is copied through unchanged.\n", | |
"        writer.add_page(page)\n", | |
"with open(\"131.EyeCanSee.pdf\", \"wb\") as fp:\n", | |
"    writer.write(fp)" | |
], | |
"metadata": { | |
"id": "iToCoh0yjoLh" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"import ocrmypdf\n", | |
"\n", | |
"# Input is the file written by the merge cell; the OCR result goes to a new file.\n", | |
"merged_pdf = '131.EyeCanSee.pdf'\n", | |
"ocr_pdf = '131.OCR.pdf'\n", | |
"ocrmypdf.ocr(merged_pdf, ocr_pdf, force_ocr=True)" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "JcDY2hNjmJJq", | |
"outputId": "650aa969-f0a8-46c0-cff6-1c1b57d375b3" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stderr", | |
"text": [ | |
"ERROR:ocrmypdf.subprocess:\n", | |
"OCRmyPDF requires 'tesseract' 4.1.1 or higher. Your system appears\n", | |
"to have 4.0.0-beta.1. Please update this program.\n", | |
"\n", | |
"ERROR:ocrmypdf.subprocess:\n", | |
"OCRmyPDF requires 'gs' 9.50 or higher. Your system appears\n", | |
"to have 9.26. Please update this program.\n", | |
"\n", | |
"Scanning contents: 100%|██████████| 3/3 [00:05<00:00, 1.88s/page]\n", | |
"OCR: 0%| | 0.0/3.0 [00:00<?, ?page/s]WARNING:ocrmypdf._pipeline:page has no images - all vector content will be rasterized at 400 DPI, losing some resolution and likely increasing file size. Use --oversample to adjust the DPI.\n", | |
"WARNING:ocrmypdf._pipeline:page has no images - all vector content will be rasterized at 400 DPI, losing some resolution and likely increasing file size. Use --oversample to adjust the DPI.\n", | |
"OCR: 100%|██████████| 3.0/3.0 [00:15<00:00, 5.17s/page]\n", | |
"PDF/A conversion: 100%|██████████| 3/3 [00:02<00:00, 1.03page/s]\n", | |
"Recompressing JPEGs: 0image [00:00, ?image/s]\n", | |
"Deflating JPEGs: 100%|██████████| 1/1 [00:00<00:00, 33.63image/s]\n", | |
"JBIG2: 0item [00:00, ?item/s]\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [ | |
"<ExitCode.ok: 0>" | |
] | |
}, | |
"metadata": {}, | |
"execution_count": 3 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment