Created
December 9, 2021 06:47
-
-
Save y-o-u/a1b1e4af34a7ea8c6e1eb105a8c65f10 to your computer and use it in GitHub Desktop.
Untitled0.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Untitled0.ipynb", | |
"provenance": [], | |
"authorship_tag": "ABX9TyPI9wTSkjW7X+tGON8x3hfB", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/y-o-u/a1b1e4af34a7ea8c6e1eb105a8c65f10/untitled0.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# System packages: Tesseract OCR engine + Japanese language data, and poppler (used by pdf2image).\n", | |
"# apt-get with -y keeps the install non-interactive; plain `apt` is meant for interactive use.\n", | |
"!apt-get install -y tesseract-ocr libtesseract-dev poppler-utils tesseract-ocr-jpn" | |
], | |
"metadata": { | |
"id": "1l_rjgS05wZn" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"id": "zeHP-vw-5VZw" | |
}, | |
"outputs": [], | |
"source": [ | |
"# Use the %pip magic explicitly so the package is installed into the running kernel's environment\n", | |
"# (a bare `pip install` only works when IPython automagic is enabled).\n", | |
"%pip install pyocr" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"# Use the %pip magic explicitly so the package is installed into the running kernel's environment\n", | |
"# (a bare `pip install` only works when IPython automagic is enabled).\n", | |
"%pip install pdf2image" | |
], | |
"metadata": { | |
"id": "1Is7oP3y59Di" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"from PIL import Image\n", | |
"import sys\n", | |
"import pyocr\n", | |
"import pyocr.builders\n", | |
"import pdf2image\n", | |
"\n", | |
"# OCR a single page of a PDF: rasterize the page with pdf2image, then run it through\n", | |
"# the first OCR tool that pyocr reports as available.\n", | |
"\n", | |
"# Settings a reader may want to change.\n", | |
"PDF_PATH = \"000667876.pdf\"\n", | |
"PAGE_NUMBER = 4  # 1-based page of the PDF to OCR\n", | |
"#lang = 'eng'\n", | |
"lang = 'jpn'\n", | |
"\n", | |
"tools = pyocr.get_available_tools()\n", | |
"if len(tools) == 0:\n", | |
"    # Raise instead of calling sys.exit(): in a notebook an exception stops the cell\n", | |
"    # with a readable message, without touching the kernel.\n", | |
"    raise RuntimeError(\"No OCR tool found\")\n", | |
"\n", | |
"# The tools are returned in the recommended order of usage\n", | |
"tool = tools[0]\n", | |
"\n", | |
"# Rasterize only the requested page instead of the whole document.\n", | |
"images = pdf2image.convert_from_path(\n", | |
"    PDF_PATH,\n", | |
"    dpi=200,\n", | |
"    fmt='jpg',\n", | |
"    first_page=PAGE_NUMBER,\n", | |
"    last_page=PAGE_NUMBER,\n", | |
")\n", | |
"if not images:\n", | |
"    # Nothing was rendered, so the requested page is not available in this PDF.\n", | |
"    raise ValueError(f\"{PDF_PATH} has no page {PAGE_NUMBER}\")\n", | |
"\n", | |
"# Convert the page image to text\n", | |
"txt = tool.image_to_string(\n", | |
"    images[0],\n", | |
"    lang=lang,\n", | |
"    builder=pyocr.builders.TextBuilder()\n", | |
")\n", | |
"\n", | |
"# No exit() here: calling it in a notebook shuts down the kernel.\n", | |
"print(txt)" | |
], | |
"metadata": { | |
"id": "W3hZF6Jn6B9z" | |
}, | |
"execution_count": null, | |
"outputs": [] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you