Skip to content

Instantly share code, notes, and snippets.

@y-o-u
Created December 9, 2021 06:47
Show Gist options
  • Save y-o-u/a1b1e4af34a7ea8c6e1eb105a8c65f10 to your computer and use it in GitHub Desktop.
Save y-o-u/a1b1e4af34a7ea8c6e1eb105a8c65f10 to your computer and use it in GitHub Desktop.
Untitled0.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Untitled0.ipynb",
"provenance": [],
"authorship_tag": "ABX9TyPI9wTSkjW7X+tGON8x3hfB",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/y-o-u/a1b1e4af34a7ea8c6e1eb105a8c65f10/untitled0.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"source": [
"# System packages: Tesseract OCR engine + Japanese language data, and poppler (needed by pdf2image).\n",
"# apt-get with -y runs non-interactively, so the cell cannot stall on a confirmation prompt.\n",
"!apt-get install -y -q tesseract-ocr libtesseract-dev poppler-utils tesseract-ocr-jpn"
],
"metadata": {
"id": "1l_rjgS05wZn"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zeHP-vw-5VZw"
},
"outputs": [],
"source": [
"# %pip (rather than a bare `pip`) installs into the environment of the running kernel.\n",
"%pip install pyocr"
]
},
{
"cell_type": "code",
"source": [
"# %pip (rather than a bare `pip`) installs into the environment of the running kernel.\n",
"%pip install pdf2image"
],
"metadata": {
"id": "1Is7oP3y59Di"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"from PIL import Image\n",
"import sys\n",
"import pyocr\n",
"import pyocr.builders\n",
"import pdf2image\n",
"\n",
"# OCR a single page of a PDF: render the PDF to images with pdf2image,\n",
"# then run the first available pyocr tool on the selected page and print the text.\n",
"\n",
"PDF_PATH = \"000667876.pdf\"\n",
"PAGE_INDEX = 3  # zero-based index into the rendered pages (3 -> fourth page)\n",
"DPI = 200\n",
"# OCR language code; use 'eng' for English documents\n",
"lang = 'jpn'\n",
"\n",
"tools = pyocr.get_available_tools()\n",
"if not tools:\n",
"    # Raise instead of sys.exit(): in a notebook an exception stops the cell\n",
"    # with a readable message and leaves the kernel running.\n",
"    raise RuntimeError(\"No OCR tool found\")\n",
"\n",
"# The tools are returned in the recommended order of usage\n",
"tool = tools[0]\n",
"\n",
"# Fail early with a clear message when the language data is not installed.\n",
"available_langs = tool.get_available_languages()\n",
"if lang not in available_langs:\n",
"    raise RuntimeError(\n",
"        \"OCR language {!r} is not installed (available: {})\".format(lang, available_langs)\n",
"    )\n",
"\n",
"images = pdf2image.convert_from_path(PDF_PATH, dpi=DPI, fmt='jpg')\n",
"if not 0 <= PAGE_INDEX < len(images):\n",
"    raise IndexError(\n",
"        \"PAGE_INDEX {} is out of range: {} has {} page(s)\".format(PAGE_INDEX, PDF_PATH, len(images))\n",
"    )\n",
"\n",
"# Convert the image object to text.\n",
"# (To OCR every page instead, loop over `images` and call image_to_string on each.)\n",
"txt = tool.image_to_string(\n",
"    images[PAGE_INDEX],\n",
"    lang=lang,\n",
"    builder=pyocr.builders.TextBuilder()\n",
")\n",
"\n",
"# No trailing exit(): calling it in IPython/Colab shuts the kernel down and discards all state.\n",
"print(txt)"
],
"metadata": {
"id": "W3hZF6Jn6B9z"
},
"execution_count": null,
"outputs": []
}
]
}
@Mgloria91
Copy link

Thank you

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment