Last active
October 20, 2020 10:32
-
-
Save gnilchee/a515cee4808896f350d4a9767c1d56d8 to your computer and use it in GitHub Desktop.
Convert captcha images so they can be read by tesseract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# apt-get install -y imagemagick tesseract-ocr tesseract-ocr-eng python-pip
# pip install Image pytesseract
# python convert.py 2 captcha.png captcha-clean.png
# convert captcha-clean.png -resize 500 captcha-clean-big.png
# tesseract -c tessedit_char_whitelist=abcdefghijklmnopqrstuvwxyz1234567890 -psm 7 captcha-clean-big.png output
# cat output.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
## pip install Image
## python convert.py float(1-3) input.png output.png
import sys

from PIL import Image
# Pixel values strictly below this are treated as dark ("ink"); everything
# else is treated as background and left untouched.
DARK_THRESHOLD = 128
WHITE = 255


def _erase_short_runs(data, length, chomp, coord):
    """Whiten every dark run of at most `chomp` pixels along one scan line.

    data   -- pixel container indexed as data[(x, y)], readable and writable
    length -- number of pixels on the scan line
    chomp  -- longest run length (in pixels) that is considered noise
    coord  -- maps a position on the scan line to an (x, y) tuple
    """
    pos = 0
    while pos < length:
        if data[coord(pos)] >= DARK_THRESHOLD:
            pos += 1
            continue

        # Find the end of the dark run that starts at `pos`.
        end = pos
        while end < length and data[coord(end)] < DARK_THRESHOLD:
            end += 1

        if end - pos <= chomp:
            for i in range(pos, end):
                data[coord(i)] = WHITE

        # Jump past the whole run.  Incrementing a `for` loop variable has no
        # effect in Python, so a `for` based scan would re-enter the same run
        # one pixel further in and eventually erase the last `chomp` pixels
        # of every long run.
        pos = end


def remove_noise(data, width, height, chomp):
    """Erase thin dark lines and specks from a pixel grid, in place.

    The grid is scanned row by row and then column by column; on each scan
    line, any run of dark pixels whose length is at most `chomp` is set to
    white.  Runs longer than `chomp` are kept intact.

    data   -- pixel container indexed as data[x, y] (for example the object
              returned by PIL's Image.load())
    width  -- number of columns in the grid
    height -- number of rows in the grid
    chomp  -- longest run length (in pixels) that is considered noise
    """
    # Horizontal pass: removes short runs along each row.
    for y in range(height):
        _erase_short_runs(data, width, chomp, lambda x, y=y: (x, y))
    # Vertical pass: removes short runs along each column.
    for x in range(width):
        _erase_short_runs(data, height, chomp, lambda y, x=x: (x, y))


def main(argv):
    """Command-line entry point: convert.py CHOMP INPUT OUTPUT."""
    if len(argv) < 4:
        sys.exit("usage: python convert.py float(1-3) input.png output.png")

    chomp = float(argv[1])
    # Convert to PIL mode '1' before filtering, as the original script did.
    image = Image.open(argv[2]).convert('1')
    width, height = image.size
    # The pixel-access object writes through to `image`.
    remove_noise(image.load(), width, height, chomp)
    image.save(argv[3])


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment