Last active
August 4, 2018 12:53
-
-
Save wasi0013/ad6a5d3faab911eee1b998897b57b15d to your computer and use it in GitHub Desktop.
Solves basic alpha numeric captchas using pytesseract and PIL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Solves basic alphanumeric captchas using pytesseract and PIL.
NOTE: This program was written to test the difficulty of captchas produced by captcha generators.
PLEASE DO NOT USE IT FOR SPAMMING OR ABUSING SYSTEMS!
Usage: | |
>>> get_captcha_text("https://i.imgur.com/4u7PESk.png") | |
'hWA K n h' | |
It is also possible to use a proxy, e.g.:
>>> proxies = {"http": "http://104.236.13.100:8888", "https": "http://104.236.13.100:8888"} | |
>>> get_captcha_text('https://i.imgur.com/4u7PESk.png', proxies=proxies) | |
'hWA K n h' | |
""" | |
__auth__ = "wasi0013" | |
import os
import subprocess
import sys

# Third-party dependencies are installed on demand. pip is invoked through the
# running interpreter so the package lands in the environment that is actually
# executing this file, and the import is retried afterwards so the name is
# bound (a failed install surfaces here as ImportError rather than as a
# NameError at first use).
try:
    import requests
except ImportError:
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'requests'])
    import requests

# also requires Google's Tesseract-OCR installed and, `tesseract` command must be available on terminal/cmd.
try:
    import pytesseract
except ImportError:
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'pytesseract'])
    import pytesseract

from PIL import Image, ImageEnhance, ImageFilter
def _chop_short_runs(data, coords, chop):
    """Whiten every run of at most `chop` consecutive dark pixels along `coords`."""
    count = len(coords)
    start = 0
    while start < count:
        # Light pixel: nothing to measure here.
        if data[coords[start]] > 128:
            start += 1
            continue
        # Measure the run of dark pixels beginning at `start`.
        end = start
        while end < count and data[coords[end]] < 128:
            end += 1
        if end - start <= chop:
            for point in coords[start:end]:
                data[point] = 255
        # Jump past the whole run so a long run is judged exactly once.
        # Re-entering it one pixel later would eventually see a tail of
        # length <= chop and erase the end of a stroke that should be kept.
        # A pixel equal to 128 is neither light nor dark; step over it.
        start = max(end, start + 1)


def clean_image(captcha_image, chop = 8):
    """
    clean_image(PIL_IMAGE, [optionally chop])

    takes a PIL_IMAGE and removes stripes & grids from it, in place:
    every horizontal run, and then every vertical run, of dark pixels
    (value below 128) that is at most `chop` pixels long is painted
    white (255). Longer runs are left untouched.

    Pixel values are compared with 128 directly, so the image has to be
    single-band (the callers in this file pass mode '1' images).

    the default value of chop is set to 8
    always returns True.
    """
    width, height = captcha_image.size
    data = captcha_image.load()

    # Horizontal pass: one scan line per row.
    for y in range(height):
        _chop_short_runs(data, [(x, y) for x in range(width)], chop)

    # Vertical pass: one scan line per column, on the already-cleaned rows.
    for x in range(width):
        _chop_short_runs(data, [(x, y) for y in range(height)], chop)

    return True
def get_captcha_text(url, proxies = None):
    """
    get_captcha_text(url, [optionally proxies]) -> str

    takes a captcha image url, loads it as a PIL Image, applies various
    filters & calls clean_image to remove all the stripes & grids (if any
    left), then runs pytesseract on the result.

    `proxies`, when given, is passed to `requests.get` unchanged, e.g.
    {"http": "http://host:port", "https": "http://host:port"}.

    The processed image is also written to `captcha.png` in the current
    working directory.

    returns the recognised text with everything except alphanumeric
    characters and spaces removed.
    raises requests.RequestException when the download fails, times out
    or answers with an HTTP error status.
    """
    from io import BytesIO

    # Fetch the whole body (content-decoded) and fail loudly on HTTP errors
    # instead of handing an error page to the image decoder.
    response = requests.get(url, proxies=proxies or None, timeout=30)
    response.raise_for_status()
    captcha_image = Image.open(BytesIO(response.content))

    # Image.ANTIALIAS was an alias of LANCZOS and is gone in Pillow 10;
    # fall back to it only where LANCZOS does not exist.
    resample = getattr(Image, 'LANCZOS', None)
    if resample is None:
        resample = Image.ANTIALIAS
    captcha_image = captcha_image.resize((1000, 200), resample)

    # NOTE(review): the source's indentation was lost; the loop body is
    # assumed to be the median filter only - confirm against the original.
    for _ in range(5):
        captcha_image = captcha_image.filter(ImageFilter.MedianFilter())
    captcha_image = ImageEnhance.Contrast(captcha_image).enhance(9)

    # First binarise + clean pass.
    captcha_image = captcha_image.convert('1')
    clean_image(captcha_image)

    # Sharpness factor 0 yields a blurred image; the contrast boost and the
    # second binarisation then turn that back into black & white before the
    # final clean pass.
    captcha_image = captcha_image.convert('RGBA')
    captcha_image = ImageEnhance.Sharpness(captcha_image).enhance(0)
    captcha_image = ImageEnhance.Contrast(captcha_image).enhance(4)
    captcha_image = captcha_image.convert('1')
    clean_image(captcha_image)

    captcha_image.save('captcha.png')
    # OCR the in-memory image; re-opening the saved file would leave a file
    # handle open.
    text = pytesseract.image_to_string(captcha_image)
    return "".join(ch for ch in text if ch.isalnum() or ch == " ")
url = "https://i.imgur.com/4u7PESk.png"

# Run the demo only when this file is executed as a script, so importing the
# module does not trigger a network request and an OCR run.
if __name__ == "__main__":
    print(get_captcha_text(url))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample url: https://i.imgur.com/4u7PESk.png
Sample output: hWA K n h