Skip to content

Instantly share code, notes, and snippets.

@wasi0013
Last active August 4, 2018 12:53
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save wasi0013/ad6a5d3faab911eee1b998897b57b15d to your computer and use it in GitHub Desktop.
Save wasi0013/ad6a5d3faab911eee1b998897b57b15d to your computer and use it in GitHub Desktop.
Solves basic alpha numeric captchas using pytesseract and PIL
"""
Solves basic alpha numeric captchas using pytesseract and PIL
NOTE: This program was written to test the difficulty of captchas produced by a captcha generator.
PLEASE DO NOT USE IT FOR SPAMMING OR ABUSING SYSTEMS!
Usage:
>>> get_captcha_text("https://i.imgur.com/4u7PESk.png")
'hWA K n h'
It is also possible to use a proxy, e.g.:
>>> proxies = {"http": "http://104.236.13.100:8888", "https": "http://104.236.13.100:8888"}
>>> get_captcha_text('https://i.imgur.com/4u7PESk.png', proxies=proxies)
'hWA K n h'
"""
__auth__ = "wasi0013"
import os
import subprocess
import sys

# Third-party dependencies are installed on demand. The install is run through
# the interpreter that is executing this file (``sys.executable -m pip``) so the
# package lands in the right environment, and the import is retried afterwards
# so the name is actually bound on the first run.
try:
    import requests
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests"])
    import requests

# pytesseract also requires Google's Tesseract-OCR to be installed, and the
# `tesseract` command must be available on the terminal/cmd PATH.
try:
    import pytesseract
except ImportError:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "pytesseract"])
    import pytesseract

from PIL import Image, ImageEnhance, ImageFilter
def _erase_short_runs(data, line_count, line_length, chop, coord):
    """Whiten every dark run of at most `chop` pixels along each scan line.

    `coord(pos, line)` maps a position on a scan line to an (x, y) pixel key,
    which lets the same scan serve both the row-wise and column-wise passes.
    """
    for line in range(line_count):
        pos = 0
        while pos < line_length:
            if data[coord(pos, line)] > 128:
                pos += 1
                continue
            # Measure the run of dark pixels starting at `pos`.
            run = 0
            while pos + run < line_length and data[coord(pos + run, line)] < 128:
                run += 1
            if run <= chop:
                for offset in range(run):
                    data[coord(pos + offset, line)] = 255
            # Skip the whole run so the tail of a long run is never re-examined
            # (and wrongly erased) as if it were a short one. A pixel of exactly
            # 128 yields run == 0, so always advance by at least one.
            pos += max(run, 1)


def clean_image(captcha_image, chop = 8):
    """
    clean_image(PIL_IMAGE, [optionally chop]) -> True

    Removes thin stripes & grid lines from a PIL image, in place.

    The image is scanned row by row and then column by column. Any run of
    dark pixels (value < 128) that is at most `chop` pixels long is painted
    white (255); longer runs are left untouched in full.

    captcha_image: image whose pixels compare against integers, e.g. mode
                   '1' or 'L'.
    chop:          maximum run length, in pixels, that is treated as noise.
                   The default value is 8.

    Always returns True; the result is the modified `captcha_image`.
    """
    width, height = captcha_image.size
    data = captcha_image.load()
    # Horizontal pass: scan each row left to right.
    _erase_short_runs(data, height, width, chop, lambda pos, line: (pos, line))
    # Vertical pass: scan each column top to bottom.
    _erase_short_runs(data, width, height, chop, lambda pos, line: (line, pos))
    return True
def get_captcha_text(url, proxies = None):
    """
    get_captcha_text(url, [optionally proxies]) -> str

    Downloads the captcha image at `url`, loads it as a PIL Image, applies
    resize / median / contrast / sharpness filters, calls clean_image to
    remove the stripes & grids (if any are left) and runs pytesseract on the
    result.

    url:     address of the captcha image.
    proxies: optional requests-style proxy mapping, e.g.
             {"http": "http://host:port", "https": "http://host:port"}.

    Returns the recognised text, reduced to alphanumeric characters and
    spaces.
    Raises requests.HTTPError if the server answers with an error status.
    Side effect: the cleaned image is written to 'captcha.png' in the
    current working directory.
    """
    import io  # local import keeps this function self-contained

    # proxies=None behaves exactly like omitting the argument, so a single
    # call covers both cases.
    response = requests.get(url, proxies=proxies)
    # Fail on HTTP errors instead of handing an error page to PIL.
    response.raise_for_status()
    captcha_image = Image.open(io.BytesIO(response.content))

    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    captcha_image = captcha_image.resize((1000, 200), Image.LANCZOS)

    # NOTE(review): the original indentation was lost; only the median filter
    # is assumed to be inside this loop -- confirm against the original.
    for _ in range(5):
        captcha_image = captcha_image.filter(ImageFilter.MedianFilter())
    captcha_image = ImageEnhance.Contrast(captcha_image).enhance(9)

    # First clean-up on a 1-bit image.
    captcha_image = captcha_image.convert('1')
    clean_image(captcha_image)

    # Second round: sharpness factor 0, contrast boost, 1-bit conversion and
    # another clean-up.
    captcha_image = captcha_image.convert('RGBA')
    captcha_image = ImageEnhance.Sharpness(captcha_image).enhance(0)
    captcha_image = ImageEnhance.Contrast(captcha_image).enhance(4)
    captcha_image = captcha_image.convert('1')
    clean_image(captcha_image)

    # Still written to disk as before, but OCR runs on the in-memory image so
    # no file handle is left open.
    captcha_image.save('captcha.png')
    text = pytesseract.image_to_string(captcha_image)
    return "".join(ch for ch in text if ch.isalnum() or ch == " ")
if __name__ == "__main__":
    # Demo run on the sample captcha. Guarded so that importing this module
    # for get_captcha_text() does not trigger a network request and OCR pass.
    url = "https://i.imgur.com/4u7PESk.png"
    print(get_captcha_text(url))
@wasi0013
Copy link
Author

wasi0013 commented Sep 8, 2017

Sample url: https://i.imgur.com/4u7PESk.png
img

Sample output: hWA K n h

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment