wasi0013/captcha_solver.py

## captcha_solver.py
"""
Solves basic alphanumeric captchas using pytesseract and PIL.
NOTE: This program was written to test the difficulty of captchas produced by a captcha generator.
PLEASE DO NOT USE IT FOR SPAMMING OR ABUSING SYSTEMS!
Usage:
>>> get_captcha_text("https://i.imgur.com/4u7PESk.png")
'hWA K n h'

It is also possible to use proxy i.e:
>>> proxies = {"http": "http://104.236.13.100:8888", "https": "http://104.236.13.100:8888"}
>>> get_captcha_text('https://i.imgur.com/4u7PESk.png', proxies=proxies)
'hWA K n h'

"""

__auth__ = "wasi0013"

import os
import subprocess
import sys

# Third-party dependencies are installed on demand. The import is retried
# after installation so the module name is actually bound, and a failed
# install surfaces here instead of as a NameError at first use.
try:
    import requests
except ImportError:
    # Use the running interpreter's pip so the package lands in the
    # environment this module is executing in.
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "requests"]
    )
    import requests

# pytesseract also requires Google's Tesseract-OCR to be installed, and the
# `tesseract` command must be available on the terminal/cmd.
try:
    import pytesseract
except ImportError:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "pytesseract"]
    )
    import pytesseract
from PIL import Image, ImageEnhance, ImageFilter

def clean_image(captcha_image, chop=8):
    """
    clean_image(PIL_IMAGE, [optionally chop]) -> True

    Removes thin stripes & grid lines from a black-and-white image, in place.

    The image is scanned row by row and then column by column. Every
    unbroken run of dark pixels (value < 128) that is at most `chop` pixels
    long is painted white (255); longer runs are left untouched.
    The default value of chop is 8.

    captcha_image only needs a `size` attribute giving (width, height) and
    a `load()` method returning an object indexable as data[x, y].
    Always returns True.
    """
    width, height = captcha_image.size
    data = captcha_image.load()

    # Horizontal pass: erase short dark runs within each row.
    for y in range(height):
        x = 0
        while x < width:
            if data[x, y] > 128:
                x += 1
                continue
            run = 0
            while x + run < width and data[x + run, y] < 128:
                run += 1
            if run <= chop:
                for c in range(x, x + run):
                    data[c, y] = 255
            # Jump past the whole run so its tail is not measured again as a
            # shorter run and erased. A pixel of exactly 128 is neither
            # skipped nor counted (run == 0), so advance by at least one.
            x += max(run, 1)

    # Vertical pass: the same treatment for each column.
    for x in range(width):
        y = 0
        while y < height:
            if data[x, y] > 128:
                y += 1
                continue
            run = 0
            while y + run < height and data[x, y + run] < 128:
                run += 1
            if run <= chop:
                for c in range(y, y + run):
                    data[x, c] = 255
            y += max(run, 1)
    return True

def get_captcha_text(url, proxies=None, timeout=30):
    """
    get_captcha_text(url, [proxies], [timeout]) -> str

    Downloads the captcha image at `url`, applies median filtering,
    contrast enhancement and black-and-white conversion, calls clean_image
    to remove the remaining stripes & grids, and runs Tesseract on the
    result.

    url     -- address of the captcha image.
    proxies -- optional requests-style proxy mapping, e.g.
               {"http": "http://host:port", "https": "http://host:port"}.
    timeout -- seconds to wait for the server before giving up.

    Returns the recognised text with everything except alphanumeric
    characters and spaces stripped out.
    Raises requests.RequestException if the download fails, times out or
    the server answers with an HTTP error status.

    Side effect: the processed image is written to 'captcha.png' in the
    current working directory.
    """
    from io import BytesIO

    # proxies=None behaves exactly like omitting the argument, so a single
    # call covers both cases. The body is read fully (and content-decoded)
    # instead of handing PIL the undecoded raw socket stream.
    response = requests.get(url, proxies=proxies, timeout=timeout)
    response.raise_for_status()

    captcha_image = Image.open(BytesIO(response.content))
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    captcha_image = captcha_image.resize((1000, 200), Image.LANCZOS)
    for _ in range(5):
        captcha_image = captcha_image.filter(ImageFilter.MedianFilter())

    # First round: boost contrast, reduce to 1-bit and strip thin lines.
    captcha_image = ImageEnhance.Contrast(captcha_image).enhance(9)
    captcha_image = captcha_image.convert('1')
    clean_image(captcha_image)

    # Second round: a sharpness factor of 0 blurs the image, after which it
    # is contrast-boosted, reduced to 1-bit and cleaned once more.
    captcha_image = captcha_image.convert('RGBA')
    captcha_image = ImageEnhance.Sharpness(captcha_image).enhance(0)
    captcha_image = ImageEnhance.Contrast(captcha_image).enhance(4)
    captcha_image = captcha_image.convert('1')
    clean_image(captcha_image)
    captcha_image.save('captcha.png')

    # Pass the in-memory image directly; reopening the saved file left an
    # unclosed file handle behind.
    text = pytesseract.image_to_string(captcha_image)
    return "".join(ch for ch in text if ch.isalnum() or ch == " ")

# Demo: solve the sample captcha. The call is guarded so that importing this
# module does not trigger a download and an OCR run.
url = "https://i.imgur.com/4u7PESk.png"
if __name__ == "__main__":
    print(get_captcha_text(url))
	"""
	Solves basic alphanumeric captchas using pytesseract and PIL.
	NOTE: This program was written to test the difficulty of captchas produced by a captcha generator.
	PLEASE DO NOT USE IT FOR SPAMMING OR ABUSING SYSTEMS!
	Usage:
	>>> get_captcha_text("https://i.imgur.com/4u7PESk.png")
	'hWA K n h'

	It is also possible to use proxy i.e:
	>>> proxies = {"http": "http://104.236.13.100:8888", "https": "http://104.236.13.100:8888"}
	>>> get_captcha_text('https://i.imgur.com/4u7PESk.png', proxies=proxies)
	'hWA K n h'

	"""

	__auth__ = "wasi0013"

	import os
	import subprocess
	import sys

	# Third-party dependencies are installed on demand. The import is retried
	# after installation so the module name is actually bound, and a failed
	# install surfaces here instead of as a NameError at first use.
	try:
		import requests
	except ImportError:
		# Use the running interpreter's pip so the package lands in the
		# environment this module is executing in.
		subprocess.check_call(
			[sys.executable, "-m", "pip", "install", "requests"]
		)
		import requests

	# pytesseract also requires Google's Tesseract-OCR to be installed, and the
	# `tesseract` command must be available on the terminal/cmd.
	try:
		import pytesseract
	except ImportError:
		subprocess.check_call(
			[sys.executable, "-m", "pip", "install", "pytesseract"]
		)
		import pytesseract
	from PIL import Image, ImageEnhance, ImageFilter

	def clean_image(captcha_image, chop=8):
		"""
		clean_image(PIL_IMAGE, [optionally chop]) -> True

		Removes thin stripes & grid lines from a black-and-white image, in place.

		The image is scanned row by row and then column by column. Every
		unbroken run of dark pixels (value < 128) that is at most `chop` pixels
		long is painted white (255); longer runs are left untouched.
		The default value of chop is 8.

		captcha_image only needs a `size` attribute giving (width, height) and
		a `load()` method returning an object indexable as data[x, y].
		Always returns True.
		"""
		width, height = captcha_image.size
		data = captcha_image.load()

		# Horizontal pass: erase short dark runs within each row.
		for y in range(height):
			x = 0
			while x < width:
				if data[x, y] > 128:
					x += 1
					continue
				run = 0
				while x + run < width and data[x + run, y] < 128:
					run += 1
				if run <= chop:
					for c in range(x, x + run):
						data[c, y] = 255
				# Jump past the whole run so its tail is not measured again as a
				# shorter run and erased. A pixel of exactly 128 is neither
				# skipped nor counted (run == 0), so advance by at least one.
				x += max(run, 1)

		# Vertical pass: the same treatment for each column.
		for x in range(width):
			y = 0
			while y < height:
				if data[x, y] > 128:
					y += 1
					continue
				run = 0
				while y + run < height and data[x, y + run] < 128:
					run += 1
				if run <= chop:
					for c in range(y, y + run):
						data[x, c] = 255
				y += max(run, 1)
		return True

	def get_captcha_text(url, proxies=None, timeout=30):
		"""
		get_captcha_text(url, [proxies], [timeout]) -> str

		Downloads the captcha image at `url`, applies median filtering,
		contrast enhancement and black-and-white conversion, calls clean_image
		to remove the remaining stripes & grids, and runs Tesseract on the
		result.

		url     -- address of the captcha image.
		proxies -- optional requests-style proxy mapping, e.g.
		           {"http": "http://host:port", "https": "http://host:port"}.
		timeout -- seconds to wait for the server before giving up.

		Returns the recognised text with everything except alphanumeric
		characters and spaces stripped out.
		Raises requests.RequestException if the download fails, times out or
		the server answers with an HTTP error status.

		Side effect: the processed image is written to 'captcha.png' in the
		current working directory.
		"""
		from io import BytesIO

		# proxies=None behaves exactly like omitting the argument, so a single
		# call covers both cases. The body is read fully (and content-decoded)
		# instead of handing PIL the undecoded raw socket stream.
		response = requests.get(url, proxies=proxies, timeout=timeout)
		response.raise_for_status()

		captcha_image = Image.open(BytesIO(response.content))
		# Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
		captcha_image = captcha_image.resize((1000, 200), Image.LANCZOS)
		for _ in range(5):
			captcha_image = captcha_image.filter(ImageFilter.MedianFilter())

		# First round: boost contrast, reduce to 1-bit and strip thin lines.
		captcha_image = ImageEnhance.Contrast(captcha_image).enhance(9)
		captcha_image = captcha_image.convert('1')
		clean_image(captcha_image)

		# Second round: a sharpness factor of 0 blurs the image, after which it
		# is contrast-boosted, reduced to 1-bit and cleaned once more.
		captcha_image = captcha_image.convert('RGBA')
		captcha_image = ImageEnhance.Sharpness(captcha_image).enhance(0)
		captcha_image = ImageEnhance.Contrast(captcha_image).enhance(4)
		captcha_image = captcha_image.convert('1')
		clean_image(captcha_image)
		captcha_image.save('captcha.png')

		# Pass the in-memory image directly; reopening the saved file left an
		# unclosed file handle behind.
		text = pytesseract.image_to_string(captcha_image)
		return "".join(ch for ch in text if ch.isalnum() or ch == " ")

	# Demo: solve the sample captcha. The call is guarded so that importing this
	# module does not trigger a download and an OCR run.
	url = "https://i.imgur.com/4u7PESk.png"
	if __name__ == "__main__":
		print(get_captcha_text(url))