# macabeus/breakCaptcha.py

## breakCaptcha.py
# Download the captcha images: https://www.dropbox.com/s/6dbo7k4vuqlqiqr/captchar.zip
# Blog post about this code (in Brazilian Portuguese): http://localhost:8000/output/estudo-de-caso-quebrando-um-captcha.html

import copy
import numpy as np
import cv2 as cv
import pyslibtesseract

# OCR configuration shared by every recognition call in this file: the page
# segmentation mode is PSM_SINGLE_CHAR and the character whitelist holds the
# 26 uppercase Latin letters (written here in QWERTY keyboard order).
tesseract_config = pyslibtesseract.TesseractConfig(
    psm=pyslibtesseract.PageSegMode.PSM_SINGLE_CHAR,
)
tesseract_config.add_variable(
    'tessedit_char_whitelist',
    'QWERTYUIOPASDFGHJKLZXCVBNM',
)

class Steps:
    """Ordered record of intermediate images, displayed side by side by ``show``.

    Every entry of ``steps`` is a tuple ``(x, index, image)``: ``x`` is the
    column the image is drawn in, ``index`` is the number of entries that were
    recorded before it, and ``image`` is a copy of what was passed to
    ``append``.
    """

    def __init__(self):
        self.steps = []

    def append(self, current, x):
        """Record a copy of ``current`` to be drawn in column ``x``."""
        # The insertion index sits between the column and the image so that
        # comparing two entries (sorted/max) is always decided before the
        # image itself would have to be compared, which would fail.
        self.steps.append((x, len(self), copy.copy(current)))

    def _compose(self, height, width, padding=5):
        """Return the canvas holding every recorded image.

        Images that share a column are stacked top to bottom in insertion
        order; columns are ``padding`` pixels apart. A band of ``height``
        extra rows is left empty at the bottom for a caption.

        Args:
            height: Height in pixels of each recorded image.
            width: Width in pixels of each recorded image.
            padding: Horizontal gap in pixels between neighbouring columns.

        Returns:
            A ``uint8`` array of shape ``(rows, cols, 3)``; pixels not covered
            by an image are zero.
        """
        from collections import Counter

        columns = [entry[0] for entry in self.steps]
        # `default=0` keeps an empty recorder from raising: the canvas then
        # consists of the caption band only.
        tallest = max(Counter(columns).values(), default=0)
        last_column = max(columns, default=0)

        # The width is derived from the highest column index (not from the
        # number of distinct columns) because that is what the placement
        # below uses; otherwise non-contiguous columns would not fit.
        canvas = np.zeros(
            (height * tallest + height,
             (width + padding) * last_column + width,
             3),
            np.uint8,
        )

        next_row = Counter()
        for column, _, image in self.steps:
            top = next_row[column] * height
            left = column * (width + padding)
            canvas[top:top + height, left:left + width] = image
            next_row[column] += 1

        return canvas

    def show(self, height, width, text='?', title='foo'):
        """Display every recorded image in one window and wait for a key press.

        Args:
            height: Height in pixels of each recorded image.
            width: Width in pixels of each recorded image.
            text: Caption drawn in the band below the images.
            title: Title of the window.
        """
        canvas = self._compose(height, width)

        # The caption baseline sits 20 px above the bottom edge of the canvas.
        cv.putText(canvas, text, (10, canvas.shape[0] - 20), cv.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255))

        cv.imshow(title, canvas)
        cv.waitKey()

    def __getitem__(self, item):
        return self.steps[item]

    def __len__(self):
        return len(self.steps)

def read_captcha(file_name):
    """Clean a captcha image, isolate its letters, OCR them and display the result.

    The intermediate images are recorded in a ``Steps`` instance and shown in
    a window titled ``file_name`` together with the recognised letters,
    ordered left to right. Each isolated letter is written to
    ``letter<N>.png`` in the current directory before being passed to
    ``rotate``.

    Args:
        file_name: Path of the captcha image to read.

    Raises:
        IOError: If ``file_name`` cannot be read as an image.
    """
    print('reading...')

    img_start = cv.imread(file_name)
    if img_start is None:
        # cv.imread reports failure by returning None instead of raising.
        raise IOError('could not read image: ' + str(file_name))
    height, width = img_start.shape[:2]

    # Blank out the top 15 rows of the image.
    img_start[:15] = 255

    steps = Steps()
    steps.append(img_start, 0)

    ###
    # Blur (morphological closing) to remove the noise.
    img = cv.morphologyEx(img_start, cv.MORPH_CLOSE, np.ones((3, 3), np.uint8))

    steps.append(img, 0)

    ###
    # Convert to gray levels, then binarise: every pixel with a value of 230
    # or below becomes black, everything else becomes white.
    img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
    img = cv.cvtColor(img, cv.COLOR_GRAY2BGR)

    dark = (img <= 230).any(axis=2)
    img[dark] = 0
    img[~dark] = 255

    steps.append(img, 0)

    ###
    # Detach the letters that touch the border: paint a 3 px white band along
    # the bottom, left and right edges.
    img[height - 3:height, :] = 255
    img[:, :3] = 255
    img[:, width - 3:] = 255

    ###
    # Erase small blobs, which can disturb tesseract. Large parts are left
    # alone because they may be an essential piece of a letter; small parts
    # usually hurt more than they help.
    # findContours returns (image, contours, hierarchy) on OpenCV 3 but
    # (contours, hierarchy) on OpenCV 2 and 4; the contour list is the
    # second-to-last item in both layouts.
    contours = cv.findContours(cv.cvtColor(img, cv.COLOR_BGR2GRAY), cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)[-2]
    circles = [cv.minEnclosingCircle(contour) for contour in contours]

    for contour, (_, radius) in zip(contours, circles):
        if radius <= 6:
            cv.drawContours(img, [contour], -1, (255, 255, 255), -1)

    steps.append(img, 0)

    ###
    # Isolate each letter: a contour whose enclosing circle has a radius
    # strictly between 16 and 30 is treated as one letter.
    letters = []

    img_circulada = copy.copy(img)
    for (x, y), radius in circles:
        if 16 < radius < 30:
            print(radius)

            # minEnclosingCircle yields floats; slice bounds must be integers.
            center_x, center_y = int(x), int(y)
            radius_int = int(radius + 15)

            x_min = max(center_x - radius_int, 0)
            y_min = max(center_y - radius_int, 0)
            letters.append((x, img[y_min:center_y + radius_int, x_min:center_x + radius_int]))
            cv.circle(img_circulada, (center_x, center_y), radius_int + 1, (0, 255, 0), 1)

    steps.append(img_circulada, 0)

    ###
    # Read the isolated letters.
    letters_out = []
    for loop, (x, current_letter) in enumerate(letters):
        # Save the crop so that rotate() can load it from disk.
        letter_file = 'letter' + str(loop) + '.png'
        cv.imwrite(letter_file, current_letter)

        new_char = rotate(letter_file)[0]
        # An empty or blank reading is shown as '?'.
        if len(new_char) and new_char[0] != ' ':
            shown = new_char[0]
        else:
            shown = '?'
        letters_out.append((x, shown))

    # Order the letters left to right by the x coordinate of their circle.
    letters_out.sort()
    letters_only = [char for _, char in letters_out]
    steps.show(height, width, text=''.join(letters_only), title=file_name)

def _ocr_at_angle(img, file_name, step):
    """OCR ``img`` rotated by ``10 * step`` degrees around its centre.

    The rotated image is written to ``str(step) + file_name`` and that file
    is handed to tesseract.

    Returns:
        The first item returned by ``read_and_get_confidence_char`` (indexed
        by the caller as ``[0]`` = character, ``[1]`` = confidence), or
        ``None`` when nothing was read.
    """
    rows, cols = img.shape[:2]
    matrix = cv.getRotationMatrix2D((cols / 2, rows / 2), 10 * step, 1)
    dst = cv.warpAffine(img, matrix, (cols, rows))

    # Flood-fill with white from every pixel of the four borders.
    # NOTE(review): presumably this clears the dark corners that the rotation
    # introduces -- confirm. The mask is shared by all calls, so an area that
    # was already filled is not filled again.
    white = (255, 255, 255)
    tolerance = (200, 200, 200)
    mask = np.zeros((rows + 2, cols + 2), np.uint8)
    for h in range(rows):
        cv.floodFill(dst, mask, (cols - 1, h), white, upDiff=tolerance)
        cv.floodFill(dst, mask, (0, h), white, upDiff=tolerance)

    for w in range(cols):
        cv.floodFill(dst, mask, (w, 0), white, upDiff=tolerance)
        cv.floodFill(dst, mask, (w, rows - 1), white, upDiff=tolerance)

    rotated_file = str(step) + file_name
    cv.imwrite(rotated_file, dst)

    found = pyslibtesseract.LibTesseract.read_and_get_confidence_char(tesseract_config, rotated_file)
    if len(found) == 0:
        return None
    return found[0]


def _update_best(best, guess):
    """Overwrite ``best`` in place when ``guess`` is non-blank and more confident."""
    if guess[0] != ' ' and best[1] < guess[1]:
        best[0] = guess[0]
        best[1] = guess[1]


def rotate(file_name):
    """Recognise the letter stored in ``file_name``, trying several rotations.

    A first pass tries -10, 0 and +10 degrees. A second pass from -50 to +50
    degrees in steps of 10 runs when the best confidence so far is below 60,
    or when the last reading of the first pass differed by more than 3 from
    the best confidence known at that moment.

    Args:
        file_name: Path of the image holding a single letter. The rotated
            variants are written next to it as ``str(step) + file_name``.

    Returns:
        A two-item list ``[char, confidence]``; ``[' ', 0]`` when no rotation
        produced a usable reading.

    Raises:
        IOError: If ``file_name`` cannot be read as an image.
    """
    img = cv.imread(file_name)
    if img is None:
        # cv.imread reports failure by returning None instead of raising.
        raise IOError('could not read image: ' + str(file_name))

    most_confidence = [' ', 0]
    again = False

    for step in range(-1, 2):
        guess = _ocr_at_angle(img, file_name, step)
        if guess is None:
            continue
        # Compared against the best confidence *before* this reading is
        # taken into account; only the last reading's verdict survives.
        again = not (most_confidence[1] - 3 <= guess[1] <= most_confidence[1] + 3)
        _update_best(most_confidence, guess)

    if most_confidence[1] < 60 or again:
        for step in range(-5, 6):
            guess = _ocr_at_angle(img, file_name, step)
            if guess is not None:
                _update_best(most_confidence, guess)

    return most_confidence

# Process captcha0.jpg through captcha30.jpg, one window per image.
for i in range(31):
    read_captcha(''.join(['captcha', str(i), '.jpg']))
	# Download the captcha images: https://www.dropbox.com/s/6dbo7k4vuqlqiqr/captchar.zip
	# Blog post about this code (in Brazilian Portuguese): http://localhost:8000/output/estudo-de-caso-quebrando-um-captcha.html

	import copy
	import numpy as np
	import cv2 as cv
	import pyslibtesseract

	# OCR configuration shared by every recognition call in this file: the page
	# segmentation mode is PSM_SINGLE_CHAR and the character whitelist holds the
	# 26 uppercase Latin letters (written here in QWERTY keyboard order).
	tesseract_config = pyslibtesseract.TesseractConfig(
	    psm=pyslibtesseract.PageSegMode.PSM_SINGLE_CHAR,
	)
	tesseract_config.add_variable(
	    'tessedit_char_whitelist',
	    'QWERTYUIOPASDFGHJKLZXCVBNM',
	)

	class Steps:
	    """Ordered record of intermediate images, displayed side by side by ``show``.

	    Every entry of ``steps`` is a tuple ``(x, index, image)``: ``x`` is the
	    column the image is drawn in, ``index`` is the number of entries that were
	    recorded before it, and ``image`` is a copy of what was passed to
	    ``append``.
	    """

	    def __init__(self):
	        self.steps = []

	    def append(self, current, x):
	        """Record a copy of ``current`` to be drawn in column ``x``."""
	        # The insertion index sits between the column and the image so that
	        # comparing two entries (sorted/max) is always decided before the
	        # image itself would have to be compared, which would fail.
	        self.steps.append((x, len(self), copy.copy(current)))

	    def _compose(self, height, width, padding=5):
	        """Return the canvas holding every recorded image.

	        Images that share a column are stacked top to bottom in insertion
	        order; columns are ``padding`` pixels apart. A band of ``height``
	        extra rows is left empty at the bottom for a caption.

	        Args:
	            height: Height in pixels of each recorded image.
	            width: Width in pixels of each recorded image.
	            padding: Horizontal gap in pixels between neighbouring columns.

	        Returns:
	            A ``uint8`` array of shape ``(rows, cols, 3)``; pixels not covered
	            by an image are zero.
	        """
	        from collections import Counter

	        columns = [entry[0] for entry in self.steps]
	        # `default=0` keeps an empty recorder from raising: the canvas then
	        # consists of the caption band only.
	        tallest = max(Counter(columns).values(), default=0)
	        last_column = max(columns, default=0)

	        # The width is derived from the highest column index (not from the
	        # number of distinct columns) because that is what the placement
	        # below uses; otherwise non-contiguous columns would not fit.
	        canvas = np.zeros(
	            (height * tallest + height,
	             (width + padding) * last_column + width,
	             3),
	            np.uint8,
	        )

	        next_row = Counter()
	        for column, _, image in self.steps:
	            top = next_row[column] * height
	            left = column * (width + padding)
	            canvas[top:top + height, left:left + width] = image
	            next_row[column] += 1

	        return canvas

	    def show(self, height, width, text='?', title='foo'):
	        """Display every recorded image in one window and wait for a key press.

	        Args:
	            height: Height in pixels of each recorded image.
	            width: Width in pixels of each recorded image.
	            text: Caption drawn in the band below the images.
	            title: Title of the window.
	        """
	        canvas = self._compose(height, width)

	        # The caption baseline sits 20 px above the bottom edge of the canvas.
	        cv.putText(canvas, text, (10, canvas.shape[0] - 20), cv.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255))

	        cv.imshow(title, canvas)
	        cv.waitKey()

	    def __getitem__(self, item):
	        return self.steps[item]

	    def __len__(self):
	        return len(self.steps)

	def read_captcha(file_name):
	    """Clean a captcha image, isolate its letters, OCR them and display the result.

	    The intermediate images are recorded in a ``Steps`` instance and shown in
	    a window titled ``file_name`` together with the recognised letters,
	    ordered left to right. Each isolated letter is written to
	    ``letter<N>.png`` in the current directory before being passed to
	    ``rotate``.

	    Args:
	        file_name: Path of the captcha image to read.

	    Raises:
	        IOError: If ``file_name`` cannot be read as an image.
	    """
	    print('reading...')

	    img_start = cv.imread(file_name)
	    if img_start is None:
	        # cv.imread reports failure by returning None instead of raising.
	        raise IOError('could not read image: ' + str(file_name))
	    height, width = img_start.shape[:2]

	    # Blank out the top 15 rows of the image.
	    img_start[:15] = 255

	    steps = Steps()
	    steps.append(img_start, 0)

	    ###
	    # Blur (morphological closing) to remove the noise.
	    img = cv.morphologyEx(img_start, cv.MORPH_CLOSE, np.ones((3, 3), np.uint8))

	    steps.append(img, 0)

	    ###
	    # Convert to gray levels, then binarise: every pixel with a value of 230
	    # or below becomes black, everything else becomes white.
	    img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
	    img = cv.cvtColor(img, cv.COLOR_GRAY2BGR)

	    dark = (img <= 230).any(axis=2)
	    img[dark] = 0
	    img[~dark] = 255

	    steps.append(img, 0)

	    ###
	    # Detach the letters that touch the border: paint a 3 px white band along
	    # the bottom, left and right edges.
	    img[height - 3:height, :] = 255
	    img[:, :3] = 255
	    img[:, width - 3:] = 255

	    ###
	    # Erase small blobs, which can disturb tesseract. Large parts are left
	    # alone because they may be an essential piece of a letter; small parts
	    # usually hurt more than they help.
	    # findContours returns (image, contours, hierarchy) on OpenCV 3 but
	    # (contours, hierarchy) on OpenCV 2 and 4; the contour list is the
	    # second-to-last item in both layouts.
	    contours = cv.findContours(cv.cvtColor(img, cv.COLOR_BGR2GRAY), cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)[-2]
	    circles = [cv.minEnclosingCircle(contour) for contour in contours]

	    for contour, (_, radius) in zip(contours, circles):
	        if radius <= 6:
	            cv.drawContours(img, [contour], -1, (255, 255, 255), -1)

	    steps.append(img, 0)

	    ###
	    # Isolate each letter: a contour whose enclosing circle has a radius
	    # strictly between 16 and 30 is treated as one letter.
	    letters = []

	    img_circulada = copy.copy(img)
	    for (x, y), radius in circles:
	        if 16 < radius < 30:
	            print(radius)

	            # minEnclosingCircle yields floats; slice bounds must be integers.
	            center_x, center_y = int(x), int(y)
	            radius_int = int(radius + 15)

	            x_min = max(center_x - radius_int, 0)
	            y_min = max(center_y - radius_int, 0)
	            letters.append((x, img[y_min:center_y + radius_int, x_min:center_x + radius_int]))
	            cv.circle(img_circulada, (center_x, center_y), radius_int + 1, (0, 255, 0), 1)

	    steps.append(img_circulada, 0)

	    ###
	    # Read the isolated letters.
	    letters_out = []
	    for loop, (x, current_letter) in enumerate(letters):
	        # Save the crop so that rotate() can load it from disk.
	        letter_file = 'letter' + str(loop) + '.png'
	        cv.imwrite(letter_file, current_letter)

	        new_char = rotate(letter_file)[0]
	        # An empty or blank reading is shown as '?'.
	        if len(new_char) and new_char[0] != ' ':
	            shown = new_char[0]
	        else:
	            shown = '?'
	        letters_out.append((x, shown))

	    # Order the letters left to right by the x coordinate of their circle.
	    letters_out.sort()
	    letters_only = [char for _, char in letters_out]
	    steps.show(height, width, text=''.join(letters_only), title=file_name)

	def _ocr_at_angle(img, file_name, step):
	    """OCR ``img`` rotated by ``10 * step`` degrees around its centre.

	    The rotated image is written to ``str(step) + file_name`` and that file
	    is handed to tesseract.

	    Returns:
	        The first item returned by ``read_and_get_confidence_char`` (indexed
	        by the caller as ``[0]`` = character, ``[1]`` = confidence), or
	        ``None`` when nothing was read.
	    """
	    rows, cols = img.shape[:2]
	    matrix = cv.getRotationMatrix2D((cols / 2, rows / 2), 10 * step, 1)
	    dst = cv.warpAffine(img, matrix, (cols, rows))

	    # Flood-fill with white from every pixel of the four borders.
	    # NOTE(review): presumably this clears the dark corners that the rotation
	    # introduces -- confirm. The mask is shared by all calls, so an area that
	    # was already filled is not filled again.
	    white = (255, 255, 255)
	    tolerance = (200, 200, 200)
	    mask = np.zeros((rows + 2, cols + 2), np.uint8)
	    for h in range(rows):
	        cv.floodFill(dst, mask, (cols - 1, h), white, upDiff=tolerance)
	        cv.floodFill(dst, mask, (0, h), white, upDiff=tolerance)

	    for w in range(cols):
	        cv.floodFill(dst, mask, (w, 0), white, upDiff=tolerance)
	        cv.floodFill(dst, mask, (w, rows - 1), white, upDiff=tolerance)

	    rotated_file = str(step) + file_name
	    cv.imwrite(rotated_file, dst)

	    found = pyslibtesseract.LibTesseract.read_and_get_confidence_char(tesseract_config, rotated_file)
	    if len(found) == 0:
	        return None
	    return found[0]


	def _update_best(best, guess):
	    """Overwrite ``best`` in place when ``guess`` is non-blank and more confident."""
	    if guess[0] != ' ' and best[1] < guess[1]:
	        best[0] = guess[0]
	        best[1] = guess[1]


	def rotate(file_name):
	    """Recognise the letter stored in ``file_name``, trying several rotations.

	    A first pass tries -10, 0 and +10 degrees. A second pass from -50 to +50
	    degrees in steps of 10 runs when the best confidence so far is below 60,
	    or when the last reading of the first pass differed by more than 3 from
	    the best confidence known at that moment.

	    Args:
	        file_name: Path of the image holding a single letter. The rotated
	            variants are written next to it as ``str(step) + file_name``.

	    Returns:
	        A two-item list ``[char, confidence]``; ``[' ', 0]`` when no rotation
	        produced a usable reading.

	    Raises:
	        IOError: If ``file_name`` cannot be read as an image.
	    """
	    img = cv.imread(file_name)
	    if img is None:
	        # cv.imread reports failure by returning None instead of raising.
	        raise IOError('could not read image: ' + str(file_name))

	    most_confidence = [' ', 0]
	    again = False

	    for step in range(-1, 2):
	        guess = _ocr_at_angle(img, file_name, step)
	        if guess is None:
	            continue
	        # Compared against the best confidence *before* this reading is
	        # taken into account; only the last reading's verdict survives.
	        again = not (most_confidence[1] - 3 <= guess[1] <= most_confidence[1] + 3)
	        _update_best(most_confidence, guess)

	    if most_confidence[1] < 60 or again:
	        for step in range(-5, 6):
	            guess = _ocr_at_angle(img, file_name, step)
	            if guess is not None:
	                _update_best(most_confidence, guess)

	    return most_confidence

	# Process captcha0.jpg through captcha30.jpg, one window per image.
	for i in range(31):
	    read_captcha(''.join(['captcha', str(i), '.jpg']))