fta2012/solver.py Secret

## solver.py
import scipy.misc
from os import listdir
from collections import defaultdict

def extract_digits(img):
    """Segment an image into connected blobs of "ink"-coloured pixels.

    A pixel counts as ink when the sum of its channel values lies strictly
    between 400 and 570.  Ink pixels are grouped with a flood fill that
    joins pixels up to two steps apart in either direction (a 5x5
    neighbourhood), so small gaps inside a glyph do not split it.

    Args:
        img: Indexable as ``img[row][col]``; each pixel is an iterable of
            numeric channel values.

    Returns:
        A list of regions, ordered by where their first pixel is met when
        scanning columns by increasing index (rows top to bottom within a
        column).  Each region is a list of ``(row, col)`` tuples holding
        more than five pixels; smaller blobs are dropped as noise.  An
        image with no rows or no columns yields an empty list.
    """
    # Threshold color.  int() keeps the channel sum from wrapping when the
    # channels are fixed-width integers (e.g. NumPy uint8 scalars).
    ink = [[400 < sum(int(channel) for channel in cell) < 570 for cell in row]
           for row in img]
    if not ink or not ink[0]:
        return []
    height = len(ink)
    width = len(ink[0])

    # Every offset in the 5x5 window around a pixel, except the pixel itself.
    offsets = [(dx, dy)
               for dx in (-2, -1, 0, 1, 2)
               for dy in (-2, -1, 0, 1, 2)
               if (dx, dy) != (0, 0)]

    # Find contiguous regions.  Only ink pixels are used as seeds: background
    # regions would be discarded anyway, so flooding them is wasted work.
    flooded = set()
    digits = []
    for y in range(width):
        for x in range(height):
            if not ink[x][y] or (x, y) in flooded:
                continue
            flooded.add((x, y))
            region = [(x, y)]
            pos = 0
            # `region` doubles as the breadth-first queue; `pos` is its head.
            while pos < len(region):
                xx, yy = region[pos]
                pos += 1
                for dx, dy in offsets:
                    nx, ny = xx + dx, yy + dy
                    if (0 <= nx < height and
                            0 <= ny < width and
                            ink[nx][ny] and
                            (nx, ny) not in flooded):
                        region.append((nx, ny))
                        flooded.add((nx, ny))
            # Filter out specks too small to be a character.
            if len(region) > 5:
                digits.append(region)

    return digits

N = 25  # side length of the square canvas every region is drawn onto


def normalized_image(pixels):
    """Draw a region onto an N x N binary canvas, anchored at the top-left.

    The region is translated so that its smallest first coordinate and
    smallest second coordinate both land on index 0.

    Args:
        pixels: Sequence of ``(x, y)`` integer pairs.

    Returns:
        An N x N list of lists holding 1 where the translated region has a
        pixel and 0 elsewhere.  Pixels that fall outside the canvas (regions
        wider or taller than N) are cropped rather than raising IndexError,
        and an empty region yields an all-zero canvas.
    """
    canvas = [[0] * N for _ in range(N)]
    if not pixels:
        return canvas
    min_x = min(x for x, _ in pixels)
    min_y = min(y for _, y in pixels)
    for x, y in pixels:
        row = x - min_x
        col = y - min_y
        # Offsets are never negative, so only the upper bound needs checking.
        if row < N and col < N:
            canvas[row][col] = 1
    return canvas

# Load the labeled captchas and separate them into digits
directory = 'captcha-corpus'
labeled_digits = []
digit_areas = defaultdict(list)
print 'Training'
for filename in listdir(directory)[500:1000]:
    label = filename.split('.')[0]
    digits = extract_digits(scipy.misc.imread(directory + '/' + filename))
    if len(label) != len(digits):
        print 'skipped labeled image', label
        continue

    for i, digit in enumerate(digits):
        img = normalized_image(digit)
        labeled_digits.append((label[i], img))
        digit_areas[label[i]].append(sum(val for row in img for val in row))
        #scipy.misc.imsave('digit_{:s}_{:s}_{:s}.png'.format(label[i], str(i), label), img)
average_digit_area = {k : (sum(v) / len(v)) for k, v in digit_areas.iteritems()}

def _closest_label(img, candidates):
    """Return the label of the template in ``candidates`` nearest to ``img``.

    Distance is the number of cells of the two N x N grids that differ; ties
    go to the smallest label.  Returns None when ``candidates`` is empty.
    """
    best = None
    for num, labeled_digit in candidates:
        dist = sum(img[x][y] != labeled_digit[x][y]
                   for x in range(N) for y in range(N))
        if best is None or (dist, num) < best:
            best = (dist, num)
    return None if best is None else best[1]


def solve_for_image(image):
    """Guess the text of a captcha image by nearest-template matching.

    The image is segmented with extract_digits(); each region is normalised
    and compared against the templates in ``labeled_digits``.

    Args:
        image: Pixel data in the form accepted by extract_digits().

    Returns:
        A string with one character per segmented region, in segmentation
        order.  A region contributes '?' only when there are no templates
        at all.
    """
    guess = ''
    for digit in extract_digits(image):
        img = normalized_image(digit)
        area = len(digit)
        # Apply the cheap area filter first so the per-cell distance is only
        # computed for templates whose character has a comparable mean area.
        plausible = [(num, template) for num, template in labeled_digits
                     if abs(average_digit_area[num] - area) < 20]
        label = _closest_label(img, plausible)
        if label is None:
            # Nothing passed the area filter.  Fall back to the nearest
            # template overall instead of appending the text "None".
            label = _closest_label(img, labeled_digits)
        guess += '?' if label is None else str(label)

    return guess

def solve_for_filename(filename):
    """Read the image stored at ``filename`` and return solve_for_image()'s guess for it."""
    # NOTE(review): scipy.misc.imread is not available in recent SciPy
    # releases -- confirm the SciPy version this script is run with.
    pixels = scipy.misc.imread(filename)
    return solve_for_image(pixels)

if __name__ == '__main__':
    print 'Classifying'
    count = 100
    wrong = 0
    for filename in listdir(directory)[:count]:
        label = filename.split('.')[0]
        predicted = solve_for_filename(directory + '/' + filename)
        if label != predicted:
            print predicted, label
            wrong += 1
        else:
            print predicted
    print wrong, count, float(wrong) / count

# Output:
'''
Training
skipped labeled image flex495
skipped labeled image galil703
skipped labeled image geria229
skipped labeled image germi850
skipped labeled image gibbs901
skipped labeled image heirl278
skipped labeled image hoop19
skipped labeled image house553
skipped labeled image human970
skipped labeled image hyeni569
skipped labeled image inani632
skipped labeled image indep415
skipped labeled image integ467
skipped labeled image inter28
skipped labeled image iodid634
skipped labeled image jacul766
skipped labeled image jujit844
skipped labeled image kimch758
skipped labeled image lean745
skipped labeled image lochi371
skipped labeled image objec165
skipped labeled image orali875
skipped labeled image peaki92
Classifying
abate328
abett394
abide670
abort213
acces203
acces252
accou196
accus213
aceph370
aceti517
achen516
acnui284 acqui284
acuui757 acqui757
acrit320
acule603
acupr469
acupu215
acyla825
adjus319
adjus503
admix577 admix572
adsor290
aethe775
afflu996
aflat546
agend64h agend645
agitp88
agrcee712 agree712
agrib896
aibsh534 airsh534
album788
alche973
alcoh292
aldol715
alexa391
alfak517
algit803 algid803
alien756
alkal541
allay633
aller122
allia64
allcit32 allit32
allot410
allox886
almuc251
aluni852
amara709
ambul882
amicda540 amida540
among722
amora5
anago1
anale229
arale459 anale459
anape570
anath743
anhyh81l anhyd811
anima325
aniso450
aniso807
ankle568
antep209
antho948
anthr894
antlp634 antip634
antip878
antis421
antit231
apert699
apica644
appan708
apply523
aprax38
apsid497
aptit976 aptit972
aouat983 aquat983
arbor305
archi14l archi141
arena138
argen369
argil750
jargo822 argo822
argue538
armba582
armfn381 armfu381
aroma802
arpen264
arriv25
arsen500
arter107
ashla667
assai305
assen935
asser551
astra488
ating951
attac495
attai758
attri573
17 100 0.17
'''
	import scipy.misc
	from os import listdir
	from collections import defaultdict

	def extract_digits(img):
	    """Segment an image into connected blobs of "ink"-coloured pixels.

	    A pixel counts as ink when the sum of its channel values lies strictly
	    between 400 and 570.  Ink pixels are grouped with a flood fill that
	    joins pixels up to two steps apart in either direction (a 5x5
	    neighbourhood), so small gaps inside a glyph do not split it.

	    Args:
	        img: Indexable as ``img[row][col]``; each pixel is an iterable of
	            numeric channel values.

	    Returns:
	        A list of regions, ordered by where their first pixel is met when
	        scanning columns by increasing index (rows top to bottom within a
	        column).  Each region is a list of ``(row, col)`` tuples holding
	        more than five pixels; smaller blobs are dropped as noise.  An
	        image with no rows or no columns yields an empty list.
	    """
	    # Threshold color.  int() keeps the channel sum from wrapping when the
	    # channels are fixed-width integers (e.g. NumPy uint8 scalars).
	    ink = [[400 < sum(int(channel) for channel in cell) < 570 for cell in row]
	           for row in img]
	    if not ink or not ink[0]:
	        return []
	    height = len(ink)
	    width = len(ink[0])

	    # Every offset in the 5x5 window around a pixel, except the pixel itself.
	    offsets = [(dx, dy)
	               for dx in (-2, -1, 0, 1, 2)
	               for dy in (-2, -1, 0, 1, 2)
	               if (dx, dy) != (0, 0)]

	    # Find contiguous regions.  Only ink pixels are used as seeds: background
	    # regions would be discarded anyway, so flooding them is wasted work.
	    flooded = set()
	    digits = []
	    for y in range(width):
	        for x in range(height):
	            if not ink[x][y] or (x, y) in flooded:
	                continue
	            flooded.add((x, y))
	            region = [(x, y)]
	            pos = 0
	            # `region` doubles as the breadth-first queue; `pos` is its head.
	            while pos < len(region):
	                xx, yy = region[pos]
	                pos += 1
	                for dx, dy in offsets:
	                    nx, ny = xx + dx, yy + dy
	                    if (0 <= nx < height and
	                            0 <= ny < width and
	                            ink[nx][ny] and
	                            (nx, ny) not in flooded):
	                        region.append((nx, ny))
	                        flooded.add((nx, ny))
	            # Filter out specks too small to be a character.
	            if len(region) > 5:
	                digits.append(region)

	    return digits

	N = 25  # side length of the square canvas every region is drawn onto


	def normalized_image(pixels):
	    """Draw a region onto an N x N binary canvas, anchored at the top-left.

	    The region is translated so that its smallest first coordinate and
	    smallest second coordinate both land on index 0.

	    Args:
	        pixels: Sequence of ``(x, y)`` integer pairs.

	    Returns:
	        An N x N list of lists holding 1 where the translated region has a
	        pixel and 0 elsewhere.  Pixels that fall outside the canvas (regions
	        wider or taller than N) are cropped rather than raising IndexError,
	        and an empty region yields an all-zero canvas.
	    """
	    canvas = [[0] * N for _ in range(N)]
	    if not pixels:
	        return canvas
	    min_x = min(x for x, _ in pixels)
	    min_y = min(y for _, y in pixels)
	    for x, y in pixels:
	        row = x - min_x
	        col = y - min_y
	        # Offsets are never negative, so only the upper bound needs checking.
	        if row < N and col < N:
	            canvas[row][col] = 1
	    return canvas

	# Load the labeled captchas and separate them into digits
	directory = 'captcha-corpus'
	labeled_digits = []
	digit_areas = defaultdict(list)
	print 'Training'
	for filename in listdir(directory)[500:1000]:
	label = filename.split('.')[0]
	digits = extract_digits(scipy.misc.imread(directory + '/' + filename))
	if len(label) != len(digits):
	print 'skipped labeled image', label
	continue

	for i, digit in enumerate(digits):
	img = normalized_image(digit)
	labeled_digits.append((label[i], img))
	digit_areas[label[i]].append(sum(val for row in img for val in row))
	#scipy.misc.imsave('digit_{:s}_{:s}_{:s}.png'.format(label[i], str(i), label), img)
	average_digit_area = {k : (sum(v) / len(v)) for k, v in digit_areas.iteritems()}

	def _closest_label(img, candidates):
	    """Return the label of the template in ``candidates`` nearest to ``img``.

	    Distance is the number of cells of the two N x N grids that differ; ties
	    go to the smallest label.  Returns None when ``candidates`` is empty.
	    """
	    best = None
	    for num, labeled_digit in candidates:
	        dist = sum(img[x][y] != labeled_digit[x][y]
	                   for x in range(N) for y in range(N))
	        if best is None or (dist, num) < best:
	            best = (dist, num)
	    return None if best is None else best[1]


	def solve_for_image(image):
	    """Guess the text of a captcha image by nearest-template matching.

	    The image is segmented with extract_digits(); each region is normalised
	    and compared against the templates in ``labeled_digits``.

	    Args:
	        image: Pixel data in the form accepted by extract_digits().

	    Returns:
	        A string with one character per segmented region, in segmentation
	        order.  A region contributes '?' only when there are no templates
	        at all.
	    """
	    guess = ''
	    for digit in extract_digits(image):
	        img = normalized_image(digit)
	        area = len(digit)
	        # Apply the cheap area filter first so the per-cell distance is only
	        # computed for templates whose character has a comparable mean area.
	        plausible = [(num, template) for num, template in labeled_digits
	                     if abs(average_digit_area[num] - area) < 20]
	        label = _closest_label(img, plausible)
	        if label is None:
	            # Nothing passed the area filter.  Fall back to the nearest
	            # template overall instead of appending the text "None".
	            label = _closest_label(img, labeled_digits)
	        guess += '?' if label is None else str(label)

	    return guess

	def solve_for_filename(filename):
	    """Read the image stored at ``filename`` and return solve_for_image()'s guess for it."""
	    # NOTE(review): scipy.misc.imread is not available in recent SciPy
	    # releases -- confirm the SciPy version this script is run with.
	    pixels = scipy.misc.imread(filename)
	    return solve_for_image(pixels)

	if __name__ == '__main__':
	print 'Classifying'
	count = 100
	wrong = 0
	for filename in listdir(directory)[:count]:
	label = filename.split('.')[0]
	predicted = solve_for_filename(directory + '/' + filename)
	if label != predicted:
	print predicted, label
	wrong += 1
	else:
	print predicted
	print wrong, count, float(wrong) / count

	# Output:
	'''
	Training
	skipped labeled image flex495
	skipped labeled image galil703
	skipped labeled image geria229
	skipped labeled image germi850
	skipped labeled image gibbs901
	skipped labeled image heirl278
	skipped labeled image hoop19
	skipped labeled image house553
	skipped labeled image human970
	skipped labeled image hyeni569
	skipped labeled image inani632
	skipped labeled image indep415
	skipped labeled image integ467
	skipped labeled image inter28
	skipped labeled image iodid634
	skipped labeled image jacul766
	skipped labeled image jujit844
	skipped labeled image kimch758
	skipped labeled image lean745
	skipped labeled image lochi371
	skipped labeled image objec165
	skipped labeled image orali875
	skipped labeled image peaki92
	Classifying
	abate328
	abett394
	abide670
	abort213
	acces203
	acces252
	accou196
	accus213
	aceph370
	aceti517
	achen516
	acnui284 acqui284
	acuui757 acqui757
	acrit320
	acule603
	acupr469
	acupu215
	acyla825
	adjus319
	adjus503
	admix577 admix572
	adsor290
	aethe775
	afflu996
	aflat546
	agend64h agend645
	agitp88
	agrcee712 agree712
	agrib896
	aibsh534 airsh534
	album788
	alche973
	alcoh292
	aldol715
	alexa391
	alfak517
	algit803 algid803
	alien756
	alkal541
	allay633
	aller122
	allia64
	allcit32 allit32
	allot410
	allox886
	almuc251
	aluni852
	amara709
	ambul882
	amicda540 amida540
	among722
	amora5
	anago1
	anale229
	arale459 anale459
	anape570
	anath743
	anhyh81l anhyd811
	anima325
	aniso450
	aniso807
	ankle568
	antep209
	antho948
	anthr894
	antlp634 antip634
	antip878
	antis421
	antit231
	apert699
	apica644
	appan708
	apply523
	aprax38
	apsid497
	aptit976 aptit972
	aouat983 aquat983
	arbor305
	archi14l archi141
	arena138
	argen369
	argil750
	jargo822 argo822
	argue538
	armba582
	armfn381 armfu381
	aroma802
	arpen264
	arriv25
	arsen500
	arter107
	ashla667
	assai305
	assen935
	asser551
	astra488
	ating951
	attac495
	attai758
	attri573
	17 100 0.17
	'''