Skip to content

Instantly share code, notes, and snippets.

@fta2012
Last active August 29, 2015 14:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fta2012/034b0686897d94e74b00 to your computer and use it in GitHub Desktop.
Save fta2012/034b0686897d94e74b00 to your computer and use it in GitHub Desktop.
Quick modification of https://github.com/fta2012/ProjectEulerCaptchaSolver/blob/master/solver.py for solving the captchas found in https://github.com/mieko/sr-captcha/blob/gh-pages/captcha-corpus.tar.bz2. Read every 'digit' in the code as 'character': the captcha the original solver was written for contained only digits, whereas these captchas also contain letters.
import scipy.misc
from os import listdir
from collections import defaultdict
def extract_digits(img):
    """Segment a captcha image into connected groups of "ink" pixels.

    ("digit" here means any character; the name is kept from the
    digits-only solver this code was adapted from.)

    A pixel counts as ink when the sum of its channel values lies strictly
    between 400 and 570. Ink pixels are grouped by flood fill over a 5x5
    neighbourhood (offsets of up to 2 in each axis), so strokes separated
    by a one-pixel gap still end up in the same region. Regions of 5 pixels
    or fewer are discarded as noise.

    Args:
        img: rows of pixels, each pixel an iterable of channel values
            (nested lists or an array that iterates the same way).

    Returns:
        A list of regions. Each region is a list of ``(row, col)`` tuples in
        flood-fill order. Regions appear in the order their first pixel is
        met when scanning column by column (left to right), top to bottom
        within a column. An empty image yields an empty list.
    """
    # Threshold colour. Channels are converted to int before summing so that
    # fixed-width pixel values (e.g. uint8 scalars, which under NumPy 2
    # promotion rules stay uint8 when added) cannot wrap around.
    ink = [[400 < sum(int(channel) for channel in cell) < 570 for cell in row]
           for row in img]
    height = len(ink)
    width = len(ink[0]) if height else 0

    # Every offset of the 5x5 window except the centre, in the same
    # dx-major order the fill has always used (keeps pixel order stable).
    reach = (-2, -1, 0, 1, 2)
    offsets = [(dx, dy) for dx in reach for dy in reach if dx or dy]

    # Find contiguous regions. Only ink pixels are used as seeds: background
    # regions were always thrown away, so flooding them was wasted work.
    flooded = set()
    digits = []
    for y in range(width):
        for x in range(height):
            if not ink[x][y] or (x, y) in flooded:
                continue
            flooded.add((x, y))
            region = [(x, y)]
            pos = 0
            # `region` doubles as the work queue: pixels appended below are
            # visited on later iterations of this loop.
            while pos < len(region):
                xx, yy = region[pos]
                pos += 1
                for dx, dy in offsets:
                    nx, ny = xx + dx, yy + dy
                    if (0 <= nx < height and
                            0 <= ny < width and
                            ink[nx][ny] and
                            (nx, ny) not in flooded):
                        region.append((nx, ny))
                        flooded.add((nx, ny))
            # Filter out specks too small to be a character.
            if len(region) > 5:
                digits.append(region)
    return digits
N = 25  # side length of the square bitmap every region is copied into


def normalized_image(pixels):
    """Render a region as an N x N bitmap anchored at its top-left corner.

    The region is translated so that its smallest row and column indices
    map to ``[0][0]``; no scaling is performed.

    Args:
        pixels: a re-iterable collection of ``(x, y)`` coordinate pairs,
            such as one region returned by ``extract_digits``.

    Returns:
        A list of N lists of N ints: 1 where the region has a pixel, 0
        elsewhere. Pixels that fall outside the N x N canvas after
        translation are dropped instead of raising IndexError, and an
        empty region yields an all-zero bitmap.
    """
    img = [[0] * N for _ in range(N)]
    if not pixels:
        return img
    min_x = min(x for x, _ in pixels)
    min_y = min(y for _, y in pixels)
    for x, y in pixels:
        row, col = x - min_x, y - min_y
        # Offsets are never negative (the minima were subtracted), so only
        # the upper bound needs checking.
        if row < N and col < N:
            img[row][col] = 1
    return img
# Build the training set: split each labelled captcha into its characters
# and remember every character bitmap together with its label.
directory = 'captcha-corpus'
labeled_digits = []               # (label character, N x N bitmap) pairs
digit_areas = defaultdict(list)   # label character -> set-pixel counts seen
print('Training')
# NOTE(review): listdir() makes no promise about ordering, so which files
# land in this slice depends on the platform/filesystem.
for filename in listdir(directory)[500:1000]:
    # The expected text is the part of the file name before the first dot.
    label = filename.split('.')[0]
    path = directory + '/' + filename
    digits = extract_digits(scipy.misc.imread(path))
    if len(digits) != len(label):
        # Segmentation found a different number of regions than there are
        # characters, so regions cannot be paired with labels.
        print(' '.join(('skipped labeled image', label)))
        continue
    for character, digit in zip(label, digits):
        img = normalized_image(digit)
        labeled_digits.append((character, img))
        digit_areas[character].append(sum(sum(row) for row in img))
        #scipy.misc.imsave('digit_{:s}_{:s}_{:s}.png'.format(label[i], str(i), label), img)
# Integer (floored) mean pixel count per label character.
average_digit_area = {}
for character, areas in digit_areas.items():
    average_digit_area[character] = sum(areas) // len(areas)
def solve_for_image(image):
    """Return the text guessed for an already loaded captcha image.

    The image is segmented with ``extract_digits``; each region is rendered
    with ``normalized_image`` and matched against the training samples in
    ``labeled_digits`` by nearest neighbour on Hamming distance (the number
    of differing bitmap cells). Ties go to the smaller label.

    Only samples whose label has an average area within 20 pixels of the
    region's pixel count are considered. If that filter rejects every
    sample, the nearest sample overall is used, so that the text "None" is
    never spliced into the result. With no training samples at all the
    region is reported as '?'.

    Args:
        image: pixel data in the form accepted by ``extract_digits``.

    Returns:
        A string with one character per detected region, in the order the
        regions were returned by ``extract_digits``.
    """
    guess = []
    for digit in extract_digits(image):
        img = normalized_image(digit)
        area = len(digit)
        # Apply the cheap area filter first so that bitmap distances are
        # only computed for plausible candidates.
        plausible = [(num, labeled_digit)
                     for num, labeled_digit in labeled_digits
                     if abs(average_digit_area[num] - area) < 20]
        candidates = plausible or labeled_digits
        if not candidates:
            guess.append('?')
            continue
        best = min(
            (sum(a != b
                 for img_row, ref_row in zip(img, labeled_digit)
                 for a, b in zip(img_row, ref_row)),
             num)
            for num, labeled_digit in candidates)
        guess.append(str(best[1]))
    return ''.join(guess)
def solve_for_filename(filename):
    """Load the captcha image stored at *filename* and return the guessed text."""
    pixels = scipy.misc.imread(filename)
    return solve_for_image(pixels)
if __name__ == '__main__':
    # Evaluate the solver on the first `count` files of the corpus listing
    # (training used entries 500-999 of the same listing).
    print('Classifying')
    count = 100
    wrong = 0
    # NOTE(review): listdir() order is arbitrary; this relies on it matching
    # the order seen during training so the two slices stay disjoint.
    filenames = listdir(directory)[:count]
    # Score against the number of files actually evaluated, which is smaller
    # than requested when the corpus holds fewer files.
    count = len(filenames)
    for filename in filenames:
        label = filename.split('.')[0]
        predicted = solve_for_filename(directory + '/' + filename)
        if label != predicted:
            # Misses are printed as "<guess> <expected>".
            print('%s %s' % (predicted, label))
            wrong += 1
        else:
            print(predicted)
    # Summary: misses, files evaluated, error rate (0.0 when nothing ran).
    error_rate = float(wrong) / count if count else 0.0
    print('%s %s %s' % (wrong, count, error_rate))
# Output:
'''
Training
skipped labeled image flex495
skipped labeled image galil703
skipped labeled image geria229
skipped labeled image germi850
skipped labeled image gibbs901
skipped labeled image heirl278
skipped labeled image hoop19
skipped labeled image house553
skipped labeled image human970
skipped labeled image hyeni569
skipped labeled image inani632
skipped labeled image indep415
skipped labeled image integ467
skipped labeled image inter28
skipped labeled image iodid634
skipped labeled image jacul766
skipped labeled image jujit844
skipped labeled image kimch758
skipped labeled image lean745
skipped labeled image lochi371
skipped labeled image objec165
skipped labeled image orali875
skipped labeled image peaki92
Classifying
abate328
abett394
abide670
abort213
acces203
acces252
accou196
accus213
aceph370
aceti517
achen516
acnui284 acqui284
acuui757 acqui757
acrit320
acule603
acupr469
acupu215
acyla825
adjus319
adjus503
admix577 admix572
adsor290
aethe775
afflu996
aflat546
agend64h agend645
agitp88
agrcee712 agree712
agrib896
aibsh534 airsh534
album788
alche973
alcoh292
aldol715
alexa391
alfak517
algit803 algid803
alien756
alkal541
allay633
aller122
allia64
allcit32 allit32
allot410
allox886
almuc251
aluni852
amara709
ambul882
amicda540 amida540
among722
amora5
anago1
anale229
arale459 anale459
anape570
anath743
anhyh81l anhyd811
anima325
aniso450
aniso807
ankle568
antep209
antho948
anthr894
antlp634 antip634
antip878
antis421
antit231
apert699
apica644
appan708
apply523
aprax38
apsid497
aptit976 aptit972
aouat983 aquat983
arbor305
archi14l archi141
arena138
argen369
argil750
jargo822 argo822
argue538
armba582
armfn381 armfu381
aroma802
arpen264
arriv25
arsen500
arter107
ashla667
assai305
assen935
asser551
astra488
ating951
attac495
attai758
attri573
17 100 0.17
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment