-
-
Save fta2012/034b0686897d94e74b00 to your computer and use it in GitHub Desktop.
Quick modification of https://github.com/fta2012/ProjectEulerCaptchaSolver/blob/master/solver.py to solve the captchas found in https://github.com/mieko/sr-captcha/blob/gh-pages/captcha-corpus.tar.bz2. Read every 'digit' in the code as 'character': the original solver was written for captchas that contained only digits.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from collections import defaultdict
from os import listdir

import scipy.misc
def extract_digits(img):
    """Segment a captcha image into per-character pixel regions.

    A pixel counts as "ink" when the sum of its channel values lies strictly
    between 400 and 570. Ink pixels are grouped by a breadth-first flood fill
    whose neighbourhood reaches up to two pixels away in each direction, so
    strokes separated by a one-pixel gap still end up in the same region.

    Args:
        img: A 2-D grid (nested sequences or an array) of pixels, where each
            pixel is an iterable of numeric channel values.

    Returns:
        A list of regions ordered by the column-major scan that first reaches
        them (columns left to right, rows top to bottom within a column).
        Each region is a list of ``(row, column)`` tuples in fill order.
        Regions of 5 pixels or fewer are dropped as noise.
    """
    # Cast every channel to a plain int before summing: with uint8 pixels the
    # built-in sum() can wrap around at 256 under NumPy's newer promotion
    # rules, which would silently misclassify ink pixels.
    ink = [[400 < sum(int(channel) for channel in cell) < 570 for cell in row]
           for row in img]
    if not ink or not ink[0]:
        return []
    height = len(ink)
    width = len(ink[0])

    # 5x5 neighbourhood without the centre, in the same order as the nested
    # dx/dy loops it replaces, so pixels are appended in the same fill order.
    offsets = [(dx, dy)
               for dx in range(-2, 3)
               for dy in range(-2, 3)
               if dx or dy]

    # Find contiguous regions
    flooded = set()
    digits = []
    for y in range(width):
        for x in range(height):
            # Background pixels can never seed a kept region, so skip them
            # instead of flooding the whole background and discarding it.
            if not ink[x][y] or (x, y) in flooded:
                continue
            flooded.add((x, y))
            region = [(x, y)]
            pos = 0
            # `region` doubles as the BFS queue; `pos` is the read cursor.
            while pos < len(region):
                xx, yy = region[pos]
                pos += 1
                for dx, dy in offsets:
                    nx, ny = xx + dx, yy + dy
                    if (0 <= nx < height and 0 <= ny < width
                            and ink[nx][ny]
                            and (nx, ny) not in flooded):
                        flooded.add((nx, ny))
                        region.append((nx, ny))
            # Filter
            if len(region) > 5:
                digits.append(region)
    return digits
# Side length, in pixels, of the square grid every character is normalized to.
N = 25


def normalized_image(pixels, size=N):
    """Rasterize a pixel region into a ``size`` x ``size`` binary grid.

    The region is translated so that its smallest row and column coordinates
    land on index 0, then each pixel is marked with 1 in an otherwise
    zero-filled grid.

    Args:
        pixels: Iterable of ``(x, y)`` coordinate pairs.
        size: Side length of the output grid. Defaults to ``N``.

    Returns:
        A list of ``size`` lists of ``size`` ints (0 or 1), indexed as
        ``img[x - min_x][y - min_y]``. Pixels that fall outside the grid
        after translation are dropped rather than raising ``IndexError``.
        An empty region yields an all-zero grid.
    """
    pixels = list(pixels)
    img = [[0] * size for _ in range(size)]
    if not pixels:
        return img
    min_x = min(x for x, _ in pixels)
    min_y = min(y for _, y in pixels)
    for x, y in pixels:
        row = x - min_x
        col = y - min_y
        # Offsets are never negative, so only the upper bound needs a check.
        # Oversized regions (e.g. two touching characters) are clipped.
        if row < size and col < size:
            img[row][col] = 1
    return img
# Load the labeled captchas and separate them into digits.
# Each file name (up to the first '.') is taken as the captcha's text, and the
# i-th segmented region is assumed to be the i-th character of that text.
directory = 'captcha-corpus'
labeled_digits = []  # (character, normalized N x N image) templates
digit_areas = defaultdict(list)  # character -> pixel counts of its templates
print('Training')
# NOTE: os.listdir() returns entries in arbitrary order, so which files end up
# in this slice depends on the filesystem.
for filename in listdir(directory)[500:1000]:
    label = filename.split('.')[0]
    digits = extract_digits(
        scipy.misc.imread(os.path.join(directory, filename)))
    # Segmentation merged or split characters; the regions cannot be paired
    # with the label's characters, so the image is unusable for training.
    if len(label) != len(digits):
        print('skipped labeled image %s' % label)
        continue
    for i, digit in enumerate(digits):
        img = normalized_image(digit)
        labeled_digits.append((label[i], img))
        digit_areas[label[i]].append(sum(val for row in img for val in row))
        # Debug aid: dump each extracted character to disk.
        #scipy.misc.imsave('digit_{:s}_{:s}_{:s}.png'.format(label[i], str(i), label), img)
# Floor division keeps the integer averages the original Python 2 `/` produced.
average_digit_area = {k: sum(v) // len(v) for k, v in digit_areas.items()}
def solve_for_image(image):
    """Guess the text shown in a captcha image.

    The image is segmented with ``extract_digits``. Each region is normalized
    and compared against every template in ``labeled_digits``. The distance is
    the number of grid cells whose values differ. Only templates whose
    character has an average area within 20 pixels of the region's pixel
    count are considered. Ties on distance go to the smaller label.

    Args:
        image: Pixel grid accepted by ``extract_digits``.

    Returns:
        A string with one character per segmented region, in segmentation
        order. A region with no eligible template contributes ``'?'``
        (previously the literal text ``'None'`` was appended).
    """
    guess = []
    for digit in extract_digits(image):
        img = normalized_image(digit)
        area = len(digit)
        best = None  # (distance, label) of the closest eligible template
        for num, labeled_digit in labeled_digits:
            # Apply the cheap area filter first so the pixel-by-pixel
            # distance is only computed for templates that can win.
            if abs(average_digit_area[num] - area) >= 20:
                continue
            dist = sum(a != b
                       for img_row, ref_row in zip(img, labeled_digit)
                       for a, b in zip(img_row, ref_row))
            candidate = (dist, num)
            if best is None or candidate < best:
                best = candidate
        guess.append('?' if best is None else str(best[1]))
    return ''.join(guess)
def solve_for_filename(filename):
    """Read the captcha image at ``filename`` and return the guessed text.

    Args:
        filename: Path of the image file to classify.

    Returns:
        The string produced by ``solve_for_image`` for the loaded image.
    """
    # NOTE(review): scipy.misc.imread no longer exists in current SciPy
    # releases; this call needs an old SciPy or a replacement image reader.
    return solve_for_image(scipy.misc.imread(filename))
if __name__ == '__main__':
    # Evaluate the classifier on the first `count` files of the corpus and
    # print each prediction; mismatches are printed as "<predicted> <label>".
    print('Classifying')
    count = 100
    wrong = 0
    # NOTE: os.listdir() order is arbitrary, so this slice is whatever the
    # filesystem lists first.
    filenames = listdir(directory)[:count]
    for filename in filenames:
        label = filename.split('.')[0]
        predicted = solve_for_filename(os.path.join(directory, filename))
        if label != predicted:
            print('%s %s' % (predicted, label))
            wrong += 1
        else:
            print(predicted)
    # Divide by the number of files actually evaluated, which is smaller than
    # `count` when the directory holds fewer files, and avoid dividing by 0.
    total = len(filenames)
    error_rate = float(wrong) / total if total else 0.0
    print('%d %d %s' % (wrong, total, error_rate))
# Output:
'''
Training | |
skipped labeled image flex495 | |
skipped labeled image galil703 | |
skipped labeled image geria229 | |
skipped labeled image germi850 | |
skipped labeled image gibbs901 | |
skipped labeled image heirl278 | |
skipped labeled image hoop19 | |
skipped labeled image house553 | |
skipped labeled image human970 | |
skipped labeled image hyeni569 | |
skipped labeled image inani632 | |
skipped labeled image indep415 | |
skipped labeled image integ467 | |
skipped labeled image inter28 | |
skipped labeled image iodid634 | |
skipped labeled image jacul766 | |
skipped labeled image jujit844 | |
skipped labeled image kimch758 | |
skipped labeled image lean745 | |
skipped labeled image lochi371 | |
skipped labeled image objec165 | |
skipped labeled image orali875 | |
skipped labeled image peaki92 | |
Classifying | |
abate328 | |
abett394 | |
abide670 | |
abort213 | |
acces203 | |
acces252 | |
accou196 | |
accus213 | |
aceph370 | |
aceti517 | |
achen516 | |
acnui284 acqui284 | |
acuui757 acqui757 | |
acrit320 | |
acule603 | |
acupr469 | |
acupu215 | |
acyla825 | |
adjus319 | |
adjus503 | |
admix577 admix572 | |
adsor290 | |
aethe775 | |
afflu996 | |
aflat546 | |
agend64h agend645 | |
agitp88 | |
agrcee712 agree712 | |
agrib896 | |
aibsh534 airsh534 | |
album788 | |
alche973 | |
alcoh292 | |
aldol715 | |
alexa391 | |
alfak517 | |
algit803 algid803 | |
alien756 | |
alkal541 | |
allay633 | |
aller122 | |
allia64 | |
allcit32 allit32 | |
allot410 | |
allox886 | |
almuc251 | |
aluni852 | |
amara709 | |
ambul882 | |
amicda540 amida540 | |
among722 | |
amora5 | |
anago1 | |
anale229 | |
arale459 anale459 | |
anape570 | |
anath743 | |
anhyh81l anhyd811 | |
anima325 | |
aniso450 | |
aniso807 | |
ankle568 | |
antep209 | |
antho948 | |
anthr894 | |
antlp634 antip634 | |
antip878 | |
antis421 | |
antit231 | |
apert699 | |
apica644 | |
appan708 | |
apply523 | |
aprax38 | |
apsid497 | |
aptit976 aptit972 | |
aouat983 aquat983 | |
arbor305 | |
archi14l archi141 | |
arena138 | |
argen369 | |
argil750 | |
jargo822 argo822 | |
argue538 | |
armba582 | |
armfn381 armfu381 | |
aroma802 | |
arpen264 | |
arriv25 | |
arsen500 | |
arter107 | |
ashla667 | |
assai305 | |
assen935 | |
asser551 | |
astra488 | |
ating951 | |
attac495 | |
attai758 | |
attri573 | |
17 100 0.17 | |
'''
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment