Skip to content

Instantly share code, notes, and snippets.

@itsrifat
Last active August 29, 2015 14:12
Show Gist options
  • Save itsrifat/b3456cc93b156ad59baa to your computer and use it in GitHub Desktop.
Save itsrifat/b3456cc93b156ad59baa to your computer and use it in GitHub Desktop.
captcha cracker challenge in hackerrank
#build a model with your favourite classifier !
from sklearn.linear_model import LogisticRegression
#read the raw training data
data = readData(directory)
#prepare the training data from the raw data
trainingdata = prepareTrainingData(data)
#create the classifier
clf = LogisticRegression()
clf.fit(trainingdata[0], trainingdata[1])
#now the classifier can be used to predict any new captcha!
def prepareTrainingData(data):
train_x = []
train_y = []
#set a threshold for value to remove the background
threshold = 100
for d in data:
img = d[0]
output = list(d[1])
i=0
#cut the image into 5 pices !
startLocX = 7
endLocX = 21
startLocY = 6
increment = 8
for i in range(5):
char = img[startLocX:endLocX,startLocY:startLocY+increment]
#remove the background by thresholding
#turn the background pixels to white
char[char > threshold] = 255
startLocY = startLocY + increment + 1
train_x.append(char.ravel())
train_y.append(output[i])
return (train_x,train_y)
import os
import numpy as np
def readImageFile(imgLoc):
'''
Reads one image file from imgLoc
'''
pixels = []
with open(imgLoc) as f:
next(f)
for line in f:
row = []
for p in line.split(" "):
vals = [int(val) for val in p.split(",")]
row.append(sum(vals)/len(vals))
pixels.append(row)
return np.array(pixels,dtype = np.uint8)
def readData(directory):
'''
Reads all captcha files and their labels from directory. directory should have this structure
as provided by the problem:
-directory
-input
-0.txt
-1.txt
-...
-output
-0.txt
-1.txt
-...
'''
images = []
for root, dirs, files in os.walk(directory+"input/", topdown=False):
files = [fi for fi in files if fi.endswith(".txt")]
for name in files:
outputfileName = "output"+str(name[5:].split(".")[0])+".txt"
imgdata = readImageFile(os.path.join(root, name))
with open(directory+"output/"+outputfileName) as outf:
output = next(outf).strip()
images.append((imgdata,output))
return images
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment