Created
December 18, 2014 15:01
-
-
Save jittuu/55b6de7ef4f446e396fb to your computer and use it in GitHub Desktop.
CaptchaBreaker using tesseract
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using ImageProcessor;
using ImageProcessor.Imaging.Filters;
namespace CaptchaBreaker
{
/// <summary>
/// Optical character recognition over an image supplied as a stream.
/// </summary>
public interface IOcr
{
    /// <summary>
    /// Asynchronously recognises the text contained in an image.
    /// </summary>
    /// <param name="stream">Stream holding the image data to read.</param>
    /// <returns>A task that yields the recognised text.</returns>
    Task<string> ReadAsync(Stream stream);
}
/// <summary>
/// <see cref="IOcr"/> implementation that runs the <c>tesseract.exe</c> command-line tool.
/// </summary>
/// <remarks>
/// Each read saves the image under a GUID-named path in the OCR temp folder, starts
/// tesseract with that image and the same base path as output, and reads back the
/// <c>.txt</c> file found at the base path once the process has exited.
/// </remarks>
[ExcludeFromCodeCoverage]
public class Tesseract : IOcr
{
    private const string ExecutableName = "tesseract.exe";

    // Null when tesseract.exe could not be located on PATH; reported when a read is attempted.
    private readonly string _tesseractPath;
    private readonly bool _digitOnly;
    private readonly bool _convertToGreyScale;

    /// <summary>
    /// Creates an OCR reader backed by the first <c>tesseract.exe</c> found on <c>PATH</c>.
    /// </summary>
    /// <param name="digitOnly">When true, tesseract is started with the extra <c>digits</c> argument.</param>
    /// <param name="convertToGreyscale">When true, the image is contrast/brightness adjusted and converted to greyscale before OCR.</param>
    public Tesseract(bool digitOnly, bool convertToGreyscale)
    {
        _digitOnly = digitOnly;
        _convertToGreyScale = convertToGreyscale;
        _tesseractPath = GetTesseractPath();
    }

    /// <summary>
    /// Runs tesseract over the image in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Stream holding the image data.</param>
    /// <returns>
    /// A task that yields the content of tesseract's output file. Every failure (null stream,
    /// missing executable, process start failure, missing or unreadable output file) is
    /// reported by faulting the returned task rather than by throwing synchronously.
    /// </returns>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1031:DoNotCatchGeneralExceptionTypes", Justification = "Set back any thrown exception to TaskCompletionSource.")]
    public Task<string> ReadAsync(Stream stream)
    {
        var tcs = new TaskCompletionSource<string>();

        // Task.Run always targets the thread pool; StartNew would use whatever scheduler is ambient.
        Task.Run(() =>
        {
            Stream imageStream = stream;
            Process process = null;
            string basePath = null;
            string imagePath = null;
            try
            {
                if (stream == null)
                {
                    throw new ArgumentNullException("stream");
                }

                if (_tesseractPath == null)
                {
                    throw new FileNotFoundException(
                        "tesseract.exe was not found in any directory listed in the PATH environment variable.",
                        ExecutableName);
                }

                basePath = GetNewFilePath();
                if (_convertToGreyScale)
                {
                    imageStream = ConvertToGreyScale(stream);
                }

                imagePath = imageStream.SaveAsImage(basePath);

                // A fresh ProcessStartInfo per call: sharing one instance and rewriting its
                // Arguments would let concurrent reads overwrite each other's command line.
                process = new Process();
                process.StartInfo = CreateStartInfo(imagePath, basePath);
                process.EnableRaisingEvents = true;

                var running = process;
                var outputBase = basePath;
                var savedImage = imagePath;
                process.Exited += (_, __) => CompleteFromOutput(running, tcs, outputBase, savedImage);

                // Must stay the last statement of the try block: once the process is running,
                // ownership of it (and of the temp files) passes to the Exited handler.
                process.Start();
            }
            catch (Exception ex)
            {
                // The process never started, so Exited will not run: release everything here.
                if (process != null)
                {
                    process.Dispose();
                }

                DeleteTempFiles(basePath, imagePath);
                tcs.TrySetException(ex);
            }
            finally
            {
                // Only dispose the stream this method created, never the caller's stream.
                if (imageStream != null && imageStream != stream)
                {
                    imageStream.Dispose();
                }
            }
        });

        return tcs.Task;
    }

    /// <summary>
    /// Completes <paramref name="tcs"/> from tesseract's output file, then releases the process
    /// and the temporary files belonging to this read.
    /// </summary>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1031:DoNotCatchGeneralExceptionTypes", Justification = "Set back any thrown exception to TaskCompletionSource.")]
    private static void CompleteFromOutput(Process process, TaskCompletionSource<string> tcs, string basePath, string imagePath)
    {
        var txt = basePath + ".txt";
        string text = null;
        Exception error = null;
        try
        {
            if (File.Exists(txt))
            {
                text = File.ReadAllText(txt);
            }
            else
            {
                error = new FileNotFoundException(txt);
            }
        }
        catch (Exception ex)
        {
            // An exception escaping an event handler would leave the task pending forever.
            error = ex;
        }
        finally
        {
            process.Dispose();
            DeleteTempFiles(basePath, imagePath);
        }

        // Complete the task last so continuations never observe half-cleaned state.
        if (error != null)
        {
            tcs.TrySetException(error);
        }
        else
        {
            tcs.TrySetResult(text);
        }
    }

    /// <summary>
    /// Builds the start info for one tesseract run: quoted input image, quoted output base
    /// path, plus <c>digits</c> when digit-only recognition was requested.
    /// </summary>
    private ProcessStartInfo CreateStartInfo(string imagePath, string outputBasePath)
    {
        var arguments = string.Format("\"{0}\" \"{1}\"", imagePath, outputBasePath);
        if (_digitOnly)
        {
            arguments += " digits";
        }

        return new ProcessStartInfo(_tesseractPath, arguments)
        {
            UseShellExecute = false,
            CreateNoWindow = true
        };
    }

    /// <summary>
    /// Best-effort removal of the files created for a single read.
    /// </summary>
    private static void DeleteTempFiles(string basePath, string imagePath)
    {
        if (basePath == null)
        {
            return;
        }

        TryDelete(basePath + ".txt");

        // SaveAsImage is defined elsewhere; presumably it writes basePath plus an extension.
        // Only delete the image when it really is one of this read's GUID-named files.
        if (imagePath != null && imagePath.StartsWith(basePath, StringComparison.OrdinalIgnoreCase))
        {
            TryDelete(imagePath);
        }
    }

    /// <summary>Deletes a file, ignoring I/O and permission failures.</summary>
    private static void TryDelete(string file)
    {
        try
        {
            File.Delete(file);
        }
        catch (IOException)
        {
            // Best effort: a leftover temp file must not fail the read.
        }
        catch (UnauthorizedAccessException)
        {
            // Best effort: a leftover temp file must not fail the read.
        }
    }

    /// <summary>
    /// Returns a new GUID-named base path (no extension) inside the OCR temp folder.
    /// </summary>
    private static string GetNewFilePath()
    {
        var name = Guid.NewGuid().ToString("D");
        var dir = GetOrCreateDirectory();
        return Path.Combine(dir.FullName, name);
    }

    /// <summary>
    /// Returns the <c>ocr_temp</c> folder under the <c>ocrtempfolder</c> app setting, creating
    /// it when needed. Falls back to the system temp folder when the setting is absent.
    /// </summary>
    private static DirectoryInfo GetOrCreateDirectory()
    {
        var root = ConfigurationManager.AppSettings["ocrtempfolder"];
        if (root == null)
        {
            // Path.Combine throws on a null segment; use the system temp folder instead.
            root = Path.GetTempPath();
        }

        // CreateDirectory returns the existing directory when it is already there.
        return Directory.CreateDirectory(Path.Combine(root, "ocr_temp"));
    }

    /// <summary>
    /// Searches the directories listed in the <c>PATH</c> environment variable for
    /// <c>tesseract.exe</c>.
    /// </summary>
    /// <returns>Full path of the first match, or null when none is found.</returns>
    private static string GetTesseractPath()
    {
        var environmentPath = Environment.GetEnvironmentVariable("PATH");
        if (string.IsNullOrEmpty(environmentPath))
        {
            return null;
        }

        return environmentPath
            .Split(new[] { Path.PathSeparator }, StringSplitOptions.RemoveEmptyEntries)
            .Select(x => x.Trim().Trim('"'))
            .Where(x => x.Length > 0)
            .Select(TryCombineWithExecutable)
            .FirstOrDefault(x => x != null && File.Exists(x));
    }

    /// <summary>
    /// Combines a PATH entry with the executable name; returns null for entries that are not
    /// valid paths instead of letting one bad entry abort the whole search.
    /// </summary>
    private static string TryCombineWithExecutable(string directory)
    {
        try
        {
            return Path.Combine(directory, ExecutableName);
        }
        catch (ArgumentException)
        {
            return null;
        }
    }

    /// <summary>
    /// Produces a contrast/brightness adjusted, greyscale JPEG copy of the image.
    /// </summary>
    /// <param name="stream">Stream holding the source image; it is read but not disposed.</param>
    /// <returns>A new stream, positioned at its start, that the caller must dispose.</returns>
    private static Stream ConvertToGreyScale(Stream stream)
    {
        // Must be a real stream: saving into a null reference cannot produce an image.
        MemoryStream outStream = new MemoryStream();
        try
        {
            using (ImageFactory imageFactory = new ImageFactory())
            using (MemoryStream inStream = new MemoryStream())
            {
                stream.CopyTo(inStream);

                // CopyTo leaves the position at the end; rewind so the image is read from byte 0.
                inStream.Position = 0;

                imageFactory.Load(inStream)
                    .Contrast(65)
                    .Brightness(65)
                    .Filter(MatrixFilters.GreyScale)
                    .Format(System.Drawing.Imaging.ImageFormat.Jpeg)
                    .Quality(100)
                    .Save(outStream);
            }

            outStream.Position = 0;
            return outStream;
        }
        catch
        {
            outStream.Dispose();
            throw;
        }
    }
}
/// <summary>
/// Solves a captcha image and returns its text.
/// </summary>
public interface ICaptchaBreaker
{
    /// <summary>
    /// Asynchronously solves the captcha contained in an image.
    /// </summary>
    /// <param name="stream">Stream holding the captcha image data.</param>
    /// <returns>A task that yields the captcha text.</returns>
    Task<string> BreakAsync(Stream stream);
}
/// <summary>
/// <see cref="ICaptchaBreaker"/> that runs an <see cref="IOcr"/> over the image and validates
/// the recognised text against the expected captcha format.
/// </summary>
public class DefaultCaptchaBreaker : ICaptchaBreaker
{
    private readonly IOcr _ocr;

    /// <summary>
    /// Whether greyscale conversion was requested at construction. The misspelled name is
    /// kept for compatibility. This class never reads the property, so changing it after
    /// construction does not alter how the captcha is processed.
    /// </summary>
    public bool ConvertToGreyScacle { get; set; }

    /// <summary>Whether the captcha is expected to consist of digits only.</summary>
    public bool DigitOnly { get; private set; }

    /// <summary>Exact number of characters a valid captcha must have. Defaults to 4.</summary>
    public int CaptchaLength { get; set; }

    /// <summary>
    /// Creates a breaker backed by a new <see cref="Tesseract"/> OCR.
    /// </summary>
    /// <param name="digitOnly">Whether the captcha consists of digits only.</param>
    /// <param name="convertToGreyscale">Whether the image is converted to greyscale before OCR.</param>
    public DefaultCaptchaBreaker(bool digitOnly, bool convertToGreyscale)
        : this(digitOnly, new Tesseract(digitOnly, convertToGreyscale))
    {
        // The chained constructor resets this to false; record what was actually requested.
        this.ConvertToGreyScacle = convertToGreyscale;
    }

    /// <summary>
    /// Creates a digit-only breaker backed by the given OCR.
    /// </summary>
    /// <param name="ocr">OCR used to read the captcha image.</param>
    public DefaultCaptchaBreaker(IOcr ocr)
        : this(digitOnly: true, ocr: ocr)
    {
    }

    /// <summary>
    /// Creates a breaker backed by the given OCR.
    /// </summary>
    /// <param name="digitOnly">Whether the captcha consists of digits only.</param>
    /// <param name="ocr">OCR used to read the captcha image.</param>
    /// <exception cref="ArgumentNullException"><paramref name="ocr"/> is null.</exception>
    public DefaultCaptchaBreaker(bool digitOnly, IOcr ocr)
    {
        if (ocr == null)
        {
            throw new ArgumentNullException("ocr");
        }

        this.DigitOnly = digitOnly;
        this._ocr = ocr;
        this.CaptchaLength = 4;
        this.ConvertToGreyScacle = false;
    }

    /// <summary>
    /// Reads the captcha from the image and validates it.
    /// </summary>
    /// <param name="stream">Stream holding the captcha image data.</param>
    /// <returns>
    /// The recognised captcha with newlines and surrounding whitespace removed and, in
    /// digit-only mode, every non-digit character removed.
    /// </returns>
    /// <exception cref="InvalidCaptchaException">
    /// The OCR failed (original error in the inner exception), the text is not made of digits
    /// in digit-only mode, or its length differs from <see cref="CaptchaLength"/>.
    /// </exception>
    public async Task<string> BreakAsync(Stream stream)
    {
        string captcha = null;
        try
        {
            captcha = await _ocr.ReadAsync(stream).ConfigureAwait(false);
            captcha = captcha.Replace("\n", "").Trim();
        }
        catch (Exception ex)
        {
            throw new InvalidCaptchaException("There's an error in breaking the captcha. Please see inner exception.", ex);
        }

        if (this.DigitOnly)
        {
            captcha = Regex.Replace(captcha, @"[^\d]", "");

            // Validate as text rather than with int.TryParse, which rejects digit strings too
            // large for an int and so breaks long captchas. Like the parse it replaces, only
            // the ASCII digits 0-9 are accepted (\d also lets other Unicode digits through).
            if (captcha.Length == 0 || captcha.Any(c => c < '0' || c > '9'))
            {
                throw new InvalidCaptchaException(string.Format("Invalid captcha format! Captcha={0}", captcha));
            }
        }

        if (captcha.Length != this.CaptchaLength)
        {
            throw new InvalidCaptchaException(string.Format("Invalid captcha format! Captcha={0}", captcha));
        }

        return captcha;
    }
}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment