Skip to content

Instantly share code, notes, and snippets.

@jittuu
Created December 18, 2014 15:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jittuu/55b6de7ef4f446e396fb to your computer and use it in GitHub Desktop.
Save jittuu/55b6de7ef4f446e396fb to your computer and use it in GitHub Desktop.
CaptchaBreaker using tesseract
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using ImageProcessor;
using ImageProcessor.Imaging.Filters;
namespace CaptchaBreaker
{
/// <summary>
/// Abstraction over an OCR engine that turns the content of a stream into text.
/// </summary>
public interface IOcr
{
/// <summary>
/// Asynchronously reads the text contained in <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Source data to recognize; presumably an encoded image (confirm against the implementation in use).</param>
/// <returns>A task that produces the recognized text.</returns>
Task<string> ReadAsync(Stream stream);
}
/// <summary>
/// <see cref="IOcr"/> implementation that shells out to the <c>tesseract.exe</c> command-line tool.
/// </summary>
/// <remarks>
/// The input is saved as an image file under the OCR temp folder, <c>tesseract.exe</c> is run against it,
/// and the text is read back from the <c>&lt;output base&gt;.txt</c> file once the process exits.
/// </remarks>
[ExcludeFromCodeCoverage]
public class Tesseract : IOcr
{
    // Full path of tesseract.exe resolved from PATH, or null when it could not be found.
    private readonly string _tesseractPath;
    private readonly bool _digitOnly;
    private readonly bool _convertToGreyScale;

    /// <summary>
    /// Creates a Tesseract-backed OCR reader.
    /// </summary>
    /// <param name="digitOnly">When true, the <c>digits</c> config is appended to the Tesseract command line.</param>
    /// <param name="convertToGreyscale">When true, the image is pre-processed (contrast, brightness, greyscale) before OCR.</param>
    public Tesseract(bool digitOnly, bool convertToGreyscale)
    {
        _digitOnly = digitOnly;
        _convertToGreyScale = convertToGreyscale;
        _tesseractPath = GetTesseractPath();
    }

    /// <summary>
    /// Runs Tesseract over the image in <paramref name="stream"/> and returns the recognized text.
    /// </summary>
    /// <param name="stream">Stream holding the image to recognize. It is not disposed by this method.</param>
    /// <returns>
    /// A task that completes with the content of Tesseract's output file, or faults with the error that
    /// prevented the run (including <see cref="FileNotFoundException"/> when no output file was produced).
    /// </returns>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1031:DoNotCatchGeneralExceptionTypes", Justification = "Set back any thrown exception to TaskCompletionSource.")]
    public Task<string> ReadAsync(Stream stream)
    {
        var tcs = new TaskCompletionSource<string>();
        Task.Run(() =>
        {
            var imageStream = stream;
            Process process = null;
            string imgPath = null;
            var started = false;
            try
            {
                if (_tesseractPath == null)
                {
                    throw new FileNotFoundException("tesseract.exe was not found in any directory listed in the PATH environment variable.", "tesseract.exe");
                }

                var path = GetNewFilePath();
                if (_convertToGreyScale)
                {
                    imageStream = ConvertToGreyScale(stream);
                }

                imgPath = imageStream.SaveAsImage(path);
                var imagePath = imgPath;

                var arguments = string.Format("\"{0}\" \"{1}\"", imgPath, path);
                if (_digitOnly)
                {
                    arguments += " digits";
                }

                // A fresh ProcessStartInfo per call keeps concurrent reads from overwriting each other's arguments.
                var startInfo = new ProcessStartInfo(_tesseractPath)
                {
                    Arguments = arguments,
                    UseShellExecute = false,
                    CreateNoWindow = true
                };

                process = new Process();
                process.StartInfo = startInfo;
                process.EnableRaisingEvents = true;
                process.Exited += (_, __) =>
                {
                    var txt = path + ".txt";
                    try
                    {
                        if (File.Exists(txt))
                        {
                            tcs.TrySetResult(File.ReadAllText(txt));
                        }
                        else
                        {
                            tcs.TrySetException(new FileNotFoundException(txt));
                        }
                    }
                    catch (Exception ex)
                    {
                        // This handler runs on a pool thread: route failures to the task instead of crashing.
                        tcs.TrySetException(ex);
                    }
                    finally
                    {
                        TryDelete(txt);
                        TryDelete(imagePath);

                        // The process must stay alive until it has exited; disposing it earlier
                        // stops exit monitoring and the Exited event would never be raised.
                        process.Dispose();
                    }
                };

                if (!process.Start())
                {
                    throw new InvalidOperationException("The tesseract process could not be started.");
                }

                started = true;
            }
            catch (Exception ex)
            {
                tcs.TrySetException(ex);
            }
            finally
            {
                // Only dispose the stream created here; the caller owns the one passed in.
                if (imageStream != stream && imageStream != null)
                {
                    imageStream.Dispose();
                }

                // Once started, the Exited handler owns the process and the temp files.
                if (!started)
                {
                    if (process != null)
                    {
                        process.Dispose();
                    }

                    TryDelete(imgPath);
                }
            }
        });
        return tcs.Task;
    }

    /// <summary>
    /// Builds a unique, extension-less base path inside the OCR temp folder.
    /// </summary>
    private static string GetNewFilePath()
    {
        var name = Guid.NewGuid().ToString("D");
        var dir = GetOrCreateDirectory();
        return Path.Combine(dir.FullName, name);
    }

    /// <summary>
    /// Returns the <c>ocr_temp</c> working directory, creating it when missing.
    /// </summary>
    /// <remarks>
    /// The parent folder comes from the <c>ocrtempfolder</c> app setting; when that setting is absent
    /// the system temp folder is used instead of failing.
    /// </remarks>
    private static DirectoryInfo GetOrCreateDirectory()
    {
        var root = ConfigurationManager.AppSettings["ocrtempfolder"];
        if (string.IsNullOrEmpty(root))
        {
            root = Path.GetTempPath();
        }

        // CreateDirectory is a no-op that returns the existing directory when it is already there.
        return Directory.CreateDirectory(Path.Combine(root, "ocr_temp"));
    }

    /// <summary>
    /// Searches the directories listed in the <c>PATH</c> environment variable for <c>tesseract.exe</c>.
    /// </summary>
    /// <returns>The first existing candidate path, or <c>null</c> when none is found.</returns>
    private static string GetTesseractPath()
    {
        var environmentPath = Environment.GetEnvironmentVariable("PATH");
        if (environmentPath == null)
        {
            return null;
        }

        foreach (var entry in environmentPath.Split(';'))
        {
            string candidate;
            try
            {
                // PATH entries are sometimes wrapped in quotes, which Path.Combine can reject.
                candidate = Path.Combine(entry.Trim().Trim('"'), "tesseract.exe");
            }
            catch (ArgumentException)
            {
                // Malformed entry (illegal path characters): skip it rather than abort the search.
                continue;
            }

            if (File.Exists(candidate))
            {
                return candidate;
            }
        }

        return null;
    }

    /// <summary>
    /// Produces a high-contrast greyscale JPEG copy of the image in <paramref name="stream"/>.
    /// </summary>
    /// <param name="stream">Source image; read from its current position.</param>
    /// <returns>A new in-memory stream positioned at the start of the converted image. The caller disposes it.</returns>
    private static Stream ConvertToGreyScale(Stream stream)
    {
        MemoryStream outStream = new MemoryStream();
        try
        {
            using (ImageFactory imageFactory = new ImageFactory())
            using (MemoryStream inStream = new MemoryStream())
            {
                stream.CopyTo(inStream);

                // CopyTo leaves the position at the end; rewind so the image is decoded from the start.
                inStream.Position = 0;
                imageFactory.Load(inStream)
                    .Contrast(65)
                    .Brightness(65)
                    .Filter(MatrixFilters.GreyScale)
                    .Format(System.Drawing.Imaging.ImageFormat.Jpeg)
                    .Quality(100)
                    .Save(outStream);
            }

            outStream.Position = 0;
            return outStream;
        }
        catch
        {
            outStream.Dispose();
            throw;
        }
    }

    /// <summary>
    /// Best-effort removal of a temporary file; failures are ignored because cleanup must not mask the OCR result.
    /// </summary>
    private static void TryDelete(string filePath)
    {
        if (string.IsNullOrEmpty(filePath))
        {
            return;
        }

        try
        {
            File.Delete(filePath);
        }
        catch (IOException)
        {
            // File still in use; leave it behind.
        }
        catch (UnauthorizedAccessException)
        {
            // No permission to delete; leave it behind.
        }
    }
}
/// <summary>
/// Abstraction for a component that solves a captcha supplied as a stream.
/// </summary>
public interface ICaptchaBreaker
{
/// <summary>
/// Asynchronously solves the captcha contained in <paramref name="stream"/>.
/// </summary>
/// <param name="stream">Source data of the captcha; presumably an encoded image (confirm against the implementation in use).</param>
/// <returns>A task that produces the captcha text.</returns>
Task<string> BreakAsync(Stream stream);
}
/// <summary>
/// Default <see cref="ICaptchaBreaker"/>: runs an <see cref="IOcr"/> over the captcha and validates the recognized text.
/// </summary>
public class DefaultCaptchaBreaker : ICaptchaBreaker
{
    private readonly IOcr _ocr;

    /// <summary>
    /// Records whether greyscale conversion was requested when the OCR engine was created.
    /// Informational only: this class does not read it while breaking a captcha.
    /// (The misspelled name is kept for backward compatibility.)
    /// </summary>
    public bool ConvertToGreyScacle { get; set; }

    /// <summary>When true, every non-digit character is stripped and the result must consist of digits only.</summary>
    public bool DigitOnly { get; private set; }

    /// <summary>Exact number of characters a valid captcha must have. Defaults to 4.</summary>
    public int CaptchaLength { get; set; }

    /// <summary>
    /// Creates a breaker backed by a new <see cref="Tesseract"/> instance.
    /// </summary>
    /// <param name="digitOnly">Whether the captcha consists of digits only.</param>
    /// <param name="convertToGreyscale">Whether the image is converted to greyscale before OCR.</param>
    public DefaultCaptchaBreaker(bool digitOnly, bool convertToGreyscale)
        : this(digitOnly, new Tesseract(digitOnly, convertToGreyscale))
    {
        // The chained constructor resets the flag to false; record what was actually requested.
        this.ConvertToGreyScacle = convertToGreyscale;
    }

    /// <summary>
    /// Creates a digit-only breaker backed by the given OCR engine.
    /// </summary>
    /// <param name="ocr">OCR engine used to read the captcha.</param>
    public DefaultCaptchaBreaker(IOcr ocr)
        : this(digitOnly: true, ocr: ocr)
    {
    }

    /// <summary>
    /// Creates a breaker backed by the given OCR engine.
    /// </summary>
    /// <param name="digitOnly">Whether the captcha consists of digits only.</param>
    /// <param name="ocr">OCR engine used to read the captcha.</param>
    public DefaultCaptchaBreaker(bool digitOnly, IOcr ocr)
    {
        this.DigitOnly = digitOnly;
        this._ocr = ocr;
        this.CaptchaLength = 4;
        this.ConvertToGreyScacle = false;
    }

    /// <summary>
    /// Reads the captcha from <paramref name="stream"/> and validates it against
    /// <see cref="DigitOnly"/> and <see cref="CaptchaLength"/>.
    /// </summary>
    /// <param name="stream">Stream handed to the OCR engine.</param>
    /// <returns>The cleaned-up captcha text.</returns>
    /// <exception cref="InvalidCaptchaException">
    /// The OCR step failed (the cause is the inner exception), or the recognized text does not have the expected format or length.
    /// </exception>
    public async Task<string> BreakAsync(Stream stream)
    {
        string captcha;
        try
        {
            captcha = await _ocr.ReadAsync(stream).ConfigureAwait(false);
            captcha = captcha.Replace("\n", "").Trim();
        }
        catch (Exception ex)
        {
            throw new InvalidCaptchaException("There's an error in breaking the captcha. Please see inner exception.", ex);
        }

        if (this.DigitOnly)
        {
            captcha = Regex.Replace(captcha, @"[^\d]", "");

            // Checked per character rather than with int.TryParse, which rejects
            // valid all-digit captchas too long to fit in an Int32.
            if (!IsAsciiDigits(captcha))
            {
                throw new InvalidCaptchaException(string.Format("Invalid captcha format! Captcha={0}", captcha));
            }
        }

        if (captcha.Length != this.CaptchaLength)
        {
            throw new InvalidCaptchaException(string.Format("Invalid captcha format! Captcha={0}", captcha));
        }

        return captcha;
    }

    /// <summary>
    /// Returns true when <paramref name="value"/> is non-empty and made up solely of the characters '0'-'9'.
    /// </summary>
    private static bool IsAsciiDigits(string value)
    {
        if (string.IsNullOrEmpty(value))
        {
            return false;
        }

        foreach (var c in value)
        {
            if (c < '0' || c > '9')
            {
                return false;
            }
        }

        return true;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment