Skip to content

Instantly share code, notes, and snippets.

@matthewjberger
Last active March 14, 2018 00:29
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save matthewjberger/f4f6fc5e56ec78057949bf8c76fbf977 to your computer and use it in GitHub Desktop.
Save matthewjberger/f4f6fc5e56ec78057949bf8c76fbf977 to your computer and use it in GitHub Desktop.
Extract Image from pdf
using System;
using System.Collections.Generic;
using System.Drawing;
using System.IO;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using Tesseract;
using System.Drawing.Imaging;
namespace ExtractInvoice
{
class Program
{
// Height, in pixels, of the top region that is kept when an extracted image is
// cropped (used as the crop rectangle's height in SaveImages).
// The header ends at 1600 pixels on all invoices.
private const int headerHeight_ = 1600;
/// <summary>
/// Entry point: extracts the images of three invoice PDFs into the image folder,
/// then runs OCR on the first image extracted from the third PDF.
/// Any failure is reported on the console by message only.
/// </summary>
static void Main(string[] args)
{
    try
    {
        // Machine-specific locations; adjust before running elsewhere.
        string tessDataPath =
            @"C:\Users\Berger_MA\Documents\visual studio 2015\Projects\ExtractInvoice\tessdata";
        string imageFolder = @"C:\Users\Berger_MA\Downloads\Invoices\Images";
        string[] invoicePdfs =
        {
            @"C:\Users\Berger_MA\Downloads\Invoices\a.pdf",
            @"C:\Users\Berger_MA\Downloads\Invoices\b.pdf",
            @"C:\Users\Berger_MA\Downloads\Invoices\c.pdf"
        };

        // Processed in order; the first failure aborts the remaining work.
        foreach (string invoicePdf in invoicePdfs)
        {
            SaveImages(invoicePdf, imageFolder);
        }

        ExtractText(@"C:\Users\Berger_MA\Downloads\Invoices\Images\c0.jpg", tessDataPath);
    }
    catch (Exception failure)
    {
        System.Console.WriteLine(failure.Message);
    }
}
#region Methods
/// <summary>
/// Extracts all images from a PDF and returns them as a list of
/// <see cref="System.Drawing.Image"/> objects.
/// </summary>
/// <remarks>
/// Every object in the reader's cross-reference table is inspected; each stream
/// whose subtype equals <c>PdfName.IMAGE</c> is decoded. Streams that fail to
/// decode are skipped and a warning is written to standard error.
/// The caller owns the returned images and is responsible for disposing them.
/// Errors opening or reading the PDF propagate to the caller unchanged.
/// </remarks>
/// <param name="PDFSourcePath">Path of the PDF file to read.</param>
/// <returns>The decoded images in cross-reference order; empty when none were found.</returns>
private static List<System.Drawing.Image> ExtractImages(String PDFSourcePath)
{
    List<System.Drawing.Image> images = new List<System.Drawing.Image>();

    PdfReader reader = new PdfReader(new RandomAccessFileOrArray(PDFSourcePath), null);
    try
    {
        for (int i = 0; i < reader.XrefSize; i++)
        {
            PdfObject pdfObject = reader.GetPdfObject(i);
            if (pdfObject == null || !pdfObject.IsStream())
            {
                continue;
            }

            PdfStream stream = (PdfStream)pdfObject;
            PdfObject subtype = stream.Get(PdfName.SUBTYPE);
            if (subtype == null || subtype.ToString() != PdfName.IMAGE.ToString())
            {
                continue;
            }

            try
            {
                PdfImageObject imageObject = new PdfImageObject((PRStream)stream);
                images.Add(imageObject.GetDrawingImage());
            }
            catch (Exception ex)
            {
                // Best effort: one undecodable image must not abort the whole
                // extraction, but the failure should not go unnoticed either.
                Console.Error.WriteLine(
                    "Skipping image object " + i + " in '" + PDFSourcePath + "': " + ex.Message);
            }
        }
    }
    finally
    {
        // Always release the reader, even when reading fails part-way.
        reader.Close();
    }

    return images;
}
/// <summary>
/// Extracts the images from a PDF, post-processes each one (logo whited out,
/// sharpened, cropped to the header region) and saves it as a JPEG.
/// </summary>
/// <remarks>
/// Output files are named after the PDF (without extension) followed by the
/// image index and ".jpg". A failure on one image is reported on standard error
/// and does not stop the remaining images. Errors creating the output directory
/// or reading the PDF propagate to the caller unchanged.
/// </remarks>
/// <param name="pathToPdf">Path of the PDF file to read.</param>
/// <param name="outputPath">Directory to write the JPEG files to; created if missing.</param>
private static void SaveImages(string pathToPdf, string outputPath)
{
    string name = System.IO.Path.GetFileNameWithoutExtension(pathToPdf);
    if (!Directory.Exists(outputPath))
    {
        Directory.CreateDirectory(outputPath);
    }

    List<System.Drawing.Image> extracted = ExtractImages(pathToPdf);
    try
    {
        for (int i = 0; i < extracted.Count; i++)
        {
            string currentName = name + i + ".jpg";
            try
            {
                SaveHeaderImage(extracted[i], System.IO.Path.Combine(outputPath, currentName));
            }
            catch (Exception ex)
            {
                // Best effort per image: report and continue with the next one.
                Console.Error.WriteLine("Could not save '" + currentName + "': " + ex.Message);
            }
        }
    }
    finally
    {
        // The extracted images are owned here; release their GDI+ handles.
        foreach (System.Drawing.Image image in extracted)
        {
            if (image != null)
            {
                image.Dispose();
            }
        }
    }
}

/// <summary>
/// Whites out the logo area, sharpens the image, crops it to the header region
/// and writes the result to <paramref name="destinationPath"/> as a JPEG.
/// </summary>
private static void SaveHeaderImage(System.Drawing.Image source, string destinationPath)
{
    using (Bitmap working = new Bitmap(source))
    {
        // White out logo
        using (Graphics graphics = Graphics.FromImage(working))
        using (SolidBrush white = new SolidBrush(Color.White))
        {
            graphics.FillRectangle(white, 0, 0, 930, 540);
        }

        using (Bitmap sharpened = Sharpen(working))
        {
            // Crop to the header. The height is clamped because Clone throws
            // when the rectangle extends past the bitmap, which previously
            // caused images shorter than the header height to be dropped.
            Rectangle cropRect = new Rectangle(
                0, 0, sharpened.Width, Math.Min(headerHeight_, sharpened.Height));

            using (Bitmap cropped = sharpened.Clone(cropRect, sharpened.PixelFormat))
            {
                cropped.Save(destinationPath, System.Drawing.Imaging.ImageFormat.Jpeg);
            }
        }
    }
}
/// <summary>
/// Extracts all the text from an image using Tesseract OCR. The recognized text
/// and its mean confidence are also written to the console, after which the
/// method waits for a key press.
/// </summary>
/// <param name="pathToImage">The path to the image to extract text from.</param>
/// <param name="tessDataPath">The directory holding the Tesseract language data; the "eng" data is loaded.</param>
/// <returns>
/// The extracted text, or an empty string when recognition failed (the error
/// message is written to the console in that case).
/// </returns>
private static string ExtractText(string pathToImage, string tessDataPath)
{
    string text = "";
    try
    {
        // Create the Tesseract OCR engine with English as the language.
        using (var tEngine = new TesseractEngine(tessDataPath, "eng", EngineMode.Default))
        {
            // Restrict recognition to letters, all ten digits and a few
            // punctuation characters. (The digit '6' was previously missing.)
            tEngine.SetVariable("tessedit_char_whitelist", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,-/@");

            using (var img = Pix.LoadFromFile(pathToImage))
            using (var page = tEngine.Process(img))
            {
                text = page.GetText() ?? "";
                Console.WriteLine(text);
                Console.WriteLine(page.GetMeanConfidence());
            }
        }

        // Keep the console open until a key is pressed. Done after the engine
        // is released so native resources are not held while waiting.
        Console.ReadKey();
    }
    catch (Exception e)
    {
        Console.WriteLine("Unexpected Error: " + e.Message);
    }
    return text;
}
/// <summary>
/// Returns a sharpened copy of <paramref name="image"/>; the input bitmap is
/// not modified.
/// </summary>
/// <remarks>
/// Applies a 3x3 convolution with a centre weight of 9 and -1 for the eight
/// neighbours. Neighbour coordinates wrap around at the image borders. The
/// copy's pixels are accessed as 24 bits per pixel (blue, green, red byte
/// order) and each channel is clamped to 0..255.
/// </remarks>
/// <param name="image">The bitmap to sharpen.</param>
/// <returns>A new bitmap holding the sharpened pixels; the caller should dispose it.</returns>
/// <exception cref="ArgumentNullException"><paramref name="image"/> is null.</exception>
public static Bitmap Sharpen(Bitmap image)
{
    if (image == null)
    {
        throw new ArgumentNullException("image");
    }

    const int filterWidth = 3;
    const int filterHeight = 3;

    // Sharpening filter.
    double[,] filter =
    {
        { -1.0, -1.0, -1.0 },
        { -1.0,  9.0, -1.0 },
        { -1.0, -1.0, -1.0 }
    };
    double factor = 1.0;
    double bias = 0.0;

    int width = image.Width;
    int height = image.Height;

    Bitmap sharpenImage = (Bitmap)image.Clone();
    try
    {
        // Lock image bits for read/write.
        BitmapData pbits = sharpenImage.LockBits(
            new Rectangle(0, 0, width, height), ImageLockMode.ReadWrite, PixelFormat.Format24bppRgb);
        try
        {
            int stride = pbits.Stride;
            int bytes = stride * height;

            // Unmodified pixels, read by the convolution.
            byte[] source = new byte[bytes];
            System.Runtime.InteropServices.Marshal.Copy(pbits.Scan0, source, 0, bytes);

            // Separate output buffer so already-sharpened pixels never feed back
            // into a neighbour's sum; starts as a copy to keep row padding bytes.
            byte[] sharpened = (byte[])source.Clone();

            for (int y = 0; y < height; ++y)
            {
                for (int x = 0; x < width; ++x)
                {
                    double red = 0.0, green = 0.0, blue = 0.0;

                    for (int filterX = 0; filterX < filterWidth; filterX++)
                    {
                        for (int filterY = 0; filterY < filterHeight; filterY++)
                        {
                            // Adding width/height before the modulo keeps the
                            // index non-negative, wrapping at the borders.
                            int imageX = (x - filterWidth / 2 + filterX + width) % width;
                            int imageY = (y - filterHeight / 2 + filterY + height) % height;
                            int sourceIndex = imageY * stride + 3 * imageX;
                            double weight = filter[filterX, filterY];

                            red += source[sourceIndex + 2] * weight;
                            green += source[sourceIndex + 1] * weight;
                            blue += source[sourceIndex + 0] * weight;
                        }
                    }

                    // Clamp once per pixel, after the whole kernel has been summed.
                    int targetIndex = y * stride + 3 * x;
                    sharpened[targetIndex + 2] = (byte)Math.Min(Math.Max((int)(factor * red + bias), 0), 255);
                    sharpened[targetIndex + 1] = (byte)Math.Min(Math.Max((int)(factor * green + bias), 0), 255);
                    sharpened[targetIndex + 0] = (byte)Math.Min(Math.Max((int)(factor * blue + bias), 0), 255);
                }
            }

            // Copy the RGB values back to the bitmap.
            System.Runtime.InteropServices.Marshal.Copy(sharpened, 0, pbits.Scan0, bytes);
        }
        finally
        {
            // Release image bits even when the processing above fails.
            sharpenImage.UnlockBits(pbits);
        }

        return sharpenImage;
    }
    catch
    {
        // The copy is not handed to the caller on failure, so release it here.
        sharpenImage.Dispose();
        throw;
    }
}
#endregion
}
}
@MarceloNascimento
Copy link

MarceloNascimento commented Mar 14, 2018

Hi @matthewjberger, which version of the iTextSharp library are you using?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment