Skip to content

Instantly share code, notes, and snippets.

@Beej126
Last active May 20, 2021 09:42
Show Gist options
  • Save Beej126/5a9337c74f6066b328ac24c46a2a7e7a to your computer and use it in GitHub Desktop.
Save Beej126/5a9337c74f6066b328ac24c46a2a7e7a to your computer and use it in GitHub Desktop.
Tesseract OCR on inbound PDF implemented in .Net, structured as an HTTP request Azure Function, uploads results to SQL Server
#r "iTextSharp.dll"
#r "Ghostscript.Net.dll"
#r "System.Drawing"
#r "Tesseract.dll"
#r "SqlClientHelpers.dll"
#r "iTextSharp.Licensekey.dll"
#r "System.Data"
#r "System.Configuration"
using System;
using System.Net;
using System.Net.Http;
using iTextSharp.text.pdf;
using iTextSharp.text;
using System.IO;
using Ghostscript.NET;
using Ghostscript.NET.Rasterizer;
using System.Drawing;
using Tesseract;
using NextTech.SqlClientHelpers;
using System.Data;
using System.Text;
using System.Text.RegularExpressions;
using System.Diagnostics;
public static GhostscriptRasterizer rasterizer = new GhostscriptRasterizer();
public static GhostscriptVersionInfo gsVersion = new GhostscriptVersionInfo(@"D:\home\site\wwwroot\BlendTicketInvoicePDFScrape\bin\gsdll32.dll");
public static TesseractEngine ocr = new TesseractEngine(@"D:\home\site\wwwroot\BlendTicketInvoicePDFScrape\tessdata", "eng", EngineMode.Default);
public static int currentLineNum;
public static string[] lines;
public static TraceWriter gLog;
public const string topString = "-----top-----top-----top-----";
public static string findMatch(string pattern)
{
Match match = null;
while (currentLineNum < lines.Count()
&& (lines[currentLineNum] != topString || pattern == "("+topString+")")
&& (match = Regex.Match(lines[currentLineNum], pattern)) != null
&& !match.Success) currentLineNum++;
var found = (match !=null && match.Success) ? match.Groups[1].Value : null;
if (found != null) currentLineNum++; //move to the next line to start the next search
gLog.Info($"looking for: {pattern}, found: {found ?? "NULL"}, current line: {currentLineNum}");
return found;
}
public static object v(string str, bool tryAsDecimal = false) {
decimal x;
return string.IsNullOrWhiteSpace(str) ? DBNull.Value :
tryAsDecimal ? (decimal.TryParse(str, out x) ? (object)x : DBNull.Value) :
(object)str;
}
public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log)
{
gLog = log;
Proc.ConnectionStringDefault = System.Configuration.ConfigurationManager.ConnectionStrings["TestSQL"].ConnectionString;
Proc.OnErrorDefault = (ex) => log.Info($"sql error: {ex}");
Proc.OnSuccessDefault = () => log.Info($"sql Success!");
try
{
log.Info($"C# HTTP trigger function processed a request. Current Folder: {System.Environment.CurrentDirectory}");
var currentPageNum = 2;
var skipPageCount = 3;
int dpi = 200;
StringBuilder pageText = new StringBuilder();
var pageCount = 0;
MemoryStream ghostScriptInStream;
using (var incomingPdfStream = await req.Content.ReadAsStreamAsync())
//using (var pdfIn = new PdfReader("LPI-Billing11-25.pdf"))
using (var pdfIn = new PdfReader(incomingPdfStream))
using (var pdfExtractDoc = new Document(pdfIn.GetPageSizeWithRotation(currentPageNum)))
using (var pdfExtractStream = new MemoryStream())
using (var pdfExtract = new PdfCopy(pdfExtractDoc, pdfExtractStream))
//using (var pdfExtract = new PdfCopy(pdfExtractDoc, new System.IO.FileStream("out.pdf", System.IO.FileMode.Create)))
{
var totalPageCount = pdfIn.NumberOfPages;
pdfExtractDoc.Open();
while (currentPageNum < totalPageCount)
{
pageCount++;
pdfIn.GetPageN(currentPageNum).Put(PdfName.ROTATE, new PdfNumber(-90));
pdfExtract.AddPage(pdfExtract.GetImportedPage(pdfIn, currentPageNum));
currentPageNum += skipPageCount;
log.Info($"{pageCount} pages rotated and extracted...");
}
//had to load result into fresh stream, otherwise PDF structure was scrambled using the old stream for some reason
ghostScriptInStream = new MemoryStream(pdfExtractStream.GetBuffer());
}
using (ghostScriptInStream)
{
for (int currentOcrPage = 1; currentOcrPage <= pageCount; currentOcrPage++)
{
rasterizer.Open(ghostScriptInStream, gsVersion, true);
log.Info($"saving page {currentOcrPage} to BMP (\"rasterizing\")...");
using (var img = rasterizer.GetPage(dpi, dpi, currentOcrPage))
using (var bmp = new Bitmap(img))
{
log.Info($" OCR'ing...");
using (var ocrPage = ocr.Process(bmp)) {
var pgTxt = ocrPage.GetText();
log.Info(pgTxt);
pageText.Append(topString);
pageText.Append("\n");
pageText.Append(pgTxt);
}
}
log.Info($"{currentOcrPage} pages OCRed...");
}
}
lines = pageText.ToString().Split('\n');
log.Info($"Proc.ConnectionStringDefault: {Proc.ConnectionStringDefault}");
using (var selectProc = new Proc("dbo.BlendTicketInvoiceItems_s"))
using (var insertProc = new Proc("dbo.BlendTicketInvoiceItems_i"))
{
var sqlProcessedPageCount = 0;
log.Info("firing select proc...");
var sqlTable = selectProc.ExecuteDataSet().Table0();
log.Info("debug 2");
currentLineNum = 0;
while (true)
{
sqlProcessedPageCount++;
var top = findMatch("("+topString+")");
if (top == null) break;
var currentTop = currentLineNum;
var blendTicketNumber = findMatch(@"Blend Ticket ([0-9]+)");
if (blendTicketNumber == null) {
log.Info($"*** couldn't find blendTicketNumber on bundle #{sqlProcessedPageCount}, bailing on to next.");
currentLineNum--;
continue;
}
currentLineNum = currentTop;
findMatch("(Guaranteed Anal)");
var mixRatio = lines[currentLineNum].Trim();
var TotalPrice = findMatch(@"PRICE . ([0-9]*\.[0-9]*)");
var TotalCost = findMatch(@"COST . ([0-9]*\.[0-9]*)");
var TotalFreight = findMatch(@"FREIGHT . ([0-9]*\.[0-9]*)");
currentLineNum = currentTop;
while (!lines[currentLineNum].StartsWith("Liqlmm")) currentLineNum++;
while (lines[currentLineNum].StartsWith("Liq"))
{
var groups = Regex.Match(lines[currentLineNum], @"(?<Header>[^0-9]*?) (?<ItemId>[0-9]*?) (.*?) ([^ ]*?\..*?) ([^ ]*?\..*?) (?<Cost>[^ ]*?\..*?) ").Groups;
var itemId = groups["ItemId"]?.Value;
var header = groups["Header"]?.Value;
//little fix for this bad data scenario:
//bad: Liqlmm 14001 Hot Water Tons 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
//good: Liqlmm I 1001 MOP 00-52 Tons 404.7300 252.8210 261.7000 151.9090 143.0300 37.5334 35.3396 60.0356 54.6542
// 12345678
//itemId needs to be 4001 on first line, not 14001
if (header.Length < 8 && itemId != null) itemId = itemId.Left(-1);
var r = sqlTable.NewRow();
r["BlendTicketNumber"] = blendTicketNumber;
r["MixRatio"] = v(mixRatio);
r["ItemId"] = v(itemId);
r["Cost"] = v(groups["Cost"]?.Value, true);
r["TotalPrice"] = v(TotalPrice, true);
r["TotalCost"] = v(TotalCost, true);
r["TotalFreight"] = v(TotalFreight, true);
sqlTable.Rows.Add(r);
currentLineNum++;
log.Info($"added {sqlTable.Rows.Count} sqlTable row, currentLineNum: {currentLineNum}");
}
}
insertProc["@BlendTicketInvoiceItems"] = sqlTable;
insertProc.ExecuteNonQueryAsync().Wait();
return req.CreateResponse(HttpStatusCode.OK, $"successfully inserted rows: {sqlTable.Rows.Count}");
}
}
catch (Exception ex)
{
return req.CreateResponse(HttpStatusCode.OK, $"Exception: {ex}");
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment