Last active
May 20, 2021 09:42
-
-
Save Beej126/5a9337c74f6066b328ac24c46a2a7e7a to your computer and use it in GitHub Desktop.
Tesseract OCR on inbound PDF implemented in .Net, structured as an HTTP request Azure Function, uploads results to SQL Server
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#r "iTextSharp.dll" | |
#r "Ghostscript.Net.dll" | |
#r "System.Drawing" | |
#r "Tesseract.dll" | |
#r "SqlClientHelpers.dll" | |
#r "iTextSharp.Licensekey.dll" | |
#r "System.Data" | |
#r "System.Configuration" | |
using System; | |
using System.Net; | |
using System.Net.Http; | |
using iTextSharp.text.pdf; | |
using iTextSharp.text; | |
using System.IO; | |
using Ghostscript.NET; | |
using Ghostscript.NET.Rasterizer; | |
using System.Drawing; | |
using Tesseract; | |
using NextTech.SqlClientHelpers; | |
using System.Data; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
using System.Diagnostics; | |
public static GhostscriptRasterizer rasterizer = new GhostscriptRasterizer(); | |
public static GhostscriptVersionInfo gsVersion = new GhostscriptVersionInfo(@"D:\home\site\wwwroot\BlendTicketInvoicePDFScrape\bin\gsdll32.dll"); | |
public static TesseractEngine ocr = new TesseractEngine(@"D:\home\site\wwwroot\BlendTicketInvoicePDFScrape\tessdata", "eng", EngineMode.Default); | |
public static int currentLineNum; | |
public static string[] lines; | |
public static TraceWriter gLog; | |
public const string topString = "-----top-----top-----top-----"; | |
public static string findMatch(string pattern) | |
{ | |
Match match = null; | |
while (currentLineNum < lines.Count() | |
&& (lines[currentLineNum] != topString || pattern == "("+topString+")") | |
&& (match = Regex.Match(lines[currentLineNum], pattern)) != null | |
&& !match.Success) currentLineNum++; | |
var found = (match !=null && match.Success) ? match.Groups[1].Value : null; | |
if (found != null) currentLineNum++; //move to the next line to start the next search | |
gLog.Info($"looking for: {pattern}, found: {found ?? "NULL"}, current line: {currentLineNum}"); | |
return found; | |
} | |
public static object v(string str, bool tryAsDecimal = false) { | |
decimal x; | |
return string.IsNullOrWhiteSpace(str) ? DBNull.Value : | |
tryAsDecimal ? (decimal.TryParse(str, out x) ? (object)x : DBNull.Value) : | |
(object)str; | |
} | |
public static async Task<HttpResponseMessage> Run(HttpRequestMessage req, TraceWriter log) | |
{ | |
gLog = log; | |
Proc.ConnectionStringDefault = System.Configuration.ConfigurationManager.ConnectionStrings["TestSQL"].ConnectionString; | |
Proc.OnErrorDefault = (ex) => log.Info($"sql error: {ex}"); | |
Proc.OnSuccessDefault = () => log.Info($"sql Success!"); | |
try | |
{ | |
log.Info($"C# HTTP trigger function processed a request. Current Folder: {System.Environment.CurrentDirectory}"); | |
var currentPageNum = 2; | |
var skipPageCount = 3; | |
int dpi = 200; | |
StringBuilder pageText = new StringBuilder(); | |
var pageCount = 0; | |
MemoryStream ghostScriptInStream; | |
using (var incomingPdfStream = await req.Content.ReadAsStreamAsync()) | |
//using (var pdfIn = new PdfReader("LPI-Billing11-25.pdf")) | |
using (var pdfIn = new PdfReader(incomingPdfStream)) | |
using (var pdfExtractDoc = new Document(pdfIn.GetPageSizeWithRotation(currentPageNum))) | |
using (var pdfExtractStream = new MemoryStream()) | |
using (var pdfExtract = new PdfCopy(pdfExtractDoc, pdfExtractStream)) | |
//using (var pdfExtract = new PdfCopy(pdfExtractDoc, new System.IO.FileStream("out.pdf", System.IO.FileMode.Create))) | |
{ | |
var totalPageCount = pdfIn.NumberOfPages; | |
pdfExtractDoc.Open(); | |
while (currentPageNum < totalPageCount) | |
{ | |
pageCount++; | |
pdfIn.GetPageN(currentPageNum).Put(PdfName.ROTATE, new PdfNumber(-90)); | |
pdfExtract.AddPage(pdfExtract.GetImportedPage(pdfIn, currentPageNum)); | |
currentPageNum += skipPageCount; | |
log.Info($"{pageCount} pages rotated and extracted..."); | |
} | |
//had to load result into fresh stream, otherwise PDF structure was scrambled using the old stream for some reason | |
ghostScriptInStream = new MemoryStream(pdfExtractStream.GetBuffer()); | |
} | |
using (ghostScriptInStream) | |
{ | |
for (int currentOcrPage = 1; currentOcrPage <= pageCount; currentOcrPage++) | |
{ | |
rasterizer.Open(ghostScriptInStream, gsVersion, true); | |
log.Info($"saving page {currentOcrPage} to BMP (\"rasterizing\")..."); | |
using (var img = rasterizer.GetPage(dpi, dpi, currentOcrPage)) | |
using (var bmp = new Bitmap(img)) | |
{ | |
log.Info($" OCR'ing..."); | |
using (var ocrPage = ocr.Process(bmp)) { | |
var pgTxt = ocrPage.GetText(); | |
log.Info(pgTxt); | |
pageText.Append(topString); | |
pageText.Append("\n"); | |
pageText.Append(pgTxt); | |
} | |
} | |
log.Info($"{currentOcrPage} pages OCRed..."); | |
} | |
} | |
lines = pageText.ToString().Split('\n'); | |
log.Info($"Proc.ConnectionStringDefault: {Proc.ConnectionStringDefault}"); | |
using (var selectProc = new Proc("dbo.BlendTicketInvoiceItems_s")) | |
using (var insertProc = new Proc("dbo.BlendTicketInvoiceItems_i")) | |
{ | |
var sqlProcessedPageCount = 0; | |
log.Info("firing select proc..."); | |
var sqlTable = selectProc.ExecuteDataSet().Table0(); | |
log.Info("debug 2"); | |
currentLineNum = 0; | |
while (true) | |
{ | |
sqlProcessedPageCount++; | |
var top = findMatch("("+topString+")"); | |
if (top == null) break; | |
var currentTop = currentLineNum; | |
var blendTicketNumber = findMatch(@"Blend Ticket ([0-9]+)"); | |
if (blendTicketNumber == null) { | |
log.Info($"*** couldn't find blendTicketNumber on bundle #{sqlProcessedPageCount}, bailing on to next."); | |
currentLineNum--; | |
continue; | |
} | |
currentLineNum = currentTop; | |
findMatch("(Guaranteed Anal)"); | |
var mixRatio = lines[currentLineNum].Trim(); | |
var TotalPrice = findMatch(@"PRICE . ([0-9]*\.[0-9]*)"); | |
var TotalCost = findMatch(@"COST . ([0-9]*\.[0-9]*)"); | |
var TotalFreight = findMatch(@"FREIGHT . ([0-9]*\.[0-9]*)"); | |
currentLineNum = currentTop; | |
while (!lines[currentLineNum].StartsWith("Liqlmm")) currentLineNum++; | |
while (lines[currentLineNum].StartsWith("Liq")) | |
{ | |
var groups = Regex.Match(lines[currentLineNum], @"(?<Header>[^0-9]*?) (?<ItemId>[0-9]*?) (.*?) ([^ ]*?\..*?) ([^ ]*?\..*?) (?<Cost>[^ ]*?\..*?) ").Groups; | |
var itemId = groups["ItemId"]?.Value; | |
var header = groups["Header"]?.Value; | |
//little fix for this bad data scenario: | |
//bad: Liqlmm 14001 Hot Water Tons 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 | |
//good: Liqlmm I 1001 MOP 00-52 Tons 404.7300 252.8210 261.7000 151.9090 143.0300 37.5334 35.3396 60.0356 54.6542 | |
// 12345678 | |
//itemId needs to be 4001 on first line, not 14001 | |
if (header.Length < 8 && itemId != null) itemId = itemId.Left(-1); | |
var r = sqlTable.NewRow(); | |
r["BlendTicketNumber"] = blendTicketNumber; | |
r["MixRatio"] = v(mixRatio); | |
r["ItemId"] = v(itemId); | |
r["Cost"] = v(groups["Cost"]?.Value, true); | |
r["TotalPrice"] = v(TotalPrice, true); | |
r["TotalCost"] = v(TotalCost, true); | |
r["TotalFreight"] = v(TotalFreight, true); | |
sqlTable.Rows.Add(r); | |
currentLineNum++; | |
log.Info($"added {sqlTable.Rows.Count} sqlTable row, currentLineNum: {currentLineNum}"); | |
} | |
} | |
insertProc["@BlendTicketInvoiceItems"] = sqlTable; | |
insertProc.ExecuteNonQueryAsync().Wait(); | |
return req.CreateResponse(HttpStatusCode.OK, $"successfully inserted rows: {sqlTable.Rows.Count}"); | |
} | |
} | |
catch (Exception ex) | |
{ | |
return req.CreateResponse(HttpStatusCode.OK, $"Exception: {ex}"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment