Skip to content

Instantly share code, notes, and snippets.

@justinAurand
Last active December 21, 2015 10:59
Show Gist options
  • Save justinAurand/6296086 to your computer and use it in GitHub Desktop.
Save justinAurand/6296086 to your computer and use it in GitHub Desktop.
Run a regex on a PDF file's text and print the matches to the console.
// Credit: http://stackoverflow.com/questions/17601176/read-a-pdf-and-find-a-specific-column-to-add-to-a-list
// You'll need to download the iTextSharp dll and add a reference to it.
using System;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
/// <summary>
/// Console utility that extracts the text of a PDF with iTextSharp and prints
/// every policy number matched by a regular expression.
/// </summary>
class RegexOnPDF
{
    /// <summary>
    /// Reads the hard-coded PDF path, writes each captured policy number on its own
    /// line, then writes "Complete." and waits for a key press.
    /// </summary>
    public static void Main()
    {
        string pdfText = ReadPdfFile(@"C:\SomePDF.pdf");

        // The named group "number" captures "CSA" followed by eight digits.
        Regex regex = new Regex(@"Policy Number: (?<number>CSA\d{8})");
        foreach (Match match in regex.Matches(pdfText))
        {
            Console.WriteLine(match.Groups["number"].Value);
        }

        Console.WriteLine("Complete.");
        Console.ReadKey();
    }

    /// <summary>
    /// Extracts the text of every page of a PDF and concatenates it in page order,
    /// with no separator between pages.
    /// </summary>
    /// <param name="fileName">Path of the PDF file to read.</param>
    /// <returns>
    /// The concatenated page text, or an empty string when <paramref name="fileName"/>
    /// does not refer to an existing file.
    /// </returns>
    public static string ReadPdfFile(string fileName)
    {
        // A missing file is not treated as an error: callers simply get no text.
        if (!File.Exists(fileName))
        {
            return string.Empty;
        }

        var stringBuilder = new StringBuilder();
        var pdfReader = new PdfReader(fileName);
        try
        {
            // PDF page numbers are 1-based.
            for (int page = 1; page <= pdfReader.NumberOfPages; page++)
            {
                // A fresh strategy per page so no state carries over between pages.
                ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();

                // The extractor already returns a decoded .NET string. Round-tripping
                // it through Encoding.Default adds nothing and, where that encoding is
                // an ANSI code page, replaces unrepresentable characters with '?',
                // so the text is appended unchanged.
                stringBuilder.Append(PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy));
            }
        }
        finally
        {
            // Release the underlying file even when extraction throws.
            pdfReader.Close();
        }

        return stringBuilder.ToString();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment