Skip to content

Instantly share code, notes, and snippets.

@jhgbrt
Created January 25, 2022 19:42
Show Gist options
  • Save jhgbrt/f161badbfdd7c73fb544cdef81e1c32f to your computer and use it in GitHub Desktop.
Save jhgbrt/f161badbfdd7c73fb544cdef81e1c32f to your computer and use it in GitHub Desktop.
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;
using Net.Code.Csv;
using System.Globalization;
using System.Text.RegularExpressions;
// Entry point. Expects two command-line arguments:
//   args[0] - folder that is scanned for PDF files
//   args[1] - name of the CSV file to write; it is combined with the folder path
if (args.Length < 2)
{
    // Report the problem readably instead of failing with an IndexOutOfRangeException.
    Console.Error.WriteLine("Usage: <folder> <output-file>");
    Environment.ExitCode = 1;
    return;
}

var folder = args[0];
var file = args[1];

// The same culture is handed to the transaction parser and to the CSV writer.
var cultureInfo = CultureInfo.GetCultureInfo("nl-BE");

WriteCsv.ToFile(
    ExtractTransactions(folder, cultureInfo),
    Path.Combine(folder, file),
    delimiter: ';',
    hasHeaders: true,
    cultureInfo: cultureInfo
);
// Collects the transactions found in every "*.pdf" file directly inside `folder`.
// Each extracted text line is offered to Transaction.Parse; lines for which it
// returns null are skipped. The file list is read immediately, the lines lazily.
static IEnumerable<Transaction> ExtractTransactions(string folder, CultureInfo cultureInfo)
{
    var pdfPaths = Directory.GetFiles(folder, "*.pdf");

    return pdfPaths
        .SelectMany(pdfPath => ExtractLines(pdfPath))
        .Select(textLine => Transaction.Parse(textLine, cultureInfo))
        .Where(parsed => parsed != null)
        .Select(parsed => parsed.Value);
}
// Lazily yields the text of the PDF at `file`, one line at a time, page by page
// (pages are numbered from 1). The reader and document are disposed when the
// enumeration finishes or is abandoned.
static IEnumerable<string> ExtractLines(string file)
{
    using var reader = new PdfReader(file);
    using var document = new PdfDocument(reader);

    var pageNumber = 1;
    while (pageNumber <= document.GetNumberOfPages())
    {
        // A fresh extraction strategy is used for every page.
        var text = PdfTextExtractor.GetTextFromPage(
            document.GetPage(pageNumber),
            new SimpleTextExtractionStrategy());

        using (var lines = new StringReader(text))
        {
            // ReadLine returns null once the page text is exhausted.
            while (lines.ReadLine() is { } line)
            {
                yield return line;
            }
        }

        pageNumber++;
    }
}
/// <summary>
/// One transaction line taken from a PDF: date (<c>Datum</c>), description
/// (<c>Omschrijving</c>) and amount (<c>Bedrag</c>).
/// </summary>
// NOTE(review): CsvFormat comes from Net.Code.Csv; presumably it sets the date format
// used when the value is written to CSV — confirm against the library.
readonly record struct Transaction([CsvFormat("yyyy-MM-dd")]DateTime Datum, string Omschrijving, decimal Bedrag)
{
    // Shape of a transaction line: "dd/MM/yyyy <description> <amount>", where the amount
    // ends the line and consists of an optional minus sign, digits and dots, a comma and
    // exactly two digits. Compiled because it is applied to every extracted line.
    private static readonly Regex regex = new (@"(?<date>\d\d/\d\d/\d\d\d\d) (?<description>.*) (?<amount>-?[.\d]+,\d{2})$", RegexOptions.Compiled);

    /// <summary>
    /// Tries to read a transaction from a single line of text.
    /// </summary>
    /// <param name="s">The line to inspect.</param>
    /// <param name="cultureInfo">Culture used to parse the date and the amount.</param>
    /// <returns>
    /// The parsed transaction, or <c>null</c> when the line does not have the expected
    /// shape or its date/amount text is not a valid value for the given culture.
    /// </returns>
    public static Transaction? Parse(string s, CultureInfo cultureInfo)
    {
        var m = regex.Match(s);
        if (!m.Success)
        {
            return null;
        }

        // The regex only checks the digit layout, so values such as "31/02/2022" still
        // reach this point; treat them as "not a transaction" rather than throwing.
        if (!DateTime.TryParseExact(m.Groups["date"].Value, "dd/MM/yyyy", cultureInfo, DateTimeStyles.None, out var date))
        {
            return null;
        }

        // NumberStyles.Number is the style decimal.Parse(string, IFormatProvider) applies.
        if (!decimal.TryParse(m.Groups["amount"].Value, NumberStyles.Number, cultureInfo, out var amount))
        {
            return null;
        }

        return new Transaction(date, m.Groups["description"].Value, amount);
    }
}
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings: executable output targeting net6.0, with implicit usings and nullable reference types enabled. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- NuGet packages: itext7 supplies the iText.Kernel.Pdf namespaces and Net.Code.Csv supplies the CSV writer used by the program. -->
<ItemGroup>
<PackageReference Include="itext7" Version="7.2.1" />
<PackageReference Include="Net.Code.Csv" Version="5.0.0-preview.1" />
</ItemGroup>
</Project>
@jhgbrt
Copy link
Author

jhgbrt commented Jan 25, 2022

Parses all PDF files in a folder, extracts the lines of the form [dd/MM/yyyy] [description] [amount], and writes them to a CSV file.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment