Skip to content

Instantly share code, notes, and snippets.

@random82
Created November 19, 2017 09:39
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save random82/edb3c0738de65a68e685749a6061c348 to your computer and use it in GitHub Desktop.
Save random82/edb3c0738de65a68e685749a6061c348 to your computer and use it in GitHub Desktop.
Basic PDFExtractor for Azure Data Lake Analytics
using System.Collections.Generic;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using Microsoft.Analytics.Interfaces;
namespace PDFExtractor
{
    /// <summary>
    /// U-SQL user-defined extractor that reads a PDF document and emits one row per page.
    /// </summary>
    /// <remarks>
    /// Each emitted row has the 1-based page number in column 0 and the page's
    /// extracted text in column 1. <c>AtomicFileProcessing = true</c> is set because
    /// the whole input stream is handed to a single <see cref="PdfReader"/>.
    /// </remarks>
    [SqlUserDefinedExtractor(AtomicFileProcessing = true)]
    public class PDFExtractor : IExtractor
    {
        /// <summary>
        /// Parses the input stream as a PDF and yields one row per page.
        /// </summary>
        /// <param name="input">Unstructured input whose <c>BaseStream</c> contains the PDF bytes.</param>
        /// <param name="output">Row buffer; column 0 receives the page number, column 1 the page text.</param>
        /// <returns>A lazily evaluated sequence with one read-only row per PDF page.</returns>
        public override IEnumerable<IRow> Extract(IUnstructuredReader input, IUpdatableRow output)
        {
            var reader = new PdfReader(input.BaseStream);
            try
            {
                // PDF page numbers are 1-based.
                for (var page = 1; page <= reader.NumberOfPages; page++)
                {
                    output.Set(0, page);
                    output.Set(1, ExtractText(reader, page));
                    yield return output.AsReadOnly();
                }
            }
            finally
            {
                // Release the reader even if the consumer stops enumerating early
                // or text extraction throws; the iterator's Dispose runs this block.
                reader.Close();
            }
        }

        /// <summary>
        /// Extracts the text of a single page and escapes line breaks so the result
        /// occupies a single line.
        /// </summary>
        /// <param name="pdfReader">Open reader for the PDF document.</param>
        /// <param name="pageNum">1-based number of the page to extract.</param>
        /// <returns>
        /// The page text with carriage returns replaced by the two characters <c>\r</c>
        /// and line feeds replaced by the two characters <c>\n</c>.
        /// </returns>
        public string ExtractText(PdfReader pdfReader, int pageNum)
        {
            // A fresh strategy instance is created for every page, as in the original code.
            var text = PdfTextExtractor.GetTextFromPage(pdfReader, pageNum, new LocationTextExtractionStrategy());

            // NOTE(review): existing backslashes in the text are not escaped, so a literal
            // "\n" sequence in the PDF is indistinguishable from an escaped line feed.
            return text.Replace("\r", "\\r").Replace("\n", "\\n");
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment