Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@OlafD
Created April 18, 2019 08:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save OlafD/c06b8fff1e1b5c194c2c6577247e9765 to your computer and use it in GitHub Desktop.
Save OlafD/c06b8fff1e1b5c194c2c6577247e9765 to your computer and use it in GitHub Desktop.
Very simple proof of concept for indexing PDF files in SharePoint Online that are the result of scanned content and consist mainly of images. This sample uses IronOcr to extract the text content of the file.
<!--
  Field definition for "DocumentTextFromOCR": a hidden (Hidden="TRUE"),
  multi-line text field (Type="Note") in the "Demo" group.
  The C# program in this gist writes the OCR result of a document into this
  field via item["DocumentTextFromOCR"], so the Name here must match that key.
  UnlimitedLengthInDocumentLibrary="TRUE" is set, presumably so that long OCR
  output is not truncated; confirm against the SharePoint field schema.
-->
<Field
ID="{1012e6c5-2eb5-44cf-9f73-1ed8a584d38b}"
Name="DocumentTextFromOCR"
DisplayName="Document Text from OCR"
Description=""
StaticName="DocumentTextFromOCR"
Group="Demo"
Type="Note"
NumLines="10"
UnlimitedLengthInDocumentLibrary="TRUE"
Hidden="TRUE"
SourceID="http://schemas.microsoft.com/sharepoint/v3"
/>
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.SharePoint.Client;
using OfficeDevPnP.Core;
using IronOcr;
namespace IronPdfTester
{
    /// <summary>
    /// Proof-of-concept console program: downloads the file attached to one list
    /// item, runs OCR over it, and stores the recognized text in the item's
    /// "DocumentTextFromOCR" field.
    /// </summary>
    /// <remarks>
    /// System.IO types are written fully qualified on purpose: importing System.IO
    /// would make <c>File</c> ambiguous with Microsoft.SharePoint.Client.File.
    /// </remarks>
    class Program
    {
        // Placeholders: replace with real values before running. Do not commit real secrets.
        static readonly string Url = "https://{any_tenant}.sharepoint.com/sites/ocr-tester";
        static readonly string ClientId = "{client_id_guid}";
        static readonly string ClientSecret = "{client_secret_string}";

        // Title of the list that holds the documents to process.
        const string ListTitle = "Documents";

        // Internal name of the field that receives the OCR text.
        const string OcrFieldName = "DocumentTextFromOCR";

        /// <summary>
        /// Entry point. Expects the ID of the list item to process as the first argument.
        /// </summary>
        /// <param name="args">Command-line arguments; <c>args[0]</c> is the list item ID.</param>
        static void Main(string[] args)
        {
            // Validate before authenticating so a missing argument does not cost a
            // round trip and does not surface as an IndexOutOfRangeException.
            if (args == null || args.Length == 0 || string.IsNullOrWhiteSpace(args[0]))
            {
                Console.Error.WriteLine("Missing argument: the ID of the list item to process.");
                Environment.ExitCode = 1;
                return;
            }

            string itemId = args[0];

            using (ClientContext ctx = new AuthenticationManager().GetAppOnlyAuthenticatedContext(Url, ClientId, ClientSecret))
            {
                Console.WriteLine(itemId);

                Web web = ctx.Web;
                ctx.Load(web);
                ctx.ExecuteQueryRetry();
                Console.WriteLine(web.Title);

                List list = web.Lists.GetByTitle(ListTitle);
                ListItem item = list.GetItemById(itemId);
                ctx.Load(item);
                ctx.ExecuteQueryRetry();

                using (System.IO.MemoryStream ms = GetListItemFileStream(ctx, item))
                {
                    // GetListItemFileStream returns null when the download failed
                    // (the reason has already been written to stderr).
                    if (ms == null)
                    {
                        Console.Error.WriteLine("Could not download the file for list item " + itemId + "; the item was not updated.");
                        Environment.ExitCode = 1;
                        return;
                    }

                    string text = MakeOcr(ms);

                    item[OcrFieldName] = text;
                    item.Update();
                    ctx.ExecuteQueryRetry();
                }
            }
        }

        /// <summary>
        /// Downloads the file of a list item into an in-memory stream.
        /// </summary>
        /// <param name="ctx">Client context used to execute the queries.</param>
        /// <param name="item">List item whose file is downloaded.</param>
        /// <returns>
        /// A <see cref="System.IO.MemoryStream"/> positioned at 0 that contains the file
        /// content, or <c>null</c> when the download failed. The caller owns the stream.
        /// </returns>
        static System.IO.MemoryStream GetListItemFileStream(ClientContext ctx, ListItem item)
        {
            System.IO.MemoryStream result = null;

            try
            {
                File file = item.File;
                ctx.Load(file);
                ctx.ExecuteQueryRetry();

                ClientResult<System.IO.Stream> stream = file.OpenBinaryStream();
                ctx.ExecuteQueryRetry();

                result = new System.IO.MemoryStream();

                // Dispose the source stream once its content has been copied.
                using (System.IO.Stream source = stream.Value)
                {
                    source.CopyTo(result);
                }

                result.Position = 0;
                return result;
            }
            catch (Exception ex)
            {
                // Keep the "null on failure" contract, but report the cause instead of
                // swallowing it, and never hand back a partially filled buffer.
                Console.Error.WriteLine("Failed to download the list item's file: " + ex.Message);

                if (result != null)
                {
                    result.Dispose();
                }

                return null;
            }
        }

        /// <summary>
        /// Runs OCR over a PDF held in memory and returns the recognized text.
        /// </summary>
        /// <param name="stream">Stream containing the PDF; must not be <c>null</c>.</param>
        /// <returns>The <c>Text</c> of the OCR result for the whole PDF.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is <c>null</c>.</exception>
        static string MakeOcr(System.IO.MemoryStream stream)
        {
            if (stream == null)
            {
                throw new ArgumentNullException(nameof(stream));
            }

            // All optional image clean-up steps are switched off and the Fast strategy
            // is selected; barcode reading stays enabled.
            var ocr = new AdvancedOcr()
            {
                CleanBackgroundNoise = false,
                ColorDepth = 4,
                ColorSpace = AdvancedOcr.OcrColorSpace.Color,
                EnhanceContrast = false,
                DetectWhiteTextOnDarkBackgrounds = false,
                RotateAndStraighten = false,
                // Optional: uncomment to pin the OCR language explicitly.
                // Language = IronOcr.Languages.English.OcrLanguagePack,
                EnhanceResolution = false,
                InputImageType = AdvancedOcr.InputTypes.Document,
                ReadBarCodes = true,
                Strategy = AdvancedOcr.OcrStrategy.Fast
            };

            var results = ocr.ReadPdf(stream);
            return results.Text;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment