Skip to content

Instantly share code, notes, and snippets.

@lars-erik
Created May 8, 2020 17:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lars-erik/ddc6a5c03878b293650d54212b3c8c4b to your computer and use it in GitHub Desktop.
Save lars-erik/ddc6a5c03878b293650d54212b3c8c4b to your computer and use it in GitHub Desktop.
Old-fashioned free PDF indexing with Umbraco
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using System.Xml.Linq;
using Examine;
using Examine.LuceneEngine.Providers;
using Lucene.Net.Analysis;
using Microsoft.WindowsAzure.Storage.Auth;
using Microsoft.WindowsAzure.Storage.Blob;
using MyCustomer.Core.Infrastructure;
using MyCustomer.Core.Magazine;
using MyCustomer.Core.Queries;
using Directory = Lucene.Net.Store.Directory;
namespace MySite.Backoffice.Magazine
{
/// <summary>
/// Lucene indexer that makes magazine PDFs searchable.
/// Every magazine returned by the <see cref="MagazineQuery"/> handler is turned into a
/// <c>&lt;url id="YYYYMM"&gt;/path/to.pdf&lt;/url&gt;</c> node; when the node is indexed the
/// PDF is downloaded from blob storage and its extracted text is stored in the "body" field.
/// </summary>
public class MagazineIndexer : LuceneIndexer
{
    /// <summary>Root of the blob storage account; node values are appended to it verbatim.</summary>
    private const string BlobBaseUrl = "https://mycustomer.blob.core.windows.net";

    // Assigned by the explicit constructor only; not read anywhere in this class.
    private readonly Directory directory;
    private readonly IQueryHandler<MagazineQuery> queryHandler;

    // NOTE(review): the account name and key are hard-coded here. They look like
    // placeholders; real credentials should be loaded from configuration, not source.
    private readonly CloudBlobClient cloudBlobClient = new CloudBlobClient(
        new Uri(BlobBaseUrl),
        new StorageCredentials(
            "storagename",
            "secret"
        )
    );

    private readonly PDFParser pdfParser = new PDFParser();

    /// <summary>
    /// Parameterless constructor; resolves the query handler from the application container.
    /// </summary>
    public MagazineIndexer()
    {
        queryHandler = Container.Instance.Resolve<IQueryHandler<MagazineQuery>>();
    }

    /// <summary>
    /// Creates the indexer with explicit dependencies.
    /// </summary>
    /// <param name="directory">Lucene directory handed to the base indexer.</param>
    /// <param name="queryHandler">Handler used to fetch the magazines to index.</param>
    /// <param name="analyzer">Analyzer handed to the base indexer.</param>
    /// <param name="async">Passed through to the base indexer unchanged.</param>
    public MagazineIndexer(Directory directory, IQueryHandler<MagazineQuery> queryHandler, Analyzer analyzer, bool async)
        : base(
            new IndexCriteria(
                Enumerable.Empty<IIndexField>(),
                Enumerable.Empty<IIndexField>(),
                Enumerable.Empty<string>(),
                Enumerable.Empty<string>(),
                null
            ),
            directory,
            analyzer,
            async
        )
    {
        this.directory = directory;
        this.queryHandler = queryHandler;
    }

    /// <summary>
    /// Fetches all magazines and submits one node per magazine to the index.
    /// </summary>
    /// <param name="type">Not used; every node is added with an empty type.</param>
    protected override void PerformIndexAll(string type)
    {
        // This override is synchronous, so the async query is started via Task.Run
        // and then blocked on.
        var result = (IEnumerable<Magazine>)Task.Run(async () => await queryHandler.Execute(new MagazineQuery())).Result;

        var nodes = result.Select(x =>
        {
            // The id packs year and month into one number (year * 100 + month);
            // GetDataToIndex slices it apart again to build the edition label.
            var xElement = new XElement("url", x.PDF);
            xElement.Add(new XAttribute("id", Convert.ToInt32(x.Year) * 100 + Convert.ToInt32(x.Month)));
            return xElement;
        });

        AddNodesToIndex(nodes, "");
    }

    /// <summary>
    /// Rebuilds the index by indexing everything again.
    /// </summary>
    protected override void PerformIndexRebuild()
    {
        IndexAll("");
    }

    /// <summary>
    /// Downloads the PDF referenced by <paramref name="node"/> and returns the fields to index.
    /// </summary>
    /// <param name="node">A "url" element whose value is the blob path and whose "id" attribute is YYYYMM.</param>
    /// <param name="type">Not used.</param>
    /// <returns>
    /// The "type", "edition", "url" and "body" fields, or an empty dictionary when the node
    /// is malformed or the PDF could not be downloaded or parsed.
    /// </returns>
    protected override Dictionary<string, string> GetDataToIndex(XElement node, string type)
    {
        try
        {
            // Derive the edition first so a malformed node fails before any download happens.
            var edition = FormatEdition(node.Attribute("id").Value);

            var pdf = cloudBlobClient.GetBlobReferenceFromServer(new Uri(BlobBaseUrl + node.Value));
            using (var stream = new MemoryStream())
            {
                pdf.DownloadToStream(stream);
                stream.Seek(0, SeekOrigin.Begin);

                // The second argument is a callback that is deliberately a no-op
                // (presumably a per-page error hook; confirm against PDFParser).
                var text = pdfParser.GetTextFromAllPages(stream, (e) => { });

                return new Dictionary<string, string>
                {
                    {"type", "magasin" },
                    {"edition", edition },
                    {"url", node.Value },
                    {"body", text}
                };
            }
        }
        catch (Exception ex)
        {
            // Best effort: one broken PDF must not abort the whole indexing run,
            // but the failure is traced instead of vanishing silently.
            System.Diagnostics.Trace.TraceError(
                "MagazineIndexer: failed to index PDF '{0}': {1}",
                node != null ? node.Value : null,
                ex);
            return new Dictionary<string, string>();
        }
    }

    /// <summary>
    /// Converts a YYYYMM id such as "202005" into the edition label "05 / 2020".
    /// </summary>
    private static string FormatEdition(string id)
    {
        return id.Substring(4, 2) + " / " + id.Substring(0, 4);
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment