Skip to content

Instantly share code, notes, and snippets.

@biapar
Forked from DavidVeksler/ImportController.cs
Created October 22, 2017 19:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save biapar/5cb02a0b97f7e3b0a9f66d9e73c61019 to your computer and use it in GitHub Desktop.
Save biapar/5cb02a0b97f7e3b0a9f66d9e73c61019 to your computer and use it in GitHub Desktop.
How FEE digitized and shared 70 years of archives on the Web: an Umbraco case study
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Web.Mvc;
using Archive.FEE.Web.Helper.PDFParser;
using Umbraco.Core;
using Umbraco.Core.Logging;
using Umbraco.Core.Models;
using Umbraco.Core.Services;
using Umbraco.Web;
using Umbraco.Web.Mvc;
namespace Archive.FEE.Web.App_Code.Controllers
{
/// <summary>
/// Surface controller that mirrors a media folder tree into the content tree:
/// every media "Folder" becomes (or is matched to) a "category" content node and
/// every other media item is parsed as a PDF and stored as a "Document" node.
/// </summary>
public class ImportController : SurfaceController
{
    /// <summary>
    /// Id of the content node that holds correspondence. Documents created directly
    /// under it get "correspondenceFrom"/"correspondenceTo" instead of author fields.
    /// </summary>
    private const int CorrespondenceNodeId = 4822;

    private static readonly IContentService ContentService = ApplicationContext.Current.Services.ContentService;

    /// <summary>Plain-text progress report returned to the caller of <see cref="Index"/>.</summary>
    private readonly StringBuilder output = new StringBuilder();

    // GET: Import
    // http://local.history.fee.org/umbraco/surface/import/index
    /// <summary>
    /// Runs the import for the media folder given by the "mediaid" query-string value
    /// into the content node given by "contentid", and returns the progress report.
    /// </summary>
    /// <returns>
    /// The report wrapped in a &lt;pre&gt; element, or HTTP 400 when either id is
    /// missing or not an integer.
    /// </returns>
    // NOTE(review): this GET request creates content and no authorization attribute is
    // visible on it here - confirm that access is restricted elsewhere.
    public ActionResult Index()
    {
        int mediaNode;
        int contentNode;
        if (!int.TryParse(Request.QueryString["mediaid"], out mediaNode) ||
            !int.TryParse(Request.QueryString["contentid"], out contentNode))
        {
            return new HttpStatusCodeResult(400, "mediaid and contentid query string values must be integers");
        }

        AddDocumentsForSpecifiedMediaFolder(mediaNode, contentNode);

        // The report contains file names and exception text, so it is encoded before
        // being embedded in markup.
        return Content(string.Format("<pre>{0}</pre>", Server.HtmlEncode(output.ToString())));
    }

    /// <summary>
    /// Recursively imports the children of one media node into one content node.
    /// Sub-folders are processed first, then the files of this level in reverse order.
    /// </summary>
    /// <param name="mediaNode">Id of the media node whose children are imported.</param>
    /// <param name="contentNode">Id of the content node that receives the new nodes.</param>
    private void AddDocumentsForSpecifiedMediaFolder(int mediaNode, int contentNode)
    {
        var mediaParent = Umbraco.TypedMedia(mediaNode);
        var contentParent = Umbraco.TypedContent(contentNode);
        if (mediaParent == null || contentParent == null)
        {
            output.AppendLine("skipped: media node " + mediaNode + " or content node " + contentNode + " was not found");
            return;
        }

        // Both child lists are snapshotted into locals. Keeping them in a shared field
        // let every recursive call replace the list, so the duplicate checks made after
        // returning from a sub-folder ran against that sub-folder's children.
        var mediaItems = mediaParent.Children.ToList();
        output.AppendLine("media " + mediaItems.Count + Environment.NewLine);
        var existingDocuments = contentParent.Children.ToList();
        output.AppendLine("content " + existingDocuments.Count + Environment.NewLine);

        foreach (var folder in mediaItems.Where(m => m.DocumentTypeAlias == "Folder"))
        {
            Debug.WriteLine(folder.Name);

            int categoryId;
            var existingCategory = existingDocuments.FirstOrDefault(d => d.Name == folder.Name);
            if (existingCategory != null)
            {
                categoryId = existingCategory.Id;
            }
            else
            {
                // create new folder:
                var category = ContentService.CreateContent(folder.Name, contentNode, "category");
                if (!ContentService.SaveAndPublishWithStatus(category).Success)
                {
                    output.AppendLine("could not publish category " + folder.Name);
                    continue;
                }

                Debug.WriteLine("Add media in " + folder.Name);
                categoryId = category.Id;
            }

            output.AppendLine("Add media in " + folder.Name);
            AddDocumentsForSpecifiedMediaFolder(folder.Id, categoryId);
        }

        // Reference numbers already present at this level before the import started;
        // documents created during this run are deliberately not added to the set.
        var knownReferenceNumbers = new HashSet<string>(
            existingDocuments.Select(d => d.GetPropertyValue<string>("docReferenceNumber")));

        foreach (var file in mediaItems.Where(m => m.DocumentTypeAlias != "Folder").Reverse())
        {
            Debug.WriteLine(file.Name);
            output.AppendLine(file.Name);
            output.AppendLine(CreateContentForMediaFile(contentNode, file, knownReferenceNumbers));
        }
    }

    /// <summary>
    /// Parses one media file as a PDF and creates and publishes a "Document" node for it,
    /// unless a document with the same reference number already exists.
    /// </summary>
    /// <param name="parentNodeId">Id of the content node that receives the document.</param>
    /// <param name="file">Media item pointing at the PDF.</param>
    /// <param name="existingReferenceNumbers">
    /// "docReferenceNumber" values of the documents already under the parent node.
    /// </param>
    /// <returns>A one-line status message for the progress report; never throws.</returns>
    private string CreateContentForMediaFile(int parentNodeId, IPublishedContent file,
        ICollection<string> existingReferenceNumbers)
    {
        IContent document = null;
        try
        {
            var meta = PDFParser.GetFileMetadata(file.Url);

            // check if this document was already added:
            if (existingReferenceNumbers.Contains(meta.DocReferenceNumber.ToString()))
            {
                Debug.WriteLine("document already exists");
                return "document already exists: " + file.Name;
            }

            document = SetDocumentProperties(parentNodeId, file, meta);
            var published = ContentService.SaveAndPublishWithStatus(document, 0, false);
            if (!published.Success)
            {
                // Report the failure instead of claiming the document was saved and published.
                LogHelper.Warn<ImportController>("saved but not published " + meta.Title);
                return "saved but not published: " + meta.Title + Environment.NewLine;
            }

            Debug.WriteLine("saved" + meta.Title);
            LogHelper.Info<ImportController>("created " + meta.Title);
            return "saved" + meta.Title + Environment.NewLine;
        }
        catch (Exception ex)
        {
            Debug.WriteLine(ex);
            LogHelper.Error<ImportController>("error parsing PDF: ", ex);

            // Only a document that actually reached the database needs cleaning up;
            // when parsing failed there is no document at all.
            if (document != null && document.HasIdentity)
            {
                try
                {
                    ContentService.Delete(document);
                }
                catch (Exception ex2)
                {
                    Debug.WriteLine(ex2);
                    LogHelper.Error<ImportController>("error removing partially imported document: ", ex2);
                }
            }

            return "no file added:" + ex;
        }
    }

    /// <summary>
    /// Creates an unsaved "Document" content item under <paramref name="parentNodeId"/>
    /// and copies the PDF metadata onto its properties.
    /// </summary>
    /// <param name="parentNodeId">Id of the parent content node.</param>
    /// <param name="file">Media item the document links to through "mainDocument".</param>
    /// <param name="meta">Metadata extracted from the PDF.</param>
    /// <returns>The populated content item; the caller is responsible for saving it.</returns>
    private static IContent SetDocumentProperties(int parentNodeId, IPublishedContent file, PDFFileMeta meta)
    {
        var document = ContentService.CreateContent(meta.Title, parentNodeId, "Document");

        // Fall back to the PDF creation date when no publication date was parsed.
        document.SetValue("Date", meta.DateOfPublication != DateTime.MinValue ? meta.DateOfPublication : meta.Created);

        var authors = meta.Authors;
        if (authors != null && authors.Length > 0)
        {
            if (parentNodeId == CorrespondenceNodeId)
            {
                // if correspondence: first author is the sender, second the recipient
                document.SetValue("correspondenceFrom", authors[0]);
                if (authors.Length > 1)
                {
                    document.SetValue("correspondenceTo", authors[1]);
                }
            }
            else
            {
                document.SetValue("author", authors[0]);
                if (authors.Length > 1)
                {
                    document.SetValue("author2", authors[1]);
                }

                if (authors.Length > 2)
                {
                    document.SetValue("author3", authors[2]);
                }
            }
        }

        document.SetValue("originalFile", meta.OriginalFileName);
        document.SetValue("docReferenceNumber", meta.DocReferenceNumber);
        document.SetValue("mainDocument", file.Id);
        document.SetValue("documentHTML", meta.DocumentHTML);
        document.SetValue("publisher", meta.Publisher);
        document.SetValue("Comments", meta.Comments);
        document.SetValue("categoryValue", meta.Category);
        document.SetValue("typeOfDocument", meta.CategorySelect());
        document.CreateDate = meta.Created;
        document.UpdateDate = meta.Modified;
        return document;
    }
}
}
using System;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using Umbraco.Core;
using Umbraco.Core.IO;
namespace Archive.FEE.Web.Helper.PDFParser
{
/// <summary>
/// Keys of the PDF Info-dictionary entries that the importer reads.
/// </summary>
public class PDFProperties
{
    public const string Title = "Title";

    // Same key as Category; both names are kept so existing references keep compiling.
    public const string TypeofDocument = "Category";

    public const string Date = "Date of Publication";
    public const string Author = "Author";
    public const string OriginalFile = "Original File";
    public const string Publisher = "Publisher";
    public const string Comments = "Comments";

    // Was a mutable public static field; made a constant like its siblings so the key
    // cannot be reassigned at runtime.
    public const string Category = "Category";
}
/// <summary>
/// Metadata extracted from one PDF file, as consumed by the import controller.
/// </summary>
public class PDFFileMeta
{
    // Author names taken from the PDF "Author" entry; null when the entry is absent.
    public string[] Authors;

    // Stays DateTime.MinValue when no publication date could be parsed.
    public DateTime DateOfPublication;

    public string Subject;
    public string Title;
    public string Publisher { get; set; }
    public int DocReferenceNumber { get; set; }
    public string Comments { get; set; }
    public string DocumentHTML { get; set; }
    public DateTime Created { get; set; }
    public string OriginalFileName { get; set; }
    public DateTime Modified { get; set; }
    public string Category { get; set; }

    /// <summary>
    /// Maps <see cref="Category"/> to a numeric document-type code.
    /// </summary>
    /// <returns>
    /// 0 for "Correspondence", 1 for "FEE_Publication", 2 for "Personal Files",
    /// 3 for either spelling of the Leonard Read journal, 4 for "Non-FEE Publication",
    /// and -1 for any other value, including null.
    /// </returns>
    public int CategorySelect()
    {
        // Every case returns directly; the former break statements were unreachable.
        switch (Category)
        {
            case "Correspondence":
                return 0;
            case "FEE_Publication":
                return 1;
            case "Personal Files":
                return 2;
            case "Leonard Read Journal":
            case "Leonard E. Read Journal":
                return 3;
            case "Non-FEE Publication":
                return 4;
            default:
                return -1;
        }
    }
}
/// <summary>
/// Reads metadata and text out of PDF files stored in the media file system.
/// </summary>
public static class PDFParser
{
    private static readonly MediaFileSystem Media =
        FileSystemProviderManager.Current.GetFileSystemProvider<MediaFileSystem>();

    /// <summary>
    /// Builds a <see cref="PDFFileMeta"/> for the PDF behind a media URL.
    /// </summary>
    /// <param name="url">Media URL of the PDF; resolved to a path through the media file system.</param>
    /// <returns>The populated metadata, including up to 100000 characters of extracted text.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="url"/> is null or blank, or the PDF has no title.
    /// </exception>
    /// <exception cref="FileNotFoundException">The resolved file does not exist.</exception>
    public static PDFFileMeta GetFileMetadata(string url)
    {
        if (string.IsNullOrWhiteSpace(url))
        {
            // The single-argument constructor takes a parameter name, not a message.
            throw new ArgumentNullException(nameof(url), "file URL is missing");
        }

        var filePath = Media.GetFullPath(url);
        if (!File.Exists(filePath))
        {
            throw new FileNotFoundException("File does not exist:" + filePath, filePath);
        }

        ReadPDFInfo(filePath);
        var reader = new MetaDataReader(filePath);
        var fileInfo = new FileInfo(filePath);

        // Validate the title before the costly text extraction below.
        var title = reader.ReadEntry(PDFProperties.Title);
        if (string.IsNullOrWhiteSpace(title))
        {
            throw new ArgumentNullException("Title", "Title for " + filePath);
        }

        var meta = new PDFFileMeta
        {
            Title = title,
            Authors = SplitAuthors(reader.ReadEntry(PDFProperties.Author)),
            Publisher = reader.ReadEntry(PDFProperties.Publisher),
            Comments = reader.ReadEntry(PDFProperties.Comments),
            Category = reader.ReadEntry(PDFProperties.Category),
            DocumentHTML = GetTextFromAllPages(filePath)?.Truncate(100000),
            Created = reader.Created(),
            Modified = reader.Modified(),
            OriginalFileName = fileInfo.Name
        };

        // The reference number is the part of the file name before the first '-';
        // it stays 0 when that part is not an integer.
        int refNum;
        int.TryParse(fileInfo.Name.Split('-').FirstOrDefault(), out refNum);
        meta.DocReferenceNumber = refNum;

        // NOTE(review): parsed with the current culture - confirm the expected date format.
        DateTime dop;
        if (DateTime.TryParse(reader.ReadEntry(PDFProperties.Date), out dop))
        {
            meta.DateOfPublication = dop;
        }

        return meta;
    }

    /// <summary>
    /// Writes every entry of the PDF Info dictionary to the debug output.
    /// </summary>
    /// <param name="path">Path of the PDF file.</param>
    public static void ReadPDFInfo(string path)
    {
        var reader = new PdfReader(path);
        try
        {
            foreach (var entry in reader.Info)
            {
                Debug.WriteLine(entry.Key + ": " + entry.Value);
            }
        }
        finally
        {
            // Release the file handle even when reading the dictionary fails.
            reader.Close();
        }
    }

    /// <summary>
    /// Extracts the text of every page, one page per line group, in page order.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <returns>The concatenated page texts, each followed by a line break.</returns>
    public static string GetTextFromAllPages(string pdfPath)
    {
        var reader = new PdfReader(pdfPath);
        try
        {
            using (var pageText = new StringWriter())
            {
                // PDF page numbers are 1-based.
                for (var page = 1; page <= reader.NumberOfPages; page++)
                {
                    pageText.WriteLine(
                        PdfTextExtractor.GetTextFromPage(reader, page, new SimpleTextExtractionStrategy()));
                }

                return pageText.ToString();
            }
        }
        finally
        {
            reader.Close();
        }
    }

    // Splits a comma-separated author entry into trimmed, non-empty names;
    // returns null when the entry is absent or contains no name.
    private static string[] SplitAuthors(string rawAuthors)
    {
        if (rawAuthors == null)
        {
            return null;
        }

        var names = rawAuthors.Split(',')
            .Select(name => name.Trim())
            .Where(name => name.Length > 0)
            .ToArray();
        return names.Length > 0 ? names : null;
    }
}
/// <summary>
/// Reads the Info dictionary of one PDF file and exposes its entries and dates.
/// </summary>
internal class MetaDataReader
{
    private const string TimestampFormat = "D:yyyyMMddHHmmss";

    // Characters that introduce the time-zone part of a PDF date string.
    private static readonly char[] ZoneMarkers = { '+', '-', 'Z' };

    // Snapshot of the Info dictionary, taken once so the file can be closed immediately.
    private readonly System.Collections.Generic.Dictionary<string, string> _info;

    /// <summary>
    /// Opens the PDF, copies its Info dictionary and closes the file again.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    public MetaDataReader(string pdfPath)
    {
        var reader = new PdfReader(pdfPath);
        try
        {
            _info = new System.Collections.Generic.Dictionary<string, string>(reader.Info);
        }
        finally
        {
            // The reader used to be kept in a field and never closed.
            reader.Close();
        }
    }

    /// <summary>
    /// Returns the Info entry stored under <paramref name="key"/>.
    /// </summary>
    /// <param name="key">Name of the Info entry.</param>
    /// <returns>
    /// The entry value; when it is missing or blank, the value stored under the key
    /// followed by one space instead, which is null when that entry is missing too.
    /// </returns>
    public string ReadEntry(string key)
    {
        string value;
        _info.TryGetValue(key, out value);
        if (string.IsNullOrWhiteSpace(value)) // bad data
        {
            _info.TryGetValue(key + " ", out value);
        }

        return value;
    }

    /// <summary>Returns the parsed "CreationDate" entry.</summary>
    public DateTime Created()
    {
        return ParsePDFDate(ReadRawEntry("CreationDate"));
    }

    /// <summary>Returns the parsed "ModDate" entry.</summary>
    public DateTime Modified()
    {
        return ParsePDFDate(ReadRawEntry("ModDate"));
    }

    // Looks up one Info entry without the trailing-space fallback of ReadEntry.
    private string ReadRawEntry(string key)
    {
        string value;
        _info.TryGetValue(key, out value);
        return value;
    }

    /// <summary>
    /// Parses a date of the form "D:yyyyMMddHHmmss" followed by an optional zone part.
    /// </summary>
    /// <param name="date">Raw date string from the Info dictionary.</param>
    /// <returns>
    /// For a "Z" zone, the result of parsing with the "Z" format specifier; for a
    /// "+" or "-" offset or no zone, the timestamp exactly as written, with the offset
    /// ignored; <see cref="DateTime.Now"/> when the value is blank or cannot be parsed.
    /// </returns>
    private static DateTime ParsePDFDate(string date)
    {
        if (string.IsNullOrWhiteSpace(date))
        {
            return DateTime.Now;
        }

        var provider = System.Globalization.CultureInfo.InvariantCulture;
        var text = date.Trim();

        // Cut the zone part off at its first marker. Splitting on '-' alone sent dates
        // with a "+hh'mm'" offset or a "Z00'00'" suffix to the DateTime.Now fallback.
        var zoneStart = text.IndexOfAny(ZoneMarkers);
        var timestamp = zoneStart >= 0 ? text.Substring(0, zoneStart) : text;
        var isUtc = zoneStart >= 0 && text[zoneStart] == 'Z';

        DateTime parsed;
        var succeeded = isUtc
            ? DateTime.TryParseExact(timestamp + "Z", TimestampFormat + "Z", provider,
                System.Globalization.DateTimeStyles.None, out parsed)
            : DateTime.TryParseExact(timestamp, TimestampFormat, provider,
                System.Globalization.DateTimeStyles.None, out parsed);

        return succeeded ? parsed : DateTime.Now;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment