Skip to content

Instantly share code, notes, and snippets.

@seankearney
Last active November 12, 2016 02:08
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save seankearney/6941029 to your computer and use it in GitHub Desktop.
Save seankearney/6941029 to your computer and use it in GitHub Desktop.
Simple Sitecore/ASP.NET page to check which IFilters are known to the Sitecore system. It will also attempt to locate a media library item of each type and read text out of that file. This should help troubleshoot why some content inside media library items is not being indexed.
<%@ Page Language="C#" %>
<script runat="server">
// Page entry point: binds the IFilter registration table for every extension listed below,
// then reads a few characters from a sample Word document through FilterReader and writes
// them to the response as a quick smoke test.
protected void Page_Load(object sender, EventArgs e)
{
    // These are the extensions that Sitecore 7 supports indexing content for
    // See: Sitecore.ContentSearch.ComputedFields.MediaItemContentExtractor
    string[] extensions = new[]{ ".pdf",
        ".html",
        ".rtf",
        ".odt",
        ".doc",
        ".dot",
        ".docx",
        ".dotx",
        ".docm",
        ".dotm",
        ".xls",
        ".xlt",
        ".xla",
        ".xlsx",
        ".xltx",
        ".xlsm",
        ".xltm",
        ".xlam",
        ".xlsb",
        ".ppt",
        ".pot",
        ".pps",
        ".ppa",
        ".pptx",
        ".potx",
        ".ppsx",
        ".ppam",
        ".pptm",
        ".potm",
        ".ppsm"};

    // Materialize once so the (deferred) iterator runs exactly one time before binding.
    System.Collections.Generic.List<IFilterRegistration> registrations = GetRegistrations(extensions).ToList();
    InstalledFilters.DataSource = registrations;
    InstalledFilters.DataBind();

    // Try the FilterReader class: read a few characters from a sample document.
    char[] chars = new char[10];
    int charsRead;
    using (Sitecore.ContentSearch.Extracters.IFilterTextExtraction.FilterReader reader = new Sitecore.ContentSearch.Extracters.IFilterTextExtraction.FilterReader(Server.MapPath("~/temp/Word-2010.docx")))
    {
        charsRead = reader.Read(chars, 0, 5);
    }

    // Only emit the characters that were actually read; the rest of the buffer is still
    // '\0' and must not be written into the page. The text comes from a file, so encode it.
    string sample = new string(chars, 0, Math.Max(charsRead, 0));
    Response.Write(Server.HtmlEncode(sample));
}
// Builds one IFilterRegistration per extension. For each extension this:
//   1. asks Sitecore's non-public FilterLoader.GetFilterDllAndClass for the IFilter DLL and class,
//   2. loads that IFilter through the non-public FilterLoader.LoadFilterFromDll,
//   3. looks up a sample item via GetMediaItem and runs MediaItemIFilterTextExtractor on it.
// This is an iterator, so nothing runs (and nothing throws) until the result is enumerated.
// Throws InvalidOperationException when FilterLoader or its methods cannot be found via
// reflection, and Exception("ifilter == null") when no IFilter could be loaded for an extension.
protected System.Collections.Generic.IEnumerable<IFilterRegistration> GetRegistrations(params string[] extentions)
{
    const System.Reflection.BindingFlags privateStatic = System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static;

    System.Type filterLoader = System.Type.GetType("Sitecore.ContentSearch.Extracters.IFilterTextExtraction.FilterLoader, Sitecore.ContentSearch");
    if (filterLoader == null)
    {
        throw new InvalidOperationException("Type Sitecore.ContentSearch.Extracters.IFilterTextExtraction.FilterLoader could not be loaded from Sitecore.ContentSearch.");
    }

    // private static bool GetFilterDllAndClass(string ext, out string dllName, out string filterPersistClass)
    System.Reflection.MethodInfo dynMethod = filterLoader.GetMethod("GetFilterDllAndClass", privateStatic);
    // Looked up once here instead of once per extension.
    System.Reflection.MethodInfo loadFilterFromDllMethod = filterLoader.GetMethod("LoadFilterFromDll", privateStatic);
    if (dynMethod == null || loadFilterFromDllMethod == null)
    {
        throw new InvalidOperationException("FilterLoader.GetFilterDllAndClass / FilterLoader.LoadFilterFromDll could not be found via reflection.");
    }

    foreach (string extension in extentions)
    {
        // Slots 1 and 2 are the 'out' parameters. MethodInfo.Invoke writes their values back
        // into this array, never into local variables, so they must be read from args.
        object[] args = new object[] { extension, null, null };
        dynMethod.Invoke(null, args);
        string dllName = args[1] as string;
        string filterPersistClass = args[2] as string;

        Sitecore.ContentSearch.Extracters.IFilterTextExtraction.IFilter ifilter = loadFilterFromDllMethod.Invoke(null, new object[] { dllName, filterPersistClass }) as Sitecore.ContentSearch.Extracters.IFilterTextExtraction.IFilter;
        if (ifilter == null)
        {
            throw new Exception("ifilter == null");
        }

        var extractor = new Sitecore.ContentSearch.ComputedFields.MediaItemIFilterTextExtractor();

        Sitecore.Data.Items.Item item = GetMediaItem(extension);
        string content = "";
        string sampleItemPath = "Item not found.";
        if (item != null)
        {
            sampleItemPath = item.Paths.Path;
            var indexableItem = (Sitecore.ContentSearch.SitecoreIndexableItem)item;
            try
            {
                object value = extractor.ComputeFieldValue(indexableItem);
                content = (value ?? "").ToString();
            }
            catch (Exception e)
            {
                // Report the extraction failure in the table instead of aborting the whole page.
                content = string.Concat("Exception trying to execute: ", e.Message);
            }
        }

        yield return new IFilterRegistration
        {
            Extension = extension,
            DLL = dllName ?? "",
            FilterPersistClass = filterPersistClass ?? "",
            SampleItem = sampleItemPath,
            SampleContent = content
        };
    }
}
// Searches the "sitecore_master_index" index for the first result whose "Extension" field
// equals the given extension (leading dots stripped, case-insensitive comparison) and
// returns the corresponding item, or null when nothing matches.
// Note: the query is not restricted to the media library path.
protected Sitecore.Data.Items.Item GetMediaItem(string extension)
{
    string bareExtension = extension.TrimStart('.');
    var masterIndex = Sitecore.ContentSearch.ContentSearchManager.GetIndex("sitecore_master_index");

    using (var searchContext = masterIndex.CreateSearchContext())
    {
        Sitecore.ContentSearch.SearchTypes.SearchResultItem firstHit = searchContext
            .GetQueryable<Sitecore.ContentSearch.SearchTypes.SearchResultItem>()
            .Where(candidate => string.Equals(candidate["Extension"], bareExtension, StringComparison.OrdinalIgnoreCase))
            .FirstOrDefault();

        return firstHit == null ? null : firstHit.GetItem();
    }
}
// One row of the "Known IFilters" table: the filter resolved for a file extension plus
// the result of a trial text extraction against a sample item.
protected class IFilterRegistration
{
    // File extension this row describes, e.g. ".pdf".
    public string Extension { get; set; }

    // DLL name reported for the extension's IFilter (empty when none was reported).
    public string DLL { get; set; }

    // Filter persist class reported for the extension's IFilter (empty when none was reported).
    public string FilterPersistClass { get; set; }

    // Path of the sample item used for the trial extraction, or a "not found" message.
    public string SampleItem { get; set; }

    // Text extracted from the sample item, or the error message if extraction failed.
    public string SampleContent { get; set; }
}
</script>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
    <title></title>
</head>
<body>
    <form id="form1" runat="server">
        <div>
            ContentSearchConfigurationSettings.MediaIndexingFolder = <%=Sitecore.ContentSearch.Utilities.ContentSearchConfigurationSettings.MediaIndexingFolder %>
            <h1>Known IFilters</h1>
            <%-- One row per extension, bound in Page_Load. Every value is HTML-encoded because
                 it comes from the registry, item paths or extracted document text and may
                 contain markup characters. --%>
            <table border="1">
                <asp:Repeater ID="InstalledFilters" runat="server">
                    <ItemTemplate>
                        <tr>
                            <td><%# Server.HtmlEncode(Convert.ToString(Eval("Extension"))) %></td>
                            <td><%# Server.HtmlEncode(Convert.ToString(Eval("DLL"))) %></td>
                            <td><%# Server.HtmlEncode(Convert.ToString(Eval("FilterPersistClass"))) %></td>
                            <td><%# Server.HtmlEncode(Convert.ToString(Eval("SampleItem"))) %></td>
                            <td><%# Server.HtmlEncode(Convert.ToString(Eval("SampleContent"))) %></td>
                        </tr>
                    </ItemTemplate>
                </asp:Repeater>
            </table>
        </div>
    </form>
</body>
</html>
@gitisz
Copy link

gitisz commented Apr 16, 2015

How are you instantiating Sitecore.ContentSearch.Extracters.IFilterTextExtraction.IFilter? This is a protected class in 7.5, also in 7.1. Thanks!

@zkniebel
Copy link

You need to break up the gist a little: make an aspx page that inherits a class with the contents of the <script> tag as its body. Put the class in a project that compiles with the assembly name "Sitecore.ContentSearch.UnitTests". This way, the internal types will be visible to your project.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment