Skip to content

Instantly share code, notes, and snippets.

@Azadehkhojandi
Created February 5, 2016 05:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Azadehkhojandi/a5913cca1412748e1f20 to your computer and use it in GitHub Desktop.
Save Azadehkhojandi/a5913cca1412748e1f20 to your computer and use it in GitHub Desktop.
HtmlCrawledField computed field - extending Sitecore indexing
//references https://github.com/hermanussen/sitecore-html-crawler
public class HtmlCrawledField : IComputedIndexField
{
/// <summary>
/// Field name associated with this computed field; not read by the methods visible in this class.
/// NOTE(review): presumably the index field name required by IComputedIndexField and set from configuration — confirm.
/// </summary>
public string FieldName { get; set; }
/// <summary>
/// Return type associated with this computed field; not read by the methods visible in this class.
/// NOTE(review): presumably the index value type required by IComputedIndexField and set from configuration — confirm.
/// </summary>
public string ReturnType { get; set; }
/// <summary>
/// Computes the index value for an item by requesting its rendered page over HTTP
/// and extracting the text found inside the page's body element.
/// </summary>
/// <param name="indexable">The indexable being processed; must not be null.</param>
/// <returns>
/// The simplified body text, or null when the indexable is not an item under the content
/// path with a layout, when the page has no body element, or when the request or parsing
/// fails (failures are logged, never rethrown).
/// </returns>
public object ComputeFieldValue(IIndexable indexable)
{
    Assert.ArgumentNotNull(indexable, nameof(indexable));

    // Kept outside the try block so the WebException handler can report which url failed.
    string url = null;
    try
    {
        Item item = indexable as SitecoreIndexableItem;

        // This field only works for items under /sitecore/content that have a layout
        if (item == null
            || item.Visualization.Layout == null
            || !item.Paths.FullPath.StartsWith(
                Sitecore.Constants.ContentPath,
                StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        // Determine the url to request; the item's database is made the context database
        // while the link is generated and is also passed along as the sc_database parameter.
        using (new DatabaseSwitcher(item.Database))
        {
            var itemUrl = LinkManager.GetItemUrl(item, new UrlOptions { AlwaysIncludeServerUrl = true });
            url = WebUtil.AddQueryString(itemUrl, "sc_database", Sitecore.Context.Database.Name);
        }

        // Http request the page
        using (var client = new WebClient())
        {
            // DownloadString falls back to client.Encoding when it cannot detect the charset.
            // The default is the system ANSI code page on .NET Framework, which garbles
            // non-ASCII characters of UTF-8 pages before they reach the index.
            client.Encoding = Encoding.UTF8;
            var pageContent = client.DownloadString(url);

            // Parse the page's html using HtmlAgilityPack
            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(pageContent);

            // Strip out all the html tags, so we can index just the text
            var mainContainer = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
            return mainContainer != null ? GetAllInnerTexts(mainContainer) : null;
        }
    }
    catch (WebException ex)
    {
        // A page that cannot be fetched is expected now and then; warn and index nothing.
        Log.Warn($"Failed to html index {indexable.Id} ({url}): {ex.Message}", ex, this);
    }
    catch (Exception ex)
    {
        // Best effort: any other failure is logged and yields no value for this field.
        Log.Error($"An error occurred when indexing {indexable.Id}: {ex.Message}", ex, this);
    }

    return null;
}
/// <summary>
/// Gathers the inner text of <paramref name="node"/> and of each of its descendants
/// into one simplified string.
/// </summary>
/// <param name="node">Root node whose texts are collected.</param>
/// <returns>
/// The texts joined by single spaces, with platform line breaks replaced by spaces,
/// runs of spaces collapsed, surrounding whitespace trimmed and everything lower-cased
/// using the invariant culture.
/// </returns>
protected virtual string GetAllInnerTexts(HtmlNode node)
{
    // NOTE(review): InnerText of an element presumably already contains the text of its
    // descendants, so the same text may appear once per nesting level — confirm against
    // HtmlAgilityPack before relying on term counts.
    var fragments = from descendant in node.DescendantsAndSelf()
                    select descendant.InnerText.Replace(Environment.NewLine, " ");

    var joined = string.Join(" ", fragments);
    var collapsed = RemoveWhitespace(joined);
    return collapsed.Trim().ToLowerInvariant();
}
//TODO [Aza] :maybe use regex to do that
/// <summary>
/// Storing whitespace for a field that is only to be used for searching in is not very useful.
/// This method collapses every run of consecutive space characters (' ') into a single space.
/// Only the space character is collapsed; tabs and line breaks are copied through unchanged.
/// </summary>
/// <param name="inputStr">Text to simplify; null is treated like an empty string.</param>
/// <returns>The text with each run of spaces reduced to one space; never null.</returns>
private static string RemoveWhitespace(string inputStr)
{
    if (string.IsNullOrEmpty(inputStr))
    {
        return string.Empty;
    }

    // A single pass is enough: the previous version repeated the identical pass five times
    // over the unchanged input, which produced the same result at five times the cost.
    var builder = new StringBuilder(inputStr.Length);
    bool previousWasSpace = false;
    foreach (char c in inputStr)
    {
        bool isSpace = c == ' ';

        // Keep every non-space character, and only the first space of each run.
        if (!isSpace || !previousWasSpace)
        {
            builder.Append(c);
        }

        previousWasSpace = isSpace;
    }

    return builder.ToString();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment