Created
February 5, 2016 05:19
-
-
Save Azadehkhojandi/a5913cca1412748e1f20 to your computer and use it in GitHub Desktop.
HtmlCrawledField computed field - extending Sitecore indexing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//references https://github.com/hermanussen/sitecore-html-crawler | |
public class HtmlCrawledField : IComputedIndexField | |
{ | |
public string FieldName { get; set; } | |
public string ReturnType { get; set; } | |
/// <summary>
/// Computes the index value for an item by requesting its rendered page over HTTP
/// and extracting the text found inside the page's body element.
/// </summary>
/// <param name="indexable">The indexable being crawled; must not be null.</param>
/// <returns>
/// The text extracted from the page body, or <c>null</c> when the indexable is not an item
/// with a layout under the content path, when the page has no body element, or when the
/// request or parsing fails (failures are logged, not rethrown).
/// </returns>
public object ComputeFieldValue(IIndexable indexable)
{
    Assert.ArgumentNotNull(indexable, nameof(indexable));

    // Kept outside the try block so the catch handlers can report which url failed.
    string url = null;
    try
    {
        Item item = indexable as SitecoreIndexableItem;

        // This field only works for items under /sitecore/content that have a layout.
        // The path is an identifier, not linguistic text, so compare it ordinally.
        if (item == null
            || item.Visualization.Layout == null
            || !item.Paths.FullPath.StartsWith(
                Sitecore.Constants.ContentPath,
                StringComparison.OrdinalIgnoreCase))
        {
            return null;
        }

        // Determine the url to request; the database name is appended so the page
        // is requested with an explicit sc_database query string parameter.
        using (new DatabaseSwitcher(item.Database))
        {
            url = WebUtil.AddQueryString(
                LinkManager.GetItemUrl(item, new UrlOptions { AlwaysIncludeServerUrl = true }),
                "sc_database",
                Sitecore.Context.Database.Name);
        }

        // Http request the page.
        // UTF-8 is set as the fallback encoding for responses that do not declare a
        // charset; WebClient would otherwise decode them with the system default code page.
        using (var client = new WebClient { Encoding = Encoding.UTF8 })
        {
            var pageContent = client.DownloadString(url);

            // Parse the page's html using HtmlAgilityPack
            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(pageContent);

            // Strip out all the html tags, so we can index just the text
            var mainContainer = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
            return mainContainer != null ? GetAllInnerTexts(mainContainer) : null;
        }
    }
    catch (WebException ex)
    {
        // A page that cannot be fetched is expected now and then; warn and index nothing.
        Log.Warn($"Failed to html index {indexable.Id} ({url}): {ex.Message}", ex, this);
    }
    catch (Exception ex)
    {
        // Deliberate best effort: a single broken page must not abort the indexing run.
        Log.Error($"An error occurred when indexing {indexable.Id}: {ex.Message}", ex, this);
    }

    return null;
}
/// <summary>
/// Gathers the inner text of the given node and of each of its descendants and reduces
/// the result to one simplified string.
/// </summary>
/// <param name="node">The node whose text, together with that of its descendants, is gathered.</param>
/// <returns>
/// The gathered texts joined by single spaces, passed through <c>RemoveWhitespace</c>,
/// trimmed and converted to lower case using the invariant culture.
/// </returns>
protected virtual string GetAllInnerTexts(HtmlNode node)
{
    // NOTE(review): every node contributes its own InnerText, so text nested inside
    // several elements is presumably emitted more than once - confirm this is intended.
    var fragments = node.DescendantsAndSelf()
        .Select(current => current.InnerText.Replace(Environment.NewLine, " "));

    string joined = string.Join(" ", fragments);
    string compacted = RemoveWhitespace(joined);

    return compacted.Trim().ToLowerInvariant();
}
//TODO [Aza] :maybe use regex to do that | |
/// <summary>
/// Storing whitespace for a field that is only to be used for searching in is not very useful.
/// This method collapses every run of consecutive space characters into a single space.
/// </summary>
/// <param name="inputStr">The text to compact; <c>null</c> is treated as an empty string.</param>
/// <returns>
/// The text with each run of spaces reduced to one space. Only the space character (' ')
/// is collapsed; tabs, line breaks and all other characters are copied unchanged.
/// </returns>
private static string RemoveWhitespace(string inputStr)
{
    if (string.IsNullOrEmpty(inputStr))
    {
        return string.Empty;
    }

    // One pass is enough: the output never contains two adjacent spaces.
    var builder = new StringBuilder(inputStr.Length);
    bool previousWasSpace = false;

    foreach (char current in inputStr)
    {
        bool isSpace = current == ' ';

        // Drop a space only when the character before it was a space as well.
        if (!(isSpace && previousWasSpace))
        {
            builder.Append(current);
        }

        previousWasSpace = isSpace;
    }

    return builder.ToString();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment