Azadehkhojandi/HtmlCrawledField.cs

## HtmlCrawledField.cs
//references https://github.com/hermanussen/sitecore-html-crawler
public class HtmlCrawledField : IComputedIndexField
    {
        /// <summary>
        /// Gets or sets the name of the field.
        /// NOTE(review): presumably the index field name supplied through the
        /// IComputedIndexField contract; it is not read by the visible code of this class — confirm.
        /// </summary>
        public string FieldName { get; set; }

        /// <summary>
        /// Gets or sets the return type of the field.
        /// NOTE(review): presumably part of the IComputedIndexField contract;
        /// it is not read by the visible code of this class — confirm.
        /// </summary>
        public string ReturnType { get; set; }

        /// <summary>
        /// Produces the value for this field by requesting the item's rendered page over HTTP
        /// and extracting the text found inside the page's <c>body</c> element.
        /// </summary>
        /// <param name="indexable">The indexable being processed; must not be null.</param>
        /// <returns>
        /// The text returned by <see cref="GetAllInnerTexts"/> for the page body, or <c>null</c> when
        /// the indexable is not an eligible item, the page has no <c>body</c> element, or an
        /// exception was caught and logged.
        /// </returns>
        public object ComputeFieldValue(IIndexable indexable)
        {
            Assert.ArgumentNotNull(indexable, "indexable");
            string url = null;
            try
            {
                Item contentItem = indexable as SitecoreIndexableItem;

                // This field only applies to items that have a layout and live under
                // Sitecore.Constants.ContentPath; everything else yields no value.
                if (contentItem == null)
                {
                    return null;
                }

                if (contentItem.Visualization.Layout == null)
                {
                    return null;
                }

                bool isUnderContent = contentItem.Paths.FullPath.StartsWith(
                    Sitecore.Constants.ContentPath,
                    StringComparison.InvariantCultureIgnoreCase);
                if (!isUnderContent)
                {
                    return null;
                }

                // Build the absolute url inside a DatabaseSwitcher scope for the item's database,
                // and pass that database along in the sc_database query string parameter.
                using (new DatabaseSwitcher(contentItem.Database))
                {
                    var linkOptions = new UrlOptions { AlwaysIncludeServerUrl = true };
                    string itemUrl = LinkManager.GetItemUrl(contentItem, linkOptions);
                    url = WebUtil.AddQueryString(itemUrl, "sc_database", Sitecore.Context.Database.Name);
                }

                // Fetch the rendered page.
                using (var webClient = new WebClient())
                {
                    string html = webClient.DownloadString(url);

                    // Parse the markup with HtmlAgilityPack.
                    var document = new HtmlDocument();
                    document.LoadHtml(html);

                    // Only the text inside <body> is indexed; the tags themselves are dropped.
                    HtmlNode body = document.DocumentNode.Descendants("body").FirstOrDefault();
                    if (body == null)
                    {
                        return null;
                    }

                    return GetAllInnerTexts(body);
                }
            }
            catch (WebException ex)
            {
                // A failed page request is logged as a warning; the field is left empty.
                Log.Warn($"Failed to html index {indexable.Id} ({url}): {ex.Message}", ex, this);
            }
            catch (Exception ex)
            {
                Log.Error($"An error occurred when indexing {indexable.Id}: {ex.Message}", ex, this);
            }
            return null;
        }

        /// <summary>
        /// Collects the text found under <paramref name="node"/> into one simplified string:
        /// line breaks become spaces, runs of spaces are collapsed, the result is trimmed
        /// and converted to lower case (invariant culture).
        /// </summary>
        /// <remarks>
        /// Only text nodes are read. Taking <c>InnerText</c> from every descendant element as well
        /// would repeat the same text once per enclosing ancestor and would fuse the text of
        /// adjacent elements without a separator.
        /// </remarks>
        /// <param name="node">The root node whose text should be extracted; must not be null.</param>
        /// <returns>The simplified, lower-cased text of all text nodes under <paramref name="node"/>.</returns>
        protected virtual string GetAllInnerTexts(HtmlNode node)
        {
            var textFragments = node.DescendantsAndSelf()
                .Where(d => d.NodeType == HtmlNodeType.Text)
                .Select(d => d.InnerText.Replace(Environment.NewLine, " "));

            // Joining with a space keeps the text of neighbouring elements apart.
            return RemoveWhitespace(string.Join(" ", textFragments)).Trim().ToLowerInvariant();
        }


        // TODO [Aza]: consider replacing this with a regex.
        /// <summary>
        /// Collapses every run of consecutive space characters in <paramref name="inputStr"/>
        /// into a single space. Storing excessive whitespace for a field that is only used
        /// for searching is not useful.
        /// </summary>
        /// <remarks>
        /// Only the space character (' ') is collapsed; tabs and line breaks are copied through
        /// unchanged. Leading and trailing runs are reduced to one space, not removed.
        /// </remarks>
        /// <param name="inputStr">The text to compact; may be null or empty.</param>
        /// <returns>The compacted text, or an empty string when the input is null or empty.</returns>
        private static string RemoveWhitespace(string inputStr)
        {
            if (string.IsNullOrEmpty(inputStr))
            {
                return string.Empty;
            }

            // One pass is sufficient: the result does not change when the scan is repeated.
            var compacted = new StringBuilder(inputStr.Length);
            bool previousWasSpace = false;
            foreach (char c in inputStr)
            {
                bool isSpace = c == ' ';

                // Keep the first space of a run and every non-space character.
                if (!isSpace || !previousWasSpace)
                {
                    compacted.Append(c);
                }

                previousWasSpace = isSpace;
            }
            return compacted.ToString();
        }
	//references https://github.com/hermanussen/sitecore-html-crawler
	public class HtmlCrawledField : IComputedIndexField
	{
	/// <summary>
	/// Gets or sets the name of the field.
	/// NOTE(review): presumably the index field name supplied through the
	/// IComputedIndexField contract; it is not read by the visible code of this class — confirm.
	/// </summary>
	public string FieldName { get; set; }

	/// <summary>
	/// Gets or sets the return type of the field.
	/// NOTE(review): presumably part of the IComputedIndexField contract;
	/// it is not read by the visible code of this class — confirm.
	/// </summary>
	public string ReturnType { get; set; }

	/// <summary>
	/// Produces the value for this field by requesting the item's rendered page over HTTP
	/// and extracting the text found inside the page's <c>body</c> element.
	/// </summary>
	/// <param name="indexable">The indexable being processed; must not be null.</param>
	/// <returns>
	/// The text returned by <see cref="GetAllInnerTexts"/> for the page body, or <c>null</c> when
	/// the indexable is not an eligible item, the page has no <c>body</c> element, or an
	/// exception was caught and logged.
	/// </returns>
	public object ComputeFieldValue(IIndexable indexable)
	{
		Assert.ArgumentNotNull(indexable, "indexable");
		string url = null;
		try
		{
			Item item = indexable as SitecoreIndexableItem;

			// This field only works for items under Sitecore.Constants.ContentPath that have a layout.
			if (item == null
				|| item.Visualization.Layout == null
				|| !item.Paths.FullPath.StartsWith(
					Sitecore.Constants.ContentPath,
					StringComparison.InvariantCultureIgnoreCase))
			{
				return null;
			}

			// Determine the url to request: an absolute item url that carries the
			// item's database in the sc_database query string parameter.
			using (new DatabaseSwitcher(item.Database))
			{
				url = WebUtil.AddQueryString(
					LinkManager.GetItemUrl(item, new UrlOptions { AlwaysIncludeServerUrl = true }),
					"sc_database", Sitecore.Context.Database.Name);
			}

			// Http request the page.
			using (var client = new WebClient())
			{
				var pageContent = client.DownloadString(url);

				// Parse the page's html using HtmlAgilityPack.
				var htmlDocument = new HtmlDocument();
				htmlDocument.LoadHtml(pageContent);

				// Strip out all the html tags, so we can index just the text of <body>.
				var mainContainer = htmlDocument.DocumentNode.Descendants("body").FirstOrDefault();
				var content = mainContainer != null ? GetAllInnerTexts(mainContainer) : null;
				return content;
			}
		}
		catch (WebException ex)
		{
			// A failed page request is logged as a warning; the field is left empty.
			Log.Warn($"Failed to html index {indexable.Id} ({url}): {ex.Message}", ex, this);
		}
		catch (Exception ex)
		{
			Log.Error($"An error occurred when indexing {indexable.Id}: {ex.Message}", ex, this);
		}
		return null;
	}

	/// <summary>
	/// Collects the text found under <paramref name="node"/> into one simplified string:
	/// line breaks become spaces, runs of spaces are collapsed, the result is trimmed
	/// and converted to lower case (invariant culture).
	/// </summary>
	/// <remarks>
	/// Only text nodes are read. Taking <c>InnerText</c> from every descendant element as well
	/// would repeat the same text once per enclosing ancestor and would fuse the text of
	/// adjacent elements without a separator.
	/// </remarks>
	/// <param name="node">The root node whose text should be extracted; must not be null.</param>
	/// <returns>The simplified, lower-cased text of all text nodes under <paramref name="node"/>.</returns>
	protected virtual string GetAllInnerTexts(HtmlNode node)
	{
		var textFragments = node.DescendantsAndSelf()
			.Where(d => d.NodeType == HtmlNodeType.Text)
			.Select(d => d.InnerText.Replace(Environment.NewLine, " "));

		// Joining with a space keeps the text of neighbouring elements apart.
		return RemoveWhitespace(string.Join(" ", textFragments)).Trim().ToLowerInvariant();
	}


	// TODO [Aza]: consider replacing this with a regex.
	/// <summary>
	/// Collapses every run of consecutive space characters in <paramref name="inputStr"/>
	/// into a single space. Storing excessive whitespace for a field that is only used
	/// for searching is not useful.
	/// </summary>
	/// <remarks>
	/// Only the space character (' ') is collapsed; tabs and line breaks are copied through
	/// unchanged. Leading and trailing runs are reduced to one space, not removed.
	/// </remarks>
	/// <param name="inputStr">The text to compact; may be null or empty.</param>
	/// <returns>The compacted text, or an empty string when the input is null or empty.</returns>
	private static string RemoveWhitespace(string inputStr)
	{
		if (string.IsNullOrEmpty(inputStr))
		{
			return string.Empty;
		}

		// One pass is sufficient: the result does not change when the scan is repeated.
		var compacted = new StringBuilder(inputStr.Length);
		bool previousWasSpace = false;
		foreach (char c in inputStr)
		{
			bool isSpace = c == ' ';

			// Keep the first space of a run and every non-space character.
			if (!isSpace || !previousWasSpace)
			{
				compacted.Append(c);
			}

			previousWasSpace = isSpace;
		}
		return compacted.ToString();
	}