Skip to content

Instantly share code, notes, and snippets.

@digioz
Last active August 29, 2015 14:24
Show Gist options
  • Save digioz/968d8e347c3b81a0120d to your computer and use it in GitHub Desktop.
Save digioz/968d8e347c3b81a0120d to your computer and use it in GitHub Desktop.
HtmlAgilityPackHelper is a static class that lets you parse a site's HTML for data.
using System;
using System.Collections.Generic;
using HtmlAgilityPack;
using digioz.BO;
namespace digioz.BLL
{
/// <summary>
/// Static helpers built on HtmlAgilityPack for loading an HTML document
/// (from a URL or from an HTML string) and extracting data from it.
/// </summary>
public static class HtmlAgilityPackHelper
{
    /// <summary>
    /// Gets an HTML document, either by downloading it from a URL or by
    /// parsing an HTML string passed in by the caller.
    /// </summary>
    /// <param name="url">
    /// Address to download. Only used when <paramref name="htmlString"/>
    /// is null or empty.
    /// </param>
    /// <param name="htmlString">
    /// Raw HTML to parse. When non-empty it takes precedence over
    /// <paramref name="url"/> and no download is performed.
    /// </param>
    /// <returns>The loaded document.</returns>
    public static HtmlDocument GetDocument(string url = "", string htmlString = "")
    {
        // Treat null like "" so a null htmlString falls back to the URL
        // instead of being handed to LoadHtml.
        if (string.IsNullOrEmpty(htmlString))
        {
            return new HtmlWeb().Load(url);
        }

        var document = new HtmlDocument();
        document.LoadHtml(htmlString);
        return document;
    }

    /// <summary>
    /// Gets the inner HTML of the element that has the specified id.
    /// </summary>
    /// <param name="id">Value of the <c>id</c> attribute to look up.</param>
    /// <param name="url">
    /// Address to download. Only used when <paramref name="htmlString"/>
    /// is null or empty.
    /// </param>
    /// <param name="htmlString">
    /// Raw HTML to search instead of downloading <paramref name="url"/>.
    /// </param>
    /// <returns>
    /// The element's inner HTML, or <c>null</c> when the document contains
    /// no element with that id.
    /// </returns>
    public static string GetHtmlById(string id, string url = "", string htmlString = "")
    {
        // Forward htmlString as well; it used to be dropped, which made
        // HTML-string callers attempt to download an empty URL.
        var document = GetDocument(url, htmlString);
        var element = document.GetElementbyId(id);

        // A missing id is reported as null rather than a NullReferenceException.
        if (element == null)
        {
            return null;
        }

        return element.InnerHtml;
    }

    /// <summary>
    /// Retrieves the hyperlinks of a document (URL plus link text),
    /// optionally keeping only those whose URL contains a filter string.
    /// </summary>
    /// <param name="url">
    /// Address to download. Only used when <paramref name="htmlString"/>
    /// is null or empty.
    /// </param>
    /// <param name="filter">
    /// Substring that a link's URL must contain to be returned. A null or
    /// empty filter returns every link.
    /// </param>
    /// <param name="htmlString">
    /// Raw HTML to search instead of downloading <paramref name="url"/>.
    /// </param>
    /// <returns>
    /// The matching links in document order; an empty list when there are none.
    /// </returns>
    public static List<Link> GetLinks(string url = "", string filter = "", string htmlString = "")
    {
        var links = new List<Link>();
        var document = GetDocument(url, htmlString);

        var aTags = document.DocumentNode.SelectNodes("//a");
        if (aTags == null)
        {
            // No <a> elements in the document.
            return links;
        }

        bool hasFilter = !string.IsNullOrEmpty(filter);

        foreach (var aTag in aTags)
        {
            // Anchors without an href (e.g. <a name="top">) are not links;
            // skip them instead of dereferencing a missing attribute.
            var href = aTag.Attributes["href"];
            if (href == null)
            {
                continue;
            }

            if (hasFilter && !href.Value.Contains(filter))
            {
                continue;
            }

            links.Add(new Link
            {
                Text = aTag.InnerHtml,
                Url = href.Value
            });
        }

        return links;
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment