Last active
August 29, 2015 14:24
-
-
Save digioz/968d8e347c3b81a0120d to your computer and use it in GitHub Desktop.
HtmlAgilityPackHelper is a static class that lets you parse a site's HTML for data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using HtmlAgilityPack; | |
using digioz.BO; | |
namespace digioz.BLL | |
{ | |
/// <summary>
/// Static helpers built on HtmlAgilityPack for loading an HTML document
/// (from a URL or from an in-memory string) and extracting element markup
/// and hyperlinks from it.
/// </summary>
public static class HtmlAgilityPackHelper
{
    /// <summary>
    /// Gets an HTML document, either parsed from <paramref name="htmlString"/>
    /// or, when that is null or empty, loaded from <paramref name="url"/>.
    /// </summary>
    /// <param name="url">Address to load when no HTML string is supplied.</param>
    /// <param name="htmlString">Raw HTML to parse; takes precedence over <paramref name="url"/> when non-empty.</param>
    /// <returns>The loaded document.</returns>
    public static HtmlDocument GetDocument(string url = "", string htmlString = "")
    {
        // Treat null the same as "" so a null HTML string falls back to the
        // URL instead of being handed to LoadHtml.
        if (string.IsNullOrEmpty(htmlString))
        {
            return new HtmlWeb().Load(url);
        }

        var document = new HtmlDocument();
        document.LoadHtml(htmlString);
        return document;
    }

    /// <summary>
    /// Gets the inner HTML of the element with the specified id.
    /// </summary>
    /// <param name="id">Value of the element's <c>id</c> attribute.</param>
    /// <param name="url">Address to load when no HTML string is supplied.</param>
    /// <param name="htmlString">Raw HTML to search; takes precedence over <paramref name="url"/> when non-empty.</param>
    /// <returns>
    /// The element's inner HTML, or <c>null</c> when the document contains
    /// no element with that id.
    /// </returns>
    public static string GetHtmlById(string id, string url = "", string htmlString = "")
    {
        // Forward htmlString as well; previously it was accepted but ignored.
        var document = GetDocument(url, htmlString);

        var element = document.GetElementbyId(id);
        if (element == null)
        {
            return null;
        }

        return element.InnerHtml;
    }

    /// <summary>
    /// Gets the hyperlinks (anchor text and target URL) found in a document,
    /// optionally keeping only those whose URL contains a given substring.
    /// </summary>
    /// <param name="url">Address to load when no HTML string is supplied.</param>
    /// <param name="filter">
    /// Substring the link URL must contain; null or empty keeps every link.
    /// </param>
    /// <param name="htmlString">Raw HTML to search; takes precedence over <paramref name="url"/> when non-empty.</param>
    /// <returns>
    /// The matching links in document order; empty when there are none.
    /// Anchors without an <c>href</c> attribute are skipped.
    /// </returns>
    public static List<Link> GetLinks(string url = "", string filter = "", string htmlString = "")
    {
        var links = new List<Link>();
        var document = GetDocument(url, htmlString);

        // SelectNodes yields null rather than an empty collection when
        // nothing matches, so guard before iterating.
        var aTags = document.DocumentNode.SelectNodes("//a");
        if (aTags == null)
        {
            return links;
        }

        bool hasFilter = !string.IsNullOrEmpty(filter);

        foreach (var aTag in aTags)
        {
            // Named anchors (<a name="...">) have no href; skip them instead
            // of dereferencing a missing attribute.
            var href = aTag.Attributes["href"];
            if (href == null)
            {
                continue;
            }

            string target = href.Value;
            if (hasFilter && !target.Contains(filter))
            {
                continue;
            }

            links.Add(new Link
            {
                Text = aTag.InnerHtml,
                Url = target
            });
        }

        return links;
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Linq; | |
using System.Text; | |
using System.Threading.Tasks; | |
namespace digioz.BO | |
{ | |
/// <summary>
/// Simple data holder pairing a hyperlink's text with its URL.
/// </summary>
public class Link
{
    /// <summary>
    /// Gets or sets the text associated with the link.
    /// </summary>
    public string Text
    {
        get;
        set;
    }

    /// <summary>
    /// Gets or sets the address the link points to.
    /// </summary>
    public string Url
    {
        get;
        set;
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment