Skip to content

Instantly share code, notes, and snippets.

@michaelrinderle
Last active August 1, 2021 22:48
Show Gist options
  • Save michaelrinderle/008d151774a228bd534eab6e3478c67a to your computer and use it in GitHub Desktop.
Save michaelrinderle/008d151774a228bd534eab6e3478c67a to your computer and use it in GitHub Desktop.
Html Page Dom Parse
using System;
namespace HtmlParser
{
/// <summary>
/// Console entry point: downloads one hard-coded page and parses it into a <see cref="PageDom"/>.
/// </summary>
class Program
{
    /// <summary>
    /// Builds a <see cref="DomParser"/> for the target URL and runs the extraction.
    /// Any exception raised while loading or parsing is written to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        const string targetUrl = "https://www.michaelrinderle.com";
        try
        {
            using (DomParser domParser = new DomParser(targetUrl))
            {
                // The parsed result is produced but not consumed any further here.
                PageDom pageDom = domParser.GetPageDom();
            }
        }
        catch (Exception failure)
        {
            Console.WriteLine(failure);
        }
    }
}
}
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
namespace HtmlParser
{
/// <summary>
/// Loads an HTML page (downloaded from a URL, or supplied as a string) with HtmlAgilityPack
/// and extracts its title, meta tags, headings, paragraphs, image URLs and links into a
/// <see cref="PageDom"/>.
/// </summary>
public class DomParser : IDisposable
{
    // A link is discarded when its href contains any of these substrings (case-insensitive).
    // Non-web schemes such as mailto:/javascript:/tel: are additionally rejected by the
    // http/https scheme check in ParseUrls.
    private static readonly string[] UrlBlacklist = { "mailto:", "javascript(0);", ".pdf", ".zip", ".doc" };

    string Url;
    HtmlDocument Document;
    PageDom Dom;
    private bool _disposed;

    /// <summary>Creates a parser and downloads the page at <paramref name="url"/>.</summary>
    /// <param name="url">Absolute URL of the page to download.</param>
    /// <exception cref="Exception">The page could not be downloaded or loaded.</exception>
    public DomParser(string url)
    {
        this.Load(url, string.Empty);
    }

    /// <summary>
    /// Creates a parser for markup that has already been obtained.
    /// </summary>
    /// <param name="url">URL the markup belongs to; used to resolve relative links.</param>
    /// <param name="pageSource">HTML markup. When null or empty the page is downloaded from <paramref name="url"/>.</param>
    public DomParser(string url, string pageSource)
    {
        this.Load(url, pageSource);
    }

    /// <summary>
    /// (Re)loads the document, either from <paramref name="pageSource"/> or, when that is
    /// null or empty, by issuing an HTTP GET to <paramref name="url"/>.
    /// </summary>
    /// <param name="url">URL of the page.</param>
    /// <param name="pageSource">HTML markup, or null/empty to download it.</param>
    /// <exception cref="Exception">
    /// The download failed ("Cannot retrieve page source."); the underlying failure is
    /// available through <see cref="Exception.InnerException"/>.
    /// </exception>
    public void Load(string url, string pageSource)
    {
        this.Url = url;
        this.Document = new();
        if (string.IsNullOrEmpty(pageSource))
        {
            try
            {
                var request = (HttpWebRequest)WebRequest.Create(url);
                request.Method = "GET";
                using var response = (HttpWebResponse)request.GetResponse();
                using var stream = response.GetResponseStream();
                // Decode with the charset the server declared instead of a fixed code page.
                this.Document.Load(stream, ResolveEncoding(response.CharacterSet));
            }
            catch (Exception ex)
            {
                // Same exception type and message as before, but keep the root cause attached.
                throw new Exception("Cannot retrieve page source.", ex);
            }
        }
        else
        {
            this.Document.LoadHtml(pageSource);
        }
        // NOTE(review): HtmlAgilityPack appears to always return a non-null ParseErrors
        // collection, so this guard probably never fires - confirm the intended validation.
        if (this.Document.ParseErrors == null)
            throw new Exception("Invalid Page Source");
    }

    /// <summary>
    /// Runs every extraction step over the loaded document.
    /// </summary>
    /// <returns>A freshly populated <see cref="PageDom"/>.</returns>
    /// <exception cref="ObjectDisposedException">The parser has been disposed.</exception>
    /// <exception cref="UriFormatException">The URL given to the parser is not a valid absolute URI.</exception>
    public PageDom GetPageDom()
    {
        if (_disposed)
            throw new ObjectDisposedException(nameof(DomParser));

        this.Dom = new(this.Url);
        Uri pageUri = new Uri(this.Url);
        this.Dom.Domain = pageUri.Host;
        this.ParseMetaTags();
        this.ParseHeaderTags();
        this.ParsePTags();
        this.ParseImgUrls(pageUri);
        this.ParseUrls(pageUri);
        return this.Dom;
    }

    // Picks the encoding named by the response charset; falls back to UTF-8 when the
    // charset is absent, unknown, or not available on the current runtime.
    private static Encoding ResolveEncoding(string charset)
    {
        if (!string.IsNullOrWhiteSpace(charset))
        {
            try
            {
                return Encoding.GetEncoding(charset.Trim().Trim('"', '\''));
            }
            catch (ArgumentException)
            {
                // Unsupported encoding name: use the fallback below.
            }
        }
        return Encoding.UTF8;
    }

    // Resolves a (possibly relative) reference against the page URL. Never throws.
    private static bool TryResolveUrl(Uri baseUri, string reference, out Uri resolved)
    {
        return Uri.TryCreate(baseUri, reference.Trim(), out resolved);
    }

    // Returns the content attribute of <meta name="..."> or null when the tag/attribute is absent.
    private string GetMetaContent(string metaName)
    {
        HtmlNode meta = this.Document.DocumentNode
            .SelectSingleNode("//meta[@name='" + metaName + "']");
        return meta?.Attributes["content"]?.Value;
    }

    // Returns the inner text of every element with the given tag name, in document order.
    private List<string> CollectInnerText(string tagName)
    {
        return this.Document.DocumentNode
            .Descendants(tagName).Select(x => x.InnerText).ToList();
    }

    // Fills title and meta-tag fields; a missing tag leaves the corresponding field null.
    private void ParseMetaTags()
    {
        // FirstOrDefault rather than SingleOrDefault: a page may contain several <title>
        // elements (for example inside inline SVG) and none at all must not crash either.
        this.Dom.Title = this.Document.DocumentNode
            .Descendants("title").FirstOrDefault()?.InnerText;
        this.Dom.Language = this.GetMetaContent("language");
        this.Dom.Author = this.GetMetaContent("author");
        this.Dom.Description = this.GetMetaContent("description");
        this.Dom.Keywords = this.GetMetaContent("keywords");
        this.Dom.Robots = this.GetMetaContent("robots");
    }

    // Collects the text of h1..h6 elements.
    private void ParseHeaderTags()
    {
        this.Dom.H1 = this.CollectInnerText("h1");
        this.Dom.H2 = this.CollectInnerText("h2");
        this.Dom.H3 = this.CollectInnerText("h3");
        this.Dom.H4 = this.CollectInnerText("h4");
        this.Dom.H5 = this.CollectInnerText("h5");
        this.Dom.H6 = this.CollectInnerText("h6");
    }

    // Collects the text of p elements.
    private void ParsePTags()
    {
        this.Dom.P = this.CollectInnerText("p");
    }

    // Collects absolute image URLs, skipping embedded (data URI) images and unresolvable sources.
    private void ParseImgUrls(Uri pageUri)
    {
        List<string> imageUrls = new();
        foreach (HtmlNode img in this.Document.DocumentNode.Descendants("img"))
        {
            // GetAttributeValue tolerates <img> elements without a src attribute.
            string src = img.GetAttributeValue("src", string.Empty);
            if (string.IsNullOrWhiteSpace(src))
                continue;
            // remove embedded image
            if (src.Contains("data:image"))
                continue;
            // absolute sources are kept as-is, relative ones are resolved against the page URL
            if (TryResolveUrl(pageUri, src, out Uri absolute))
                imageUrls.Add(absolute.AbsoluteUri);
        }
        this.Dom.ImageUrls = imageUrls;
    }

    // Collects http/https links and sorts them into query, internal and external lists.
    private void ParseUrls(Uri pageUri)
    {
        List<Uri> links = new();
        // SelectNodes returns null (not an empty collection) when nothing matches.
        HtmlNodeCollection anchors = this.Document.DocumentNode.SelectNodes("//a[@href]");
        if (anchors != null)
        {
            foreach (HtmlNode anchor in anchors)
            {
                string href = anchor.GetAttributeValue("href", string.Empty);
                if (string.IsNullOrWhiteSpace(href))
                    continue;
                // blacklist keywords
                if (UrlBlacklist.Any(x => href.IndexOf(x, StringComparison.OrdinalIgnoreCase) >= 0))
                    continue;
                // fix/add relative paths
                if (!TryResolveUrl(pageUri, href, out Uri absolute))
                    continue;
                // only web links: drops javascript:, tel:, data: and similar pseudo-links
                if (absolute.Scheme != Uri.UriSchemeHttp && absolute.Scheme != Uri.UriSchemeHttps)
                    continue;
                links.Add(absolute);
            }
        }

        this.Dom.QueryUrls = new();
        this.Dom.InternalUrls = new();
        this.Dom.ExternalUrls = new();
        string domain = this.Dom.Domain;
        // Walk back-to-front so the result lists keep the ordering earlier versions produced.
        for (int i = links.Count - 1; i >= 0; i--)
        {
            Uri link = links[i];
            // parse queries: only the query string of such links is recorded
            if (link.AbsoluteUri.Contains("?"))
            {
                this.Dom.QueryUrls.Add(link.Query);
                continue;
            }
            // sort internal / external links by host, not by substring of the whole URL
            bool isInternal =
                string.Equals(link.Host, domain, StringComparison.OrdinalIgnoreCase) ||
                link.Host.EndsWith("." + domain, StringComparison.OrdinalIgnoreCase);
            if (isInternal)
                this.Dom.InternalUrls.Add(link.AbsoluteUri);
            else
                this.Dom.ExternalUrls.Add(link.AbsoluteUri);
        }
    }

    /// <summary>
    /// Releases the references to the loaded document and the last result. Safe to call repeatedly.
    /// </summary>
    public void Dispose()
    {
        if (!_disposed)
        {
            _disposed = true;
            this.Dom = null;
            this.Document = null;
            GC.SuppressFinalize(this);
        }
    }
}
}
using System.Collections.Generic;
namespace HtmlParser
{
/// <summary>
/// Plain data container for the information extracted from a single HTML page.
/// All list properties start out as empty lists, so they can be read safely even
/// before (or without) a parser filling them in.
/// </summary>
public class PageDom
{
    /// <summary>Creates an empty result for the page at <paramref name="url"/>.</summary>
    /// <param name="url">URL of the page this result describes; stored as-is in <see cref="Url"/>.</param>
    public PageDom(string url)
    {
        this.Url = url;
    }

    // General Dom Info
    /// <summary>URL of the page, as passed to the constructor.</summary>
    public string Url { get; set; }
    /// <summary>Host name of the page; the parser sets it from the host part of <see cref="Url"/>.</summary>
    public string Domain { get; set; }

    // Page Meta Elements (null when the page does not declare them)
    public string Title { get; set; }
    public string Language { get; set; }
    public string Author { get; set; }
    public string Description { get; set; }
    public string Keywords { get; set; }
    public string Robots { get; set; }

    // Page Element Lists: inner text of each matching element
    public List<string> H1 { get; set; } = new();
    public List<string> H2 { get; set; } = new();
    public List<string> H3 { get; set; } = new();
    public List<string> H4 { get; set; } = new();
    public List<string> H5 { get; set; } = new();
    public List<string> H6 { get; set; } = new();
    public List<string> P { get; set; } = new();

    // Page Urls
    /// <summary>Absolute URLs of the images found on the page.</summary>
    public List<string> ImageUrls { get; set; } = new();
    /// <summary>Links that point to the page's own domain.</summary>
    public List<string> InternalUrls { get; set; } = new();
    /// <summary>Links that point to other domains.</summary>
    public List<string> ExternalUrls { get; set; } = new();
    /// <summary>Query strings taken from links that contain a '?'.</summary>
    public List<string> QueryUrls { get; set; } = new();
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment