Skip to content

Instantly share code, notes, and snippets.

Last active August 1, 2021 22:48
Show Gist options
  • Save michaelrinderle/008d151774a228bd534eab6e3478c67a to your computer and use it in GitHub Desktop.
Save michaelrinderle/008d151774a228bd534eab6e3478c67a to your computer and use it in GitHub Desktop.
HTML Page DOM Parser
using System;
namespace HtmlParser
/// <summary>
/// Console entry point that exercises <see cref="DomParser"/>.
/// </summary>
class Program
/// <summary>
/// Creates a parser for an empty URL string and asks it for the page's <see cref="PageDom"/>.
/// </summary>
/// <param name="args">Command-line arguments; the visible code does not read them.</param>
static void Main(string[] args)
// The URL literal is empty and no page source is passed, so DomParser.Load takes its download path with "".
using var parser = new DomParser("");
// The returned object is not used by any statement visible after this line.
PageDom dom = parser.GetPageDom();
// NOTE(review): the matching `try` and the body of this handler are not visible in this excerpt —
// confirm the exception is reported rather than silently dropped.
catch (Exception ex)
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;
namespace HtmlParser
/// <summary>
/// Loads an HTML page into an HtmlAgilityPack <see cref="HtmlDocument"/> and extracts
/// selected parts of it into a <see cref="PageDom"/>.
/// </summary>
public class DomParser : IDisposable
// Page address handed to Load; GetPageDom derives the host name from it.
string Url;
// Document instance created by Load and set to null by Dispose.
HtmlDocument Document;
// Result object created by GetPageDom; the Parse* methods write their findings into it.
PageDom Dom;
// Set to true by Dispose, which checks it before releasing the references above.
private bool _disposed;
/// <summary>
/// Creates a parser for <paramref name="url"/> without pre-fetched markup;
/// an empty page source is forwarded to <see cref="Load"/>.
/// </summary>
/// <param name="url">Address of the page to parse.</param>
public DomParser(string url) => Load(url, string.Empty);
/// <summary>
/// Creates a parser for <paramref name="url"/> and forwards both arguments to <see cref="Load"/>.
/// </summary>
/// <param name="url">Address of the page to parse.</param>
/// <param name="pageSource">Markup of the page, passed through unchanged.</param>
public DomParser(string url, string pageSource) => Load(url, pageSource);
/// <summary>
/// Points the parser at <paramref name="url"/> and loads the page markup into a fresh
/// <see cref="HtmlDocument"/>.
/// </summary>
/// <param name="url">
/// Address of the page. It is stored for later use and is the download target when no
/// page source is supplied.
/// </param>
/// <param name="pageSource">
/// Markup to parse directly. When null or empty, the page is downloaded from
/// <paramref name="url"/> with an HTTP GET instead.
/// </param>
/// <exception cref="Exception">
/// Thrown with "Cannot retrieve page source." when the download or stream parse fails
/// (the underlying error is attached as the inner exception), or with
/// "Invalid Page Source" when the document exposes no parse-error collection.
/// </exception>
public void Load(string url, string pageSource)
{
    this.Url = url;
    this.Document = new();

    if (string.IsNullOrEmpty(pageSource))
    {
        try
        {
            var request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";

            using var response = (HttpWebResponse)request.GetResponse();
            using var stream = response.GetResponseStream();

            // NOTE(review): the body is always decoded as iso-8859-9, whatever the server
            // declares — confirm this is intended. If that code page is not available on
            // the target runtime, GetEncoding throws and ends up in the catch below.
            this.Document.Load(stream, Encoding.GetEncoding("iso-8859-9"));
        }
        catch (Exception ex)
        {
            // Only a failed retrieval ends here; keep the cause so callers can diagnose it.
            throw new Exception("Cannot retrieve page source.", ex);
        }
    }
    else
    {
        // Markup supplied by the caller is parsed as-is; no network access happens.
        this.Document.LoadHtml(pageSource);
    }

    // NOTE(review): HtmlAgilityPack appears to always expose a ParseErrors collection,
    // so this guard may never fire — confirm what validity check was intended.
    if (this.Document.ParseErrors == null)
    {
        throw new Exception("Invalid Page Source");
    }
}
/// <summary>
/// Builds a new <see cref="PageDom"/> for the loaded page: records the URL and host name,
/// then runs every Parse* step so that the meta fields, headings, paragraphs, image URLs
/// and link lists are populated before the object is returned.
/// </summary>
/// <returns>The populated <see cref="PageDom"/>; the same instance is kept in the parser.</returns>
/// <exception cref="ObjectDisposedException">The parser has already been disposed.</exception>
/// <exception cref="UriFormatException">The stored URL is not a valid absolute URI.</exception>
public PageDom GetPageDom()
{
    // Dispose clears the document, so parsing afterwards could only fail with a null dereference.
    if (_disposed)
    {
        throw new ObjectDisposedException(nameof(DomParser));
    }

    this.Dom = new(this.Url);

    // Domain has to be in place before the URL steps run: they read it while processing links.
    this.Dom.Domain = new Uri(this.Url).Host;

    this.ParseMetaTags();
    this.ParseHeaderTags();
    this.ParsePTags();
    this.ParseImgUrls();
    this.ParseUrls();

    return this.Dom;
}
/// <summary>
/// Assigns the page-level metadata fields of the current <see cref="PageDom"/>
/// (Title, Language, Author, Description, Keywords, Robots). Each expression starts
/// from the document's root node.
/// </summary>
private void ParseMetaTags()
// NOTE(review): every assignment below stops at DocumentNode — the selector that picks the
// element or attribute feeding each field is not visible in this excerpt, so the exact
// source of each value cannot be confirmed from here.
this.Dom.Title = this.Document.DocumentNode
this.Dom.Language = this.Document.DocumentNode
this.Dom.Author = this.Document.DocumentNode
this.Dom.Description = this.Document.DocumentNode
this.Dom.Keywords = this.Document.DocumentNode
this.Dom.Robots = this.Document.DocumentNode
/// <summary>
/// Collects the inner text of every h1 through h6 element and stores one list per
/// heading level on the current <see cref="PageDom"/>.
/// </summary>
private void ParseHeaderTags()
{
    var root = this.Document.DocumentNode;

    // Gathers the inner text of all descendants with the given tag name into a new list.
    List<string> TextOf(string tag)
    {
        var texts = new List<string>();
        foreach (var node in root.Descendants(tag))
        {
            texts.Add(node.InnerText);
        }

        return texts;
    }

    this.Dom.H1 = TextOf("h1");
    this.Dom.H2 = TextOf("h2");
    this.Dom.H3 = TextOf("h3");
    this.Dom.H4 = TextOf("h4");
    this.Dom.H5 = TextOf("h5");
    this.Dom.H6 = TextOf("h6");
}
/// <summary>
/// Stores the inner text of every p element of the document in <see cref="PageDom.P"/>.
/// </summary>
private void ParsePTags()
{
    var paragraphTexts =
        from paragraph in this.Document.DocumentNode.Descendants("p")
        select paragraph.InnerText;

    this.Dom.P = paragraphTexts.ToList();
}
/// <summary>
/// Collects the <c>src</c> of every img element as an absolute URL and stores the result
/// in <see cref="PageDom.ImageUrls"/>.
/// </summary>
/// <remarks>
/// Images without a usable <c>src</c>, embedded <c>data:</c> images and sources that cannot
/// be resolved are left out. Relative sources are resolved against the page URL.
/// </remarks>
private void ParseImgUrls()
{
    // Resolve against the full page address: Dom.Domain is only a host name, which is not
    // a valid base URI on its own.
    Uri pageUri = new Uri(this.Url);
    List<string> imageUrls = new();

    foreach (var image in this.Document.DocumentNode.Descendants("img"))
    {
        // An <img> without src yields the empty default instead of a null dereference.
        string source = image.GetAttributeValue("src", string.Empty).Trim();
        if (source.Length == 0)
        {
            continue;
        }

        // remove embedded image
        if (source.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
        {
            continue;
        }

        // Handles absolute, root-relative, document-relative and protocol-relative sources;
        // the resolved URL keeps its host and query string.
        if (Uri.TryCreate(pageUri, source, out Uri resolved))
        {
            imageUrls.Add(resolved.AbsoluteUri);
        }
    }

    this.Dom.ImageUrls = imageUrls;
}
/// <summary>
/// Collects the targets of all anchors that carry an <c>href</c>, turns them into absolute
/// URLs and fills <see cref="PageDom.QueryUrls"/>, <see cref="PageDom.InternalUrls"/> and
/// <see cref="PageDom.ExternalUrls"/>.
/// </summary>
/// <remarks>
/// Empty hrefs, hrefs containing a blacklisted token and hrefs that cannot be resolved
/// against the page URL are skipped. A link is internal when its host equals
/// <see cref="PageDom.Domain"/> or is a subdomain of it; everything else is external.
/// All three lists are ordered last link first.
/// </remarks>
private void ParseUrls()
{
    // "javascript:" covers pseudo-links such as javascript:void(0); the older
    // "javascript(0);" token is kept so nothing that was filtered before slips through.
    List<string> blacklist = new() { "mailto:", "javascript:", "javascript(0);", ".pdf", ".zip", ".doc" };

    // Resolve against the full page address: Dom.Domain is only a host name, which is not
    // a valid base URI on its own.
    Uri pageUri = new Uri(this.Url);
    List<Uri> links = new();

    // SelectNodes yields null rather than an empty collection when nothing matches.
    var anchors = this.Document.DocumentNode.SelectNodes("//a[@href]");
    if (anchors != null)
    {
        foreach (var anchor in anchors)
        {
            string href = anchor.GetAttributeValue("href", string.Empty);

            if (string.IsNullOrEmpty(href))
            {
                continue;
            }

            // blacklist keywords
            if (blacklist.Any(token => href.Contains(token)))
            {
                continue;
            }

            // Absolute hrefs pass through unchanged; relative ones are combined with the page URL.
            if (Uri.TryCreate(pageUri, href, out Uri resolved))
            {
                links.Add(resolved);
            }
        }
    }

    this.Dom.QueryUrls = new();
    this.Dom.InternalUrls = new();
    this.Dom.ExternalUrls = new();

    string domain = this.Dom.Domain ?? string.Empty;

    // Iterate from the last link to the first: the result lists are ordered last-link-first,
    // which existing consumers may rely on.
    for (int i = links.Count - 1; i >= 0; i--)
    {
        Uri link = links[i];
        string absolute = link.AbsoluteUri;

        // parse queries
        if (absolute.Contains("?"))
        {
            this.Dom.QueryUrls.Add(link.Query);
        }

        // sort internal / external links
        // Compare hosts instead of searching the whole URL, so a domain that merely shows up
        // in a path or query string does not make a foreign link look internal.
        bool isInternal = domain.Length > 0
            && (link.Host.Equals(domain, StringComparison.OrdinalIgnoreCase)
                || link.Host.EndsWith("." + domain, StringComparison.OrdinalIgnoreCase));

        if (isInternal)
        {
            this.Dom.InternalUrls.Add(absolute);
        }
        else
        {
            this.Dom.ExternalUrls.Add(absolute);
        }
    }
}
/// <summary>
/// Drops the parser's references to the result object and the parsed document.
/// Calls made after the first one return immediately.
/// </summary>
public void Dispose()
{
    if (_disposed)
    {
        return;
    }

    _disposed = true;
    Dom = null;
    Document = null;
}
using System.Collections.Generic;
namespace HtmlParser
/// <summary>
/// Plain data holder for what was extracted from one HTML page: its address, a few
/// meta fields, heading and paragraph texts, and the URLs found on the page.
/// </summary>
/// <remarks>
/// String properties stay null until assigned. List properties start out as empty lists,
/// so they can be enumerated safely even when no parsing step has filled them.
/// </remarks>
public class PageDom
{
    /// <summary>Creates an empty result object for the page at <paramref name="url"/>.</summary>
    /// <param name="url">Address of the page; stored unchanged in <see cref="Url"/>.</param>
    public PageDom(string url)
    {
        this.Url = url;
    }

    // General Dom Info
    public string Url { get; set; }
    public string Domain { get; set; }

    // Page Meta Elements
    public string Title { get; set; }
    public string Language { get; set; }
    public string Author { get; set; }
    public string Description { get; set; }
    public string Keywords { get; set; }
    public string Robots { get; set; }

    // Page Element Lists
    public List<string> H1 { get; set; } = new();
    public List<string> H2 { get; set; } = new();
    public List<string> H3 { get; set; } = new();
    public List<string> H4 { get; set; } = new();
    public List<string> H5 { get; set; } = new();
    public List<string> H6 { get; set; } = new();
    public List<string> P { get; set; } = new();

    // Page Urls
    public List<string> ImageUrls { get; set; } = new();
    public List<string> InternalUrls { get; set; } = new();
    public List<string> ExternalUrls { get; set; } = new();
    public List<string> QueryUrls { get; set; } = new();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment