michaelrinderle/DomParser.cs

## app.cs
using System;

namespace HtmlParser
{
    /// <summary>
    /// Console entry point that runs a single parse of a fixed page.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Builds a <see cref="DomParser"/> for a fixed URL, requests its
        /// <see cref="PageDom"/>, and writes any exception to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not read.</param>
        static void Main(string[] args)
        {
            try
            {
                // The parser is disposed when this using block ends, before any catch handler runs.
                using (DomParser domParser = new DomParser("https://www.michaelrinderle.com"))
                {
                    PageDom pageDom = domParser.GetPageDom();
                }
            }
            catch (Exception failure)
            {
                Console.WriteLine(failure);
            }
        }
    }
}

## DomParser.cs
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net;
using System.Text;

namespace HtmlParser
{
    /// <summary>
    /// Loads an HTML document (downloaded from a URL or supplied as a string) and
    /// extracts its title, meta tags, headings, paragraphs, image URLs and links
    /// into a <see cref="PageDom"/>.
    /// </summary>
    public class DomParser : IDisposable
    {
        // Encoding tried when the HTTP response does not report a usable charset.
        // This is the encoding the class previously hard-coded for every page.
        private const string FallbackEncodingName = "iso-8859-9";

        // A link whose href contains any of these substrings is skipped by ParseUrls.
        private static readonly string[] LinkBlacklist =
            { "mailto:", "javascript(0);", ".pdf", ".zip", ".doc" };

        private string Url;

        private HtmlDocument Document;

        private PageDom Dom;

        private bool _disposed;

        /// <summary>
        /// Creates a parser and downloads the page at <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page to download and parse.</param>
        public DomParser(string url)
        {
            this.Load(url, string.Empty);
        }

        /// <summary>
        /// Creates a parser for markup that is already in memory.
        /// </summary>
        /// <param name="url">URL the markup belongs to; used to resolve relative links.</param>
        /// <param name="pageSource">HTML markup; when null or empty the page is downloaded instead.</param>
        public DomParser(string url, string pageSource)
        {
            this.Load(url, pageSource);
        }

        /// <summary>
        /// Replaces the current document with the page at <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL of the page.</param>
        /// <param name="pageSource">
        /// HTML markup to parse. When null or empty, the markup is fetched from
        /// <paramref name="url"/> with an HTTP GET request.
        /// </param>
        /// <exception cref="Exception">
        /// The download failed (the original failure is attached as the inner exception),
        /// or the document reports no parse-error collection.
        /// </exception>
        public void Load(string url, string pageSource)
        {
            this.Url = url;
            this.Document = new();

            if (string.IsNullOrEmpty(pageSource))
            {
                try
                {
                    var request = (HttpWebRequest)WebRequest.Create(url);
                    request.Method = "GET";

                    using var response = (HttpWebResponse)request.GetResponse();
                    using var stream = response.GetResponseStream();
                    this.Document.Load(stream, ResolveEncoding(response.CharacterSet));
                }
                catch (Exception ex)
                {
                    // Keep the message callers may already match on, but preserve the root cause.
                    throw new Exception("Cannot retrieve page source.", ex);
                }
            }
            else
            {
                this.Document.LoadHtml(pageSource);
            }

            // NOTE(review): this only rejects a null error collection; it does not look at
            // whether any parse errors were recorded - confirm that is the intent.
            if (this.Document.ParseErrors == null)
                throw new Exception("Invalid Page Source");
        }

        /// <summary>
        /// Extracts all supported page data from the loaded document.
        /// </summary>
        /// <returns>A newly populated <see cref="PageDom"/>.</returns>
        /// <exception cref="ObjectDisposedException">The parser holds no document (it was disposed).</exception>
        public PageDom GetPageDom()
        {
            if (this.Document is null)
                throw new ObjectDisposedException(nameof(DomParser));

            this.Dom = new(this.Url);
            this.Dom.Domain = new Uri(this.Url).Host;

            this.ParseMetaTags();
            this.ParseHeaderTags();
            this.ParsePTags();
            this.ParseImgUrls();
            this.ParseUrls();

            return this.Dom;
        }

        /// <summary>
        /// Chooses the encoding used to decode a downloaded page: the charset reported by
        /// the response when the runtime recognises it, otherwise the fallback encoding,
        /// otherwise UTF-8.
        /// </summary>
        private static Encoding ResolveEncoding(string charset)
        {
            foreach (string candidate in new[] { charset, FallbackEncodingName })
            {
                if (string.IsNullOrWhiteSpace(candidate))
                    continue;

                try
                {
                    return Encoding.GetEncoding(candidate.Trim().Trim('"', '\''));
                }
                catch (ArgumentException)
                {
                    // Unknown or unregistered encoding name: try the next candidate.
                }
                catch (NotSupportedException)
                {
                    // Encoding not available on this platform: try the next candidate.
                }
            }

            return Encoding.UTF8;
        }

        /// <summary>
        /// Resolves <paramref name="reference"/> against <paramref name="baseUri"/> and
        /// accepts the result only when it is an http or https URI.
        /// </summary>
        private static bool TryResolveWebUri(Uri baseUri, string reference, out Uri result)
        {
            if (Uri.TryCreate(baseUri, reference, out result))
            {
                // NOTE(review): a rooted path such as "/img/a.png" can be parsed as an implicit
                // file URI on some platforms; re-resolve it as a relative reference in that case.
                if (result.IsFile && Uri.TryCreate(reference, UriKind.Relative, out Uri relative))
                    Uri.TryCreate(baseUri, relative, out result);

                if (result != null
                    && (result.Scheme == Uri.UriSchemeHttp || result.Scheme == Uri.UriSchemeHttps))
                {
                    return true;
                }
            }

            result = null;
            return false;
        }

        /// <summary>Returns the inner text of every element named <paramref name="tagName"/>, in document order.</summary>
        private List<string> GetInnerTexts(string tagName)
        {
            return this.Document.DocumentNode
                .Descendants(tagName).Select(node => node.InnerText).ToList();
        }

        /// <summary>
        /// Returns the <c>content</c> attribute of the first <c>meta</c> element whose
        /// <c>name</c> equals <paramref name="name"/>, or null when the element or attribute is absent.
        /// </summary>
        private string GetMetaContent(string name)
        {
            return this.Document.DocumentNode
                .SelectSingleNode("//meta[@name='" + name + "']")
                ?.Attributes["content"]?.Value;
        }

        /// <summary>
        /// Fills the title and meta-tag properties. A value is left null when the page
        /// does not contain the corresponding element.
        /// </summary>
        private void ParseMetaTags()
        {
            // First match instead of "single": a page may contain more than one <title> element.
            this.Dom.Title = this.Document.DocumentNode
                .Descendants("title").FirstOrDefault()?.InnerText;

            this.Dom.Language = this.GetMetaContent("language");
            this.Dom.Author = this.GetMetaContent("author");
            this.Dom.Description = this.GetMetaContent("description");
            this.Dom.Keywords = this.GetMetaContent("keywords");
            this.Dom.Robots = this.GetMetaContent("robots");
        }

        /// <summary>Collects the inner text of every h1-h6 element.</summary>
        private void ParseHeaderTags()
        {
            this.Dom.H1 = this.GetInnerTexts("h1");
            this.Dom.H2 = this.GetInnerTexts("h2");
            this.Dom.H3 = this.GetInnerTexts("h3");
            this.Dom.H4 = this.GetInnerTexts("h4");
            this.Dom.H5 = this.GetInnerTexts("h5");
            this.Dom.H6 = this.GetInnerTexts("h6");
        }

        /// <summary>Collects the inner text of every p element.</summary>
        private void ParsePTags()
        {
            this.Dom.P = this.GetInnerTexts("p");
        }

        /// <summary>
        /// Collects the absolute http/https URL of every img element, in document order.
        /// Images without a src, embedded data: images and unresolvable values are skipped.
        /// </summary>
        private void ParseImgUrls()
        {
            var baseUri = new Uri(this.Url);
            var imageUrls = new List<string>();

            foreach (var image in this.Document.DocumentNode.Descendants("img"))
            {
                string source = image.Attributes["src"]?.Value?.Trim();

                if (string.IsNullOrEmpty(source))
                    continue;

                // remove embedded image
                if (source.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
                    continue;

                // Relative paths are resolved against the page URL (scheme and host included).
                if (TryResolveWebUri(baseUri, source, out Uri resolved))
                    imageUrls.Add(resolved.AbsoluteUri);
            }

            this.Dom.ImageUrls = imageUrls;
        }

        /// <summary>
        /// Classifies every a[href] link of the page. Blacklisted, empty, non-web and
        /// unresolvable hrefs are dropped. For a link whose absolute form contains '?',
        /// only its query string is stored in QueryUrls; every other link goes to
        /// InternalUrls or ExternalUrls depending on its host.
        /// </summary>
        private void ParseUrls()
        {
            this.Dom.QueryUrls = new();
            this.Dom.InternalUrls = new();
            this.Dom.ExternalUrls = new();

            // SelectNodes yields null rather than an empty collection when nothing matches.
            var anchors = this.Document.DocumentNode.SelectNodes("//a[@href]");
            if (anchors is null)
                return;

            var baseUri = new Uri(this.Url);

            // Walk last-to-first so each output list keeps the reverse-document
            // ordering this method produced before.
            for (int i = anchors.Count - 1; i >= 0; i--)
            {
                string href = anchors[i].GetAttributeValue("href", string.Empty).Trim();

                if (href.Length == 0)
                    continue;

                // blacklist keywords
                if (LinkBlacklist.Any(term => href.Contains(term)))
                    continue;

                // Absolute hrefs are kept as they are; relative ones are resolved against the page URL.
                if (!TryResolveWebUri(baseUri, href, out Uri resolved))
                    continue;

                string absolute = resolved.AbsoluteUri;

                if (absolute.Contains("?"))
                {
                    this.Dom.QueryUrls.Add(resolved.Query);
                }
                else if (this.IsInternalHost(resolved.Host))
                {
                    this.Dom.InternalUrls.Add(absolute);
                }
                else
                {
                    this.Dom.ExternalUrls.Add(absolute);
                }
            }
        }

        /// <summary>
        /// True when <paramref name="host"/> is the page's own host or a sub-domain of it.
        /// Compares hosts, so a domain name that merely appears in a path does not count.
        /// </summary>
        private bool IsInternalHost(string host)
        {
            string domain = this.Dom.Domain;

            return host.Equals(domain, StringComparison.OrdinalIgnoreCase)
                || host.EndsWith("." + domain, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Drops the references to the loaded document and the last extracted result.
        /// Calling it more than once has no further effect.
        /// </summary>
        public void Dispose()
        {
            if (!_disposed)
            {
                _disposed = true;
                this.Dom = null;
                this.Document = null;
                GC.SuppressFinalize(this);
            }
        }
    }
}

## PageDom.cs
using System.Collections.Generic;

namespace HtmlParser
{
    /// <summary>
    /// Plain data holder for the information extracted from one HTML page.
    /// String properties stay null until a value is assigned; list properties
    /// start out empty so they can always be enumerated safely.
    /// </summary>
    public class PageDom
    {
        /// <summary>Creates an empty result for the given page.</summary>
        /// <param name="url">URL of the page; stored in <see cref="Url"/> unchanged.</param>
        public PageDom(string url)
        {
            this.Url = url;
        }

        // General Dom Info

        /// <summary>URL passed to the constructor.</summary>
        public string Url { get; set; }

        /// <summary>Domain associated with the page; set by whoever populates this object.</summary>
        public string Domain { get; set; }

        // Page Meta Elements
        public string Title { get; set; }
        public string Language { get; set; }
        public string Author { get; set; }
        public string Description { get; set; }
        public string Keywords { get; set; }
        public string Robots { get; set; }

        // Page Element Lists (never null by default)
        public List<string> H1 { get; set; } = new();
        public List<string> H2 { get; set; } = new();
        public List<string> H3 { get; set; } = new();
        public List<string> H4 { get; set; } = new();
        public List<string> H5 { get; set; } = new();
        public List<string> H6 { get; set; } = new();
        public List<string> P { get; set; } = new();

        // Page Urls (never null by default)
        public List<string> ImageUrls { get; set; } = new();
        public List<string> InternalUrls { get; set; } = new();
        public List<string> ExternalUrls { get; set; } = new();
        public List<string> QueryUrls { get; set; } = new();
    }
}
	using System;

	namespace HtmlParser
	{
	/// <summary>
	/// Console entry point that runs a single parse of a fixed page.
	/// </summary>
	class Program
	{
	    /// <summary>
	    /// Builds a <see cref="DomParser"/> for a fixed URL, requests its
	    /// <see cref="PageDom"/>, and writes any exception to the console.
	    /// </summary>
	    /// <param name="args">Command-line arguments; not read.</param>
	    static void Main(string[] args)
	    {
	        try
	        {
	            // The parser is disposed when this using block ends, before any catch handler runs.
	            using (DomParser domParser = new DomParser("https://www.michaelrinderle.com"))
	            {
	                PageDom pageDom = domParser.GetPageDom();
	            }
	        }
	        catch (Exception failure)
	        {
	            Console.WriteLine(failure);
	        }
	    }
	}
	}
	using HtmlAgilityPack;
	using System;
	using System.Collections.Generic;
	using System.Linq;
	using System.Net;
	using System.Text;

	namespace HtmlParser
	{
	/// <summary>
	/// Loads an HTML document (downloaded from a URL or supplied as a string) and
	/// extracts its title, meta tags, headings, paragraphs, image URLs and links
	/// into a <see cref="PageDom"/>.
	/// </summary>
	public class DomParser : IDisposable
	{
	    // Encoding tried when the HTTP response does not report a usable charset.
	    // This is the encoding the class previously hard-coded for every page.
	    private const string FallbackEncodingName = "iso-8859-9";

	    // A link whose href contains any of these substrings is skipped by ParseUrls.
	    private static readonly string[] LinkBlacklist =
	        { "mailto:", "javascript(0);", ".pdf", ".zip", ".doc" };

	    private string Url;

	    private HtmlDocument Document;

	    private PageDom Dom;

	    private bool _disposed;

	    /// <summary>
	    /// Creates a parser and downloads the page at <paramref name="url"/>.
	    /// </summary>
	    /// <param name="url">Absolute URL of the page to download and parse.</param>
	    public DomParser(string url)
	    {
	        this.Load(url, string.Empty);
	    }

	    /// <summary>
	    /// Creates a parser for markup that is already in memory.
	    /// </summary>
	    /// <param name="url">URL the markup belongs to; used to resolve relative links.</param>
	    /// <param name="pageSource">HTML markup; when null or empty the page is downloaded instead.</param>
	    public DomParser(string url, string pageSource)
	    {
	        this.Load(url, pageSource);
	    }

	    /// <summary>
	    /// Replaces the current document with the page at <paramref name="url"/>.
	    /// </summary>
	    /// <param name="url">Absolute URL of the page.</param>
	    /// <param name="pageSource">
	    /// HTML markup to parse. When null or empty, the markup is fetched from
	    /// <paramref name="url"/> with an HTTP GET request.
	    /// </param>
	    /// <exception cref="Exception">
	    /// The download failed (the original failure is attached as the inner exception),
	    /// or the document reports no parse-error collection.
	    /// </exception>
	    public void Load(string url, string pageSource)
	    {
	        this.Url = url;
	        this.Document = new();

	        if (string.IsNullOrEmpty(pageSource))
	        {
	            try
	            {
	                var request = (HttpWebRequest)WebRequest.Create(url);
	                request.Method = "GET";

	                using var response = (HttpWebResponse)request.GetResponse();
	                using var stream = response.GetResponseStream();
	                this.Document.Load(stream, ResolveEncoding(response.CharacterSet));
	            }
	            catch (Exception ex)
	            {
	                // Keep the message callers may already match on, but preserve the root cause.
	                throw new Exception("Cannot retrieve page source.", ex);
	            }
	        }
	        else
	        {
	            this.Document.LoadHtml(pageSource);
	        }

	        // NOTE(review): this only rejects a null error collection; it does not look at
	        // whether any parse errors were recorded - confirm that is the intent.
	        if (this.Document.ParseErrors == null)
	            throw new Exception("Invalid Page Source");
	    }

	    /// <summary>
	    /// Extracts all supported page data from the loaded document.
	    /// </summary>
	    /// <returns>A newly populated <see cref="PageDom"/>.</returns>
	    /// <exception cref="ObjectDisposedException">The parser holds no document (it was disposed).</exception>
	    public PageDom GetPageDom()
	    {
	        if (this.Document is null)
	            throw new ObjectDisposedException(nameof(DomParser));

	        this.Dom = new(this.Url);
	        this.Dom.Domain = new Uri(this.Url).Host;

	        this.ParseMetaTags();
	        this.ParseHeaderTags();
	        this.ParsePTags();
	        this.ParseImgUrls();
	        this.ParseUrls();

	        return this.Dom;
	    }

	    /// <summary>
	    /// Chooses the encoding used to decode a downloaded page: the charset reported by
	    /// the response when the runtime recognises it, otherwise the fallback encoding,
	    /// otherwise UTF-8.
	    /// </summary>
	    private static Encoding ResolveEncoding(string charset)
	    {
	        foreach (string candidate in new[] { charset, FallbackEncodingName })
	        {
	            if (string.IsNullOrWhiteSpace(candidate))
	                continue;

	            try
	            {
	                return Encoding.GetEncoding(candidate.Trim().Trim('"', '\''));
	            }
	            catch (ArgumentException)
	            {
	                // Unknown or unregistered encoding name: try the next candidate.
	            }
	            catch (NotSupportedException)
	            {
	                // Encoding not available on this platform: try the next candidate.
	            }
	        }

	        return Encoding.UTF8;
	    }

	    /// <summary>
	    /// Resolves <paramref name="reference"/> against <paramref name="baseUri"/> and
	    /// accepts the result only when it is an http or https URI.
	    /// </summary>
	    private static bool TryResolveWebUri(Uri baseUri, string reference, out Uri result)
	    {
	        if (Uri.TryCreate(baseUri, reference, out result))
	        {
	            // NOTE(review): a rooted path such as "/img/a.png" can be parsed as an implicit
	            // file URI on some platforms; re-resolve it as a relative reference in that case.
	            if (result.IsFile && Uri.TryCreate(reference, UriKind.Relative, out Uri relative))
	                Uri.TryCreate(baseUri, relative, out result);

	            if (result != null
	                && (result.Scheme == Uri.UriSchemeHttp || result.Scheme == Uri.UriSchemeHttps))
	            {
	                return true;
	            }
	        }

	        result = null;
	        return false;
	    }

	    /// <summary>Returns the inner text of every element named <paramref name="tagName"/>, in document order.</summary>
	    private List<string> GetInnerTexts(string tagName)
	    {
	        return this.Document.DocumentNode
	            .Descendants(tagName).Select(node => node.InnerText).ToList();
	    }

	    /// <summary>
	    /// Returns the <c>content</c> attribute of the first <c>meta</c> element whose
	    /// <c>name</c> equals <paramref name="name"/>, or null when the element or attribute is absent.
	    /// </summary>
	    private string GetMetaContent(string name)
	    {
	        return this.Document.DocumentNode
	            .SelectSingleNode("//meta[@name='" + name + "']")
	            ?.Attributes["content"]?.Value;
	    }

	    /// <summary>
	    /// Fills the title and meta-tag properties. A value is left null when the page
	    /// does not contain the corresponding element.
	    /// </summary>
	    private void ParseMetaTags()
	    {
	        // First match instead of "single": a page may contain more than one <title> element.
	        this.Dom.Title = this.Document.DocumentNode
	            .Descendants("title").FirstOrDefault()?.InnerText;

	        this.Dom.Language = this.GetMetaContent("language");
	        this.Dom.Author = this.GetMetaContent("author");
	        this.Dom.Description = this.GetMetaContent("description");
	        this.Dom.Keywords = this.GetMetaContent("keywords");
	        this.Dom.Robots = this.GetMetaContent("robots");
	    }

	    /// <summary>Collects the inner text of every h1-h6 element.</summary>
	    private void ParseHeaderTags()
	    {
	        this.Dom.H1 = this.GetInnerTexts("h1");
	        this.Dom.H2 = this.GetInnerTexts("h2");
	        this.Dom.H3 = this.GetInnerTexts("h3");
	        this.Dom.H4 = this.GetInnerTexts("h4");
	        this.Dom.H5 = this.GetInnerTexts("h5");
	        this.Dom.H6 = this.GetInnerTexts("h6");
	    }

	    /// <summary>Collects the inner text of every p element.</summary>
	    private void ParsePTags()
	    {
	        this.Dom.P = this.GetInnerTexts("p");
	    }

	    /// <summary>
	    /// Collects the absolute http/https URL of every img element, in document order.
	    /// Images without a src, embedded data: images and unresolvable values are skipped.
	    /// </summary>
	    private void ParseImgUrls()
	    {
	        var baseUri = new Uri(this.Url);
	        var imageUrls = new List<string>();

	        foreach (var image in this.Document.DocumentNode.Descendants("img"))
	        {
	            string source = image.Attributes["src"]?.Value?.Trim();

	            if (string.IsNullOrEmpty(source))
	                continue;

	            // remove embedded image
	            if (source.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
	                continue;

	            // Relative paths are resolved against the page URL (scheme and host included).
	            if (TryResolveWebUri(baseUri, source, out Uri resolved))
	                imageUrls.Add(resolved.AbsoluteUri);
	        }

	        this.Dom.ImageUrls = imageUrls;
	    }

	    /// <summary>
	    /// Classifies every a[href] link of the page. Blacklisted, empty, non-web and
	    /// unresolvable hrefs are dropped. For a link whose absolute form contains '?',
	    /// only its query string is stored in QueryUrls; every other link goes to
	    /// InternalUrls or ExternalUrls depending on its host.
	    /// </summary>
	    private void ParseUrls()
	    {
	        this.Dom.QueryUrls = new();
	        this.Dom.InternalUrls = new();
	        this.Dom.ExternalUrls = new();

	        // SelectNodes yields null rather than an empty collection when nothing matches.
	        var anchors = this.Document.DocumentNode.SelectNodes("//a[@href]");
	        if (anchors is null)
	            return;

	        var baseUri = new Uri(this.Url);

	        // Walk last-to-first so each output list keeps the reverse-document
	        // ordering this method produced before.
	        for (int i = anchors.Count - 1; i >= 0; i--)
	        {
	            string href = anchors[i].GetAttributeValue("href", string.Empty).Trim();

	            if (href.Length == 0)
	                continue;

	            // blacklist keywords
	            if (LinkBlacklist.Any(term => href.Contains(term)))
	                continue;

	            // Absolute hrefs are kept as they are; relative ones are resolved against the page URL.
	            if (!TryResolveWebUri(baseUri, href, out Uri resolved))
	                continue;

	            string absolute = resolved.AbsoluteUri;

	            if (absolute.Contains("?"))
	            {
	                this.Dom.QueryUrls.Add(resolved.Query);
	            }
	            else if (this.IsInternalHost(resolved.Host))
	            {
	                this.Dom.InternalUrls.Add(absolute);
	            }
	            else
	            {
	                this.Dom.ExternalUrls.Add(absolute);
	            }
	        }
	    }

	    /// <summary>
	    /// True when <paramref name="host"/> is the page's own host or a sub-domain of it.
	    /// Compares hosts, so a domain name that merely appears in a path does not count.
	    /// </summary>
	    private bool IsInternalHost(string host)
	    {
	        string domain = this.Dom.Domain;

	        return host.Equals(domain, StringComparison.OrdinalIgnoreCase)
	            || host.EndsWith("." + domain, StringComparison.OrdinalIgnoreCase);
	    }

	    /// <summary>
	    /// Drops the references to the loaded document and the last extracted result.
	    /// Calling it more than once has no further effect.
	    /// </summary>
	    public void Dispose()
	    {
	        if (!_disposed)
	        {
	            _disposed = true;
	            this.Dom = null;
	            this.Document = null;
	            GC.SuppressFinalize(this);
	        }
	    }
	}
	}
	using System.Collections.Generic;

	namespace HtmlParser
	{
	/// <summary>
	/// Plain data holder for the information extracted from one HTML page.
	/// String properties stay null until a value is assigned; list properties
	/// start out empty so they can always be enumerated safely.
	/// </summary>
	public class PageDom
	{
	    /// <summary>Creates an empty result for the given page.</summary>
	    /// <param name="url">URL of the page; stored in <see cref="Url"/> unchanged.</param>
	    public PageDom(string url)
	    {
	        this.Url = url;
	    }

	    // General Dom Info

	    /// <summary>URL passed to the constructor.</summary>
	    public string Url { get; set; }

	    /// <summary>Domain associated with the page; set by whoever populates this object.</summary>
	    public string Domain { get; set; }

	    // Page Meta Elements
	    public string Title { get; set; }
	    public string Language { get; set; }
	    public string Author { get; set; }
	    public string Description { get; set; }
	    public string Keywords { get; set; }
	    public string Robots { get; set; }

	    // Page Element Lists (never null by default)
	    public List<string> H1 { get; set; } = new();
	    public List<string> H2 { get; set; } = new();
	    public List<string> H3 { get; set; } = new();
	    public List<string> H4 { get; set; } = new();
	    public List<string> H5 { get; set; } = new();
	    public List<string> H6 { get; set; } = new();
	    public List<string> P { get; set; } = new();

	    // Page Urls (never null by default)
	    public List<string> ImageUrls { get; set; } = new();
	    public List<string> InternalUrls { get; set; } = new();
	    public List<string> ExternalUrls { get; set; } = new();
	    public List<string> QueryUrls { get; set; } = new();
	}
	}