Skip to content

Instantly share code, notes, and snippets.

@DanielBaumert
Last active July 1, 2023 22:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DanielBaumert/1b4f0ed1793bef4458648457f1af07d5 to your computer and use it in GitHub Desktop.
Save DanielBaumert/1b4f0ed1793bef4458648457f1af07d5 to your computer and use it in GitHub Desktop.
HTML parse - example
using System.Collections;
using System.Runtime.CompilerServices;
using System.Text;
namespace EasyWeb;
public class HtmlParser
{
// Parents of the nesting levels that are currently open; pushed and popped by ParseTag.
private Stack<HtmlElement?> _rootElements = null!;
// Element assigned as Root to every node created at the current nesting level.
private HtmlElement? _rootElement;
// Sibling cursors of the enclosing nesting levels, saved while a child list is parsed.
private Stack<HtmlElement?> _previewSiblings = null!;
// Most recently created node at the current nesting level (maintained by SetSibling).
private HtmlElement? _previewSibling;
// The element whose tag is "body", recorded by ParseTag once it has been parsed.
private TagElement? _body;
// Elements indexed by the value of their id attribute.
private IDictionary<string, HtmlElement> _idElements = null!;
// Void elements: they never have content or a closing tag, so ParseTag must not look for
// children after them. The list follows the HTML standard; the original subset missed
// common ones such as <hr> and <input>, which were then parsed as containers.
private static readonly HashSet<string> s_inlineTags = new HashSet<string>
{
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr"
};
/// <summary>
/// Parses <paramref name="htmlSource"/> and returns the resulting document.
/// </summary>
/// <param name="htmlSource">Markup to parse.</param>
/// <returns>A document holding the top-level nodes, the id lookup table and the body element (if any).</returns>
public HtmlDocument ParseHtml(ReadOnlySpan<char> htmlSource)
{
    int i = 0;
    _rootElements = new Stack<HtmlElement?>();
    _previewSiblings = new Stack<HtmlElement?>();
    _idElements = new Dictionary<string, HtmlElement>();
    // A parser instance can be reused, and an earlier run may have ended with an exception.
    // Without this reset the first node of this document would be linked as a sibling of the
    // previous document's last node, and a stale body/root could leak into the result.
    _rootElement = null;
    _previewSibling = null;
    _body = null;
    HtmlElement[] topLevelElements = ParseHtml(htmlSource, ref i);
    // _body is read only after parsing has finished, exactly as before.
    return new HtmlDocument(_idElements, topLevelElements, _body);
}
/// <summary>
/// Parses a run of sibling nodes starting at <paramref name="i"/>. Parsing stops at the first
/// closing-tag signature ("&lt;/") or at the end of the input, and <paramref name="i"/> is left there.
/// </summary>
/// <param name="htmlSource">Complete markup being parsed.</param>
/// <param name="i">Current read position; advanced past everything that was consumed.</param>
/// <returns>The parsed siblings in document order; empty when there is nothing to parse.</returns>
private HtmlElement[] ParseHtml(ReadOnlySpan<char> htmlSource, ref int i)
{
    List<HtmlElement> elements = new List<HtmlElement>();
    // Bounds-checked so that empty or whitespace-only input yields no nodes instead of throwing.
    while (i < htmlSource.Length && IsSpace(htmlSource[i]))
    {
        i++;
    }
    int start = i;
    while (i < htmlSource.Length)
    {
        // Ordinary text. A '<' that is the very last character cannot start markup either.
        if (htmlSource[i] is not '<' || i + 1 >= htmlSource.Length)
        {
            i++;
            continue;
        }
        char next = htmlSource[i + 1];
        if (next is '/')
        {
            // closing tag of the enclosing element: the caller handles it
            break;
        }
        if (next is '<')
        {
            // "<<" is not markup: keep everything up to and including the next '>' as text
            i += 2;
            while (i < htmlSource.Length && htmlSource[i] is not '>')
            {
                i++;
            }
            if (i < htmlSource.Length)
            {
                i++;
            }
            continue;
        }
        if (i != start)
        {
            // text is between the last tag-closing char and the tag-open char
            elements.Add(SetSibling(new TextElement { SpanStart = start, SpanEnd = i, Value = new string(htmlSource[start..i]), Root = _rootElement }));
        }
        string? idValue = null;
        HtmlElement element = ParseTagOrComment(htmlSource, ref i, ref idValue);
        elements.Add(element);
        // Duplicate ids are common in real-world markup; the first element keeps the id
        // instead of the whole parse failing inside Dictionary.Add.
        if (idValue != null && !_idElements.ContainsKey(idValue))
        {
            _idElements.Add(idValue, element);
        }
        // skip line breaks and tabs behind an element
        while (i < htmlSource.Length && IsSpace(htmlSource[i]))
        {
            i++;
        }
        // set a new start for the next element
        start = i;
    }
    if (i != start)
    {
        // trailing text before the closing tag or the end of the input
        elements.Add(SetSibling(new TextElement { SpanStart = start, SpanEnd = i, Value = new string(htmlSource[start..i]), Root = _rootElement }));
    }
    return elements.ToArray();
}
/// <summary>
/// Parses the construct that starts at the '&lt;' located at <paramref name="i"/>: an element,
/// a DOCTYPE declaration or a comment.
/// </summary>
/// <param name="htmlSource">Complete markup being parsed.</param>
/// <param name="i">Index of the '&lt;'; advanced past the parsed construct.</param>
/// <param name="idValue">Receives the id attribute value when an element declares one.</param>
/// <returns>The parsed node, already linked to its previous sibling.</returns>
/// <exception cref="FormatException">The construct is not recognized or is not terminated.</exception>
private HtmlElement ParseTagOrComment(ReadOnlySpan<char> htmlSource, ref int i, ref string? idValue)
{
    char charAt = htmlSource[++i];
    switch (charAt)
    {
        case >= 'a' and <= 'z':
            return ParseTag(htmlSource, ref i, ref idValue);
        case '!':
            int commentStart = i - 1; // get the '<'-char back
            // The length guards keep the signature helpers from reading past the end of the input.
            if (i + 7 < htmlSource.Length && IsDocTypeElement(htmlSource, i + 1)) // DOCTYPE
            {
                i += 7; // skip 'DOCTYPE'
                while (i < htmlSource.Length && htmlSource[i] is not '>') // end of the declaration
                {
                    i++;
                }
                if (i >= htmlSource.Length)
                {
                    throw new FormatException("The DOCTYPE declaration is not terminated by '>'.");
                }
                i++;
                string commentContent = new string(htmlSource[commentStart..i]); // the whole declaration, '<!' up to and including '>'
                return SetSibling(new CommentElement { SpanStart = commentStart, SpanEnd = i, Value = commentContent, Root = _rootElement });
            }
            else if (commentStart + 3 < htmlSource.Length && IsCommentOpen(htmlSource, commentStart))
            {
                commentStart = (i += 3); // skip '<!--'
                while (i + 2 < htmlSource.Length && !IsCommentClose(htmlSource, i))
                {
                    i++;
                }
                if (i + 2 >= htmlSource.Length)
                {
                    throw new FormatException("The comment is not terminated by '-->'.");
                }
                string commentContent = new string(htmlSource[commentStart..i]); // '<!--':commentContent:'-->'
                i += 3; // skip '-->'
                // Root was missing here, so comments (unlike DOCTYPE nodes) had no parent.
                return SetSibling(new CommentElement { SpanStart = commentStart, SpanEnd = i, Value = commentContent, Root = _rootElement });
            }
            else
            {
                throw new FormatException();
            }
        default:
            throw new FormatException($"The character '{charAt}' isn't recognized!");
    }
}
/// <summary>
/// Parses an element whose name starts at <paramref name="i"/> (the character after '&lt;'),
/// including its attributes, its children and its closing tag.
/// </summary>
/// <param name="htmlSource">Complete markup being parsed.</param>
/// <param name="i">Index of the first name character; advanced past the whole element.</param>
/// <param name="idValue">Receives the id attribute value when the element declares one.</param>
/// <returns>The parsed element, already linked to its previous sibling.</returns>
/// <exception cref="FormatException">
/// The tag is malformed, the input ends inside the element, or the closing tag does not match.
/// </exception>
private HtmlElement ParseTag(ReadOnlySpan<char> htmlSource, ref int i, ref string? idValue)
{
    int tagStart = i - 1; // get the open tag symbol (<)
    TagElement element = new TagElement { Tag = ParseTagName(htmlSource, ref i) };
    while (true)
    {
        if (i >= htmlSource.Length)
        {
            throw new FormatException($"Unexpected end of input inside the <{element.Tag}> tag.");
        }
        switch (htmlSource[i])
        {
            case ' ' or '\n' or '\r' or '\t':
                i++;
                continue;
            case >= 'a' and <= 'z' or '_': // attribute
                // ParseAttributes already returns an array; no extra copy is needed.
                element.Attributes = ParseAttributes(htmlSource, ref i, element, ref idValue);
                break;
            case '/':
                i++;
                if (i >= htmlSource.Length || htmlSource[i] is not '>')
                {
                    throw new FormatException($"Expected '>' after '/' in the <{element.Tag}> tag.");
                }
                // Consume the '>' as well; it used to be left behind and ended up in the next text node.
                i++;
                element.SpanStart = tagStart;
                element.SpanEnd = i;
                element.Root = _rootElement;
                return SetSibling(element);
            case '>':
                i++;
                element.SpanStart = tagStart;
                element.SpanEnd = i;
                element.Root = _rootElement;
                if (s_inlineTags.Contains(element.Tag))
                {
                    // Void element: no children and no closing tag. SpanEnd is set above; it was
                    // previously guarded by a check for '/' that could never succeed at this point.
                    return SetSibling(element);
                }
                if (CheckSkipTags(htmlSource, ref i, element.Tag, out int skipRange))
                {
                    // content of style/svg/script is skipped together with the closing tag
                    i += skipRange;
                    element.SpanEnd = i;
                    return SetSibling(element);
                }
                // save last root
                _rootElements.Push(_rootElement);
                _rootElement = element;
                // clear sibling but save current
                _previewSiblings.Push(_previewSibling);
                _previewSibling = null;
                // get child elements
                element.Elements = ParseHtml(htmlSource, ref i);
                // get last sibling
                _previewSibling = _previewSiblings.Pop();
                // get preview root
                _rootElement = _rootElements.Pop();
                // check end tag - equals
                if (i + 1 >= htmlSource.Length || IsNotCloseTagSig(htmlSource, i))
                {
                    throw new FormatException($"The <{element.Tag}> element is never closed.");
                }
                i += 2; // skip '</'
                int nameLength = element.Tag.Length;
                // The name has to match completely. A partial comparison used to skip one arbitrary
                // character and leave the rest of a mismatching closing tag in the tree as text.
                if (htmlSource.Length - i < nameLength
                    || !htmlSource.Slice(i, nameLength).Equals(element.Tag, StringComparison.OrdinalIgnoreCase))
                {
                    throw new FormatException($"Expected the closing tag </{element.Tag}>.");
                }
                i += nameLength;
                // whitespace is allowed between the name and '>'
                while (i < htmlSource.Length && htmlSource[i] is (' ' or '\n' or '\r' or '\t'))
                {
                    i++;
                }
                if (i >= htmlSource.Length || htmlSource[i] is not '>')
                {
                    throw new FormatException($"The closing tag </{element.Tag}> is not terminated by '>'.");
                }
                i++; // skip '>'
                // end - check end tag - equals
                if (element.Tag.Equals("body"))
                {
                    _body = element;
                }
                return SetSibling(element);
            default:
                throw new FormatException();
        }
    }
}
/// <summary>
/// Tells whether the two characters at <paramref name="i"/> are NOT the closing-tag signature "&lt;/".
/// </summary>
/// <param name="htmlSource">Markup to inspect.</param>
/// <param name="i">Index of the first of the two characters.</param>
/// <returns>
/// <see langword="false" /> when a closing tag starts at <paramref name="i"/>; otherwise
/// <see langword="true" />, including when fewer than two characters remain.
/// </returns>
private static bool IsNotCloseTagSig(ReadOnlySpan<char> htmlSource, int i)
{
    // A lone '<' at the very end used to trigger a read past the end of the span.
    if (i + 1 >= htmlSource.Length)
    {
        return true;
    }
    return htmlSource[i] is not '<' || htmlSource[i + 1] is not '/';
}
/// <summary>
/// Reads a tag name: the character at <paramref name="i"/> plus every ASCII letter or digit that follows.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Index of the first name character; left on the first character after the name.</param>
/// <returns>The tag name.</returns>
private static string ParseTagName(ReadOnlySpan<char> htmlSource, ref int i)
{
    int tagNameStart = i;
    i++; // the caller has already validated the first character
    // Bounds-checked: a name that runs up to the end of the input no longer reads past it.
    while (i < htmlSource.Length && IsAsciiAlphaNumeric(htmlSource[i]))
    {
        i++;
    }
    return new string(htmlSource[tagNameStart..i]);
}
/// <summary>
/// Parses all attributes of a start tag, beginning at the first character of the first attribute.
/// "class" and "id" are stored on <paramref name="element"/>; everything else is returned.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Index of the first attribute; left on the '&gt;' or '/' that ends the tag.</param>
/// <param name="element">Element that receives the class list and the id.</param>
/// <param name="idValue">Receives the value of the id attribute, if present.</param>
/// <returns>The attributes other than text-valued "class" and "id", in source order.</returns>
/// <exception cref="FormatException">The input ends before the tag is closed.</exception>
private static HtmlAttributeBase[] ParseAttributes(ReadOnlySpan<char> htmlSource, ref int i, TagElement element, ref string? idValue)
{
    List<HtmlAttributeBase> attributes = new List<HtmlAttributeBase>();
    while (true)
    {
        HtmlAttributeBase attribute = ParseAttribute(htmlSource, ref i);
        if (attribute is HtmlAttributeText textAttribute)
        {
            // No Trim needed: every attribute now starts on a non-blank character (see below).
            switch (textAttribute.Name)
            {
                case "class":
                    // drop the empty entries that repeated blanks would otherwise produce
                    element.Classes = textAttribute.Value.Split(' ', StringSplitOptions.RemoveEmptyEntries);
                    break;
                case "id":
                    idValue = element.Id = textAttribute.Value;
                    break;
                default:
                    attributes.Add(attribute);
                    break;
            }
        }
        else
        {
            attributes.Add(attribute);
        }
        // Skip ALL whitespace behind an attribute. IsSpace deliberately ignores ' ', which made
        // the blank part of the next attribute's name and turned "a  b" or a blank before '>'
        // into bogus attributes.
        while (i < htmlSource.Length && htmlSource[i] is (' ' or '\t' or '\r' or '\n'))
        {
            i++;
        }
        if (i >= htmlSource.Length)
        {
            throw new FormatException("Unexpected end of input inside a tag.");
        }
        // stop at the end of the tag: '/>' or '>'
        if (htmlSource[i] is '>' or '/')
        {
            break;
        }
    }
    return attributes.ToArray();
}
/// <summary>
/// Parses a single attribute starting at <paramref name="i"/>: a bare name, or a name followed by '=' and a value.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">
/// Index of the first name character; left directly behind the name for a bare attribute, or
/// wherever <c>ReadAttributeType</c> leaves it for an attribute with a value.
/// </param>
/// <returns>A valueless <see cref="HtmlAttribute"/> or the typed attribute created by <c>ReadAttributeType</c>.</returns>
private static HtmlAttributeBase ParseAttribute(ReadOnlySpan<char> htmlSource, ref int i)
{
    int firstLetter = i;
    i++; // next letter:
    // A name ends at whitespace, '=', '/' or '>'. The former a-z/_/- whitelist cut names such
    // as "viewBox", "data-x1" or "xlink:href" into several bogus attributes.
    while (i < htmlSource.Length && htmlSource[i] is not (' ' or '\t' or '\r' or '\n' or '=' or '/' or '>'))
    {
        i++;
    }
    string attributeName = new string(htmlSource[firstLetter..i]);
    // Look ahead for '=' without committing: a bare attribute must leave i behind its name.
    int virtualI = i;
    while (virtualI < htmlSource.Length && htmlSource[virtualI] is (' ' or '\t' or '\r' or '\n'))
    {
        virtualI++;
    }
    if (virtualI < htmlSource.Length && htmlSource[virtualI] is '=')
    {
        i = virtualI;
        return ReadAttributeType(htmlSource, ref i, firstLetter, attributeName);
    }
    return new HtmlAttribute { SpanStart = firstLetter, SpanEnd = i - 1, Name = attributeName };
}
/// <summary>
/// Reads the value behind the '=' at <paramref name="i"/> and creates the matching attribute
/// object. The first value character selects the kind: t/f boolean, quote text, digit number.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Index of the '='; left on the first character behind the value.</param>
/// <param name="attributeNameStart">Source index of the attribute name, stored as SpanStart.</param>
/// <param name="attributeName">Name of the attribute.</param>
/// <returns>The attribute with its parsed value; SpanEnd is the index of the value's last character.</returns>
/// <exception cref="FormatException">The value is missing or starts with an unsupported character.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static HtmlAttributeBase ReadAttributeType(ReadOnlySpan<char> htmlSource, ref int i, int attributeNameStart, string attributeName)
{
    i++; // step over '='
    // Blanks are skipped too (IsSpace ignores ' '), so name= "value" is accepted.
    while (i < htmlSource.Length && htmlSource[i] is (' ' or '\t' or '\r' or '\n'))
    {
        i++;
    }
    if (i >= htmlSource.Length)
    {
        throw new FormatException($"The attribute '{attributeName}' has no value.");
    }
    // Explicit upper/lower cases replace the culture-sensitive char.ToLower call.
    switch (htmlSource[i])
    {
        case 't' or 'T':
        {
            bool value = ReadTrue(htmlSource, ref i);
            int end = i++; // i is on the last letter of the literal: record it, then step past it
            return new HtmlAttributeBoolean { Name = attributeName, SpanStart = attributeNameStart, Value = value, SpanEnd = end };
        }
        case 'f' or 'F':
        {
            bool value = ReadFalse(htmlSource, ref i);
            int end = i++;
            return new HtmlAttributeBoolean { Name = attributeName, SpanStart = attributeNameStart, Value = value, SpanEnd = end };
        }
        case '"':
        {
            string value = ReadDoubleQuotedString(htmlSource, ref i);
            return new HtmlAttributeDoubleQuotedText { Name = attributeName, SpanStart = attributeNameStart, Value = value, SpanEnd = i - 1 };
        }
        case '\'':
        {
            string value = ReadSingleQuotedString(htmlSource, ref i);
            return new HtmlAttributeSingleQuotedText { Name = attributeName, SpanStart = attributeNameStart, Value = value, SpanEnd = i - 1 };
        }
        case >= '0' and <= '9':
        {
            uint value = ReadUInt(htmlSource, ref i);
            return new HtmlAttributeNumeric { Name = attributeName, SpanStart = attributeNameStart, Value = value, SpanEnd = i - 1 };
        }
        default:
            throw new FormatException($"The value of attribute '{attributeName}' starts with the unsupported character '{htmlSource[i]}'.");
    }
}
/// <summary>
/// Reads a double-quoted value whose opening quote is at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Index of the opening quote; left on the character behind the closing quote.</param>
/// <returns>The text between the quotes.</returns>
/// <exception cref="FormatException">The closing quote is missing.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static string ReadDoubleQuotedString(ReadOnlySpan<char> htmlSource, ref int i)
{
    int start = ++i; // first character behind the opening quote
    while (i < htmlSource.Length && htmlSource[i] != '"')
    {
        i++;
    }
    if (i >= htmlSource.Length)
    {
        // previously this ran off the end of the span with an IndexOutOfRangeException
        throw new FormatException("A double-quoted attribute value is not terminated.");
    }
    string value = new string(htmlSource[start..i]);
    i++; // step past the closing quote
    return value;
}
/// <summary>
/// Reads a single-quoted value whose opening quote is at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Index of the opening quote; left on the character behind the closing quote.</param>
/// <returns>The text between the quotes.</returns>
/// <exception cref="FormatException">The closing quote is missing.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static string ReadSingleQuotedString(ReadOnlySpan<char> htmlSource, ref int i)
{
    int start = ++i; // first character behind the opening quote
    while (i < htmlSource.Length && htmlSource[i] is not '\'')
    {
        i++;
    }
    if (i >= htmlSource.Length)
    {
        // previously this ran off the end of the span with an IndexOutOfRangeException
        throw new FormatException("A single-quoted attribute value is not terminated.");
    }
    string value = new string(htmlSource[start..i]);
    i++; // step past the closing quote
    return value;
}
/// <summary>
/// Reads an unsigned decimal number whose first digit is at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Index of the first digit; left on the first character behind the number.</param>
/// <returns>The parsed value.</returns>
/// <exception cref="OverflowException">The number does not fit into a <see cref="uint"/>.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint ReadUInt(ReadOnlySpan<char> htmlSource, ref int i)
{
    int start = i;
    i++; // the caller has already seen the first digit
    while (i < htmlSource.Length && htmlSource[i] is (>= '0' and <= '9'))
    {
        i++;
    }
    // The end of a range is exclusive, so the slice is start..i. The former start..(i - 1)
    // dropped the last digit and made every single-digit value fail inside uint.Parse.
    return uint.Parse(htmlSource[start..i], System.Globalization.NumberStyles.None, System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Tells whether the keyword DOCTYPE starts at <paramref name="i"/>, ignoring case.
/// </summary>
/// <param name="htmlSource">Markup to inspect.</param>
/// <param name="i">Index of the first keyword character (the one behind "&lt;!").</param>
/// <returns>
/// <see langword="true" /> when the keyword is present; <see langword="false" /> otherwise,
/// including when fewer than seven characters remain.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsDocTypeElement(ReadOnlySpan<char> htmlSource, int i)
{
    const string keyword = "DOCTYPE";
    // The keyword is case-insensitive in HTML ("<!doctype html>" is the common spelling),
    // and the length check keeps the comparison inside the span.
    return i >= 0
        && htmlSource.Length - i >= keyword.Length
        && htmlSource.Slice(i, keyword.Length).Equals(keyword, StringComparison.OrdinalIgnoreCase);
}
/// <summary>
/// Tells whether the comment opener "&lt;!--" starts at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to inspect.</param>
/// <param name="i">Index at which the opener is expected.</param>
/// <returns><see langword="true" /> when the four characters match; otherwise <see langword="false" />.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsCommentOpen(ReadOnlySpan<char> htmlSource, int i)
{
    // Compared left to right; the first mismatch ends the check.
    if (htmlSource[i] is not '<')
    {
        return false;
    }
    if (htmlSource[i + 1] is not '!')
    {
        return false;
    }
    if (htmlSource[i + 2] is not '-')
    {
        return false;
    }
    return htmlSource[i + 3] is '-';
}
/// <summary>
/// Tells whether the comment terminator "--&gt;" starts at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to inspect.</param>
/// <param name="i">Index at which the terminator is expected.</param>
/// <returns><see langword="true" /> when the three characters match; otherwise <see langword="false" />.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsCommentClose(ReadOnlySpan<char> htmlSource, int i)
{
    // Compared left to right; the first mismatch ends the check.
    if (htmlSource[i] is not '-')
    {
        return false;
    }
    if (htmlSource[i + 1] is not '-')
    {
        return false;
    }
    return htmlSource[i + 2] is '>';
}
/// <summary>
/// Tells whether the exact text "&lt;/style&gt;" starts at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to inspect.</param>
/// <param name="i">Index at which the closing tag is expected.</param>
/// <returns><see langword="true" /> when all eight characters match; otherwise <see langword="false" />.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsStyleCloseTag(ReadOnlySpan<char> htmlSource, int i)
{
    const string closeTag = "</style>";
    // Compared character by character from the left; the first mismatch ends the check.
    for (int offset = 0; offset < closeTag.Length; offset++)
    {
        if (htmlSource[i + offset] != closeTag[offset])
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Tells whether the exact text "&lt;/svg&gt;" starts at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to inspect.</param>
/// <param name="i">Index at which the closing tag is expected.</param>
/// <returns><see langword="true" /> when all six characters match; otherwise <see langword="false" />.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsSvgCloseTag(ReadOnlySpan<char> htmlSource, int i)
{
    const string closeTag = "</svg>";
    // Compared character by character from the left; the first mismatch ends the check.
    for (int offset = 0; offset < closeTag.Length; offset++)
    {
        if (htmlSource[i + offset] != closeTag[offset])
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Tells whether the exact text "&lt;/script&gt;" starts at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to inspect.</param>
/// <param name="i">Index at which the closing tag is expected.</param>
/// <returns><see langword="true" /> when all nine characters match; otherwise <see langword="false" />.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsScriptCloseTag(ReadOnlySpan<char> htmlSource, int i)
{
    const string closeTag = "</script>";
    // Compared character by character from the left; the first mismatch ends the check.
    for (int offset = 0; offset < closeTag.Length; offset++)
    {
        if (htmlSource[i + offset] != closeTag[offset])
        {
            return false;
        }
    }
    return true;
}
/// <summary>
/// Reads the literal "true" (any casing) that starts at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Index of the first letter; left on the LAST letter of the literal.</param>
/// <returns>Always <see langword="true" />.</returns>
/// <exception cref="FormatException">The text at <paramref name="i"/> is not the literal.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ReadTrue(ReadOnlySpan<char> htmlSource, ref int i)
{
    const string literal = "true";
    // The original compared the final 'e' without advancing first, i.e. against the 'u',
    // so every valid "true" was rejected. Comparing the whole literal at once avoids the
    // index bookkeeping and also guards against reading past the end of the input.
    if (i < 0
        || htmlSource.Length - i < literal.Length
        || !htmlSource.Slice(i, literal.Length).Equals(literal, StringComparison.OrdinalIgnoreCase))
    {
        throw new FormatException("Expected the literal 'true'.");
    }
    i += literal.Length - 1;
    return true;
}
/// <summary>
/// Reads the literal "false" (any casing) that starts at <paramref name="i"/>.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Index of the first letter; left on the LAST letter of the literal.</param>
/// <returns>Always <see langword="false" />.</returns>
/// <exception cref="FormatException">The text at <paramref name="i"/> is not the literal.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ReadFalse(ReadOnlySpan<char> htmlSource, ref int i)
{
    const string literal = "false";
    // The original compared the final 'e' without advancing first, i.e. against the 's',
    // so every valid "false" was rejected. Comparing the whole literal at once avoids the
    // index bookkeeping and also guards against reading past the end of the input.
    if (i < 0
        || htmlSource.Length - i < literal.Length
        || !htmlSource.Slice(i, literal.Length).Equals(literal, StringComparison.OrdinalIgnoreCase))
    {
        throw new FormatException("Expected the literal 'false'.");
    }
    i += literal.Length - 1;
    return false;
}
/// <summary>
/// Check if the char is '\r' or '\n' or '\t'. The blank (' ') is NOT matched.
/// </summary>
/// <param name="c">Char to check</param>
/// <returns><see langword="true" /> char is '\r' or '\n' or '\t'; Otherwise <see langword="false" /></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsSpace(char c)
{
    switch (c)
    {
        case '\r':
        case '\n':
        case '\t':
            return true;
        default:
            // includes ' ', which this helper does not count as space
            return false;
    }
}
/// <summary>
/// Check if the char is a decimal digit, i.e. between '0' and '9'
/// </summary>
/// <param name="c">Char to check</param>
/// <returns>
/// <see langword="true" /> char is >= '0' and <= '9'; Otherwise <see langword="false" />
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsNumber(char c)
{
    if (c < '0')
    {
        return false;
    }
    return c <= '9';
}
/// <summary>
/// Check if the char is an ASCII letter: between 'a' and 'z' or between 'A' and 'Z'
/// </summary>
/// <param name="c">char to check</param>
/// <returns><see langword="true" />, if the char is an ASCII letter; Otherwise <see langword="false" /></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsAsciiAlpha(char c)
{
    // Explicit ranges instead of char.ToLower: that call uses the current culture (Turkish
    // maps 'I' to a dotless i outside a-z) and can map non-ASCII characters into a-z.
    return c is (>= 'a' and <= 'z') or (>= 'A' and <= 'Z');
}
/// <summary>
/// Check if the char is an ASCII letter ('a'-'z', 'A'-'Z') or a digit ('0'-'9')
/// </summary>
/// <param name="c">char to check</param>
/// <returns><see langword="true" />, if the char is an ASCII letter or digit; Otherwise <see langword="false" /></returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsAsciiAlphaNumeric(char c)
{
    // Explicit ranges instead of char.ToLower: that call uses the current culture (Turkish
    // maps 'I' to a dotless i outside a-z) and can map non-ASCII characters into a-z.
    return c is (>= 'a' and <= 'z') or (>= 'A' and <= 'Z') or (>= '0' and <= '9');
}
/// <summary>
/// For the elements whose content is not parsed (style, svg, script): moves
/// <paramref name="i"/> forward to the matching closing tag.
/// </summary>
/// <param name="htmlSource">Markup to read from.</param>
/// <param name="i">Position behind the start tag; left on the '&lt;' of the closing tag for skipped elements.</param>
/// <param name="tagName">Name of the element whose start tag was just parsed.</param>
/// <param name="skipRange">Length of the closing tag, or 0 when the element is not skipped.</param>
/// <returns><see langword="true" /> when the content was skipped; otherwise <see langword="false" />.</returns>
/// <exception cref="FormatException">The closing tag of a skipped element is missing.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool CheckSkipTags(ReadOnlySpan<char> htmlSource, ref int i, string tagName, out int skipRange)
{
    // Each scan only probes positions where the complete closing tag still fits, so a
    // missing closing tag can no longer run past the end of the input.
    switch (tagName)
    {
        case "style":
            skipRange = 8; // length of "</style>"
            while (i + skipRange <= htmlSource.Length && !IsStyleCloseTag(htmlSource, i))
            {
                i++;
            }
            break;
        case "svg":
            skipRange = 6; // length of "</svg>"
            while (i + skipRange <= htmlSource.Length && !IsSvgCloseTag(htmlSource, i))
            {
                i++;
            }
            break;
        case "script":
            skipRange = 9; // length of "</script>"
            while (i + skipRange <= htmlSource.Length && !IsScriptCloseTag(htmlSource, i))
            {
                i++;
            }
            break;
        default:
            skipRange = 0;
            return false;
    }
    if (i + skipRange > htmlSource.Length)
    {
        throw new FormatException($"The <{tagName}> element is never closed.");
    }
    return true;
}
/// <summary>
/// Links <paramref name="nextElement"/> behind the most recently created element of the current
/// nesting level and makes it the new "most recent" element.
/// </summary>
/// <param name="nextElement">Element to append to the sibling chain.</param>
/// <returns><paramref name="nextElement"/>, to allow inline use.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private HtmlElement SetSibling(HtmlElement nextElement)
{
    HtmlElement? previous = _previewSibling;
    nextElement.PreviewSibling = previous;
    if (previous != null)
    {
        // complete the two-way link
        previous.NextSibling = nextElement;
    }
    _previewSibling = nextElement;
    return nextElement;
}
}
/// <summary>
/// Result of a parse run: the top-level nodes, the id lookup table and the body element.
/// </summary>
public class HtmlDocument
{
    /// <summary>Elements indexed by the value of their id attribute.</summary>
    public IDictionary<string, HtmlElement> IDs { get;}

    /// <summary>Top-level nodes of the document in source order.</summary>
    public HtmlElement[] Root { get; }

    /// <summary>The body element, or <see langword="null" /> when none was supplied.</summary>
    public TagElement? Body { get; }

    /// <summary>Creates a document from already parsed parts.</summary>
    /// <param name="ids">Elements indexed by id.</param>
    /// <param name="root">Top-level nodes.</param>
    /// <param name="body">The body element, if any.</param>
    public HtmlDocument(IDictionary<string, HtmlElement> ids, HtmlElement[] root, TagElement? body)
        => (IDs, Root, Body) = (ids, root, body);
}
/// <summary>
/// Base class of all nodes produced by the parser (tags, text, comments).
/// </summary>
public abstract class HtmlElement
{
/// <summary>Offset in the source text that the parser recorded as the start of this node.</summary>
public int SpanStart { get; internal set; }
/// <summary>Offset in the source text that the parser recorded as the end of this node.</summary>
public int SpanEnd { get; internal set; }
/// <summary>Parent node, or <see langword="null" /> when none was assigned.</summary>
public HtmlElement? Root { get; internal set; }
/// <summary>Previous sibling on the same nesting level, or <see langword="null" /> for the first node.</summary>
public HtmlElement? PreviewSibling { get; internal set; }
/// <summary>Next sibling on the same nesting level, or <see langword="null" /> for the last node.</summary>
public HtmlElement? NextSibling { get; internal set; }
/// <summary>
/// Serializes this node to markup.
/// </summary>
/// <param name="query">Callback that implementations invoke with the node being serialized.</param>
/// <returns>The markup of this node.</returns>
public abstract string ToHtml(Action<HtmlElement> query);
}
public class TagElement : HtmlElement
{
/// <summary>
/// Gets the child node at <paramref name="index"/>, or <see langword="null" /> when this element has no child list.
/// </summary>
/// <param name="index">Zero-based position within <see cref="Elements"/>.</param>
public HtmlElement? this[int index] => Elements?[index];
/// <summary>Tag name of the element.</summary>
public string Tag { get; internal set; } = null!;
/// <summary>Attributes of the element; <see langword="null" /> when none were stored.</summary>
public HtmlAttributeBase[]? Attributes { get; internal set; } = null;
/// <summary>Child nodes in source order; <see langword="null" /> when none were stored.</summary>
public HtmlElement[]? Elements { get; internal set; } = null;
/// <summary>Entries of the class attribute; <see langword="null" /> when the attribute is absent.</summary>
public string[]? Classes { get; internal set; } = null;
/// <summary>Value of the id attribute; <see langword="null" /> when the attribute is absent.</summary>
public string? Id { get; internal set; } = null;
/// <summary>
/// Lazily enumerates the child elements whose tag equals <paramref name="tagName"/>.
/// </summary>
/// <param name="tagName">Tag name to look for.</param>
/// <param name="comparison">String comparison used for the tag name.</param>
/// <param name="processChildElements">
/// When <see langword="true" />, the children of NON-matching elements are searched as well.
/// The children of a matching element are not searched.
/// </param>
/// <returns>The matching elements in document order.</returns>
public IEnumerable<TagElement> GetElementsByTagName(string tagName, StringComparison comparison = StringComparison.OrdinalIgnoreCase, bool processChildElements = false)
{
    if (Elements is null || Elements.Length == 0)
    {
        yield break;
    }
    // walk the sibling chain that starts at the first child
    for (HtmlElement? node = Elements[0]; node is not null; node = node.NextSibling)
    {
        if (node is not TagElement tag)
        {
            continue;
        }
        if (tag.Tag.Equals(tagName, comparison))
        {
            yield return tag;
            continue;
        }
        if (!processChildElements || tag.Elements == null)
        {
            continue;
        }
        foreach (TagElement nested in tag.GetElementsByTagName(tagName, comparison, processChildElements))
        {
            yield return nested;
        }
    }
}
/// <summary>
/// Finds the first child element whose tag equals <paramref name="tagName"/>.
/// </summary>
/// <param name="tagName">Tag name to look for.</param>
/// <param name="comparison">String comparison used for the tag name.</param>
/// <param name="processChildElements">When <see langword="true" />, the children of non-matching elements are searched depth-first.</param>
/// <returns>The first match in document order, or <see langword="null" /> when there is none.</returns>
public TagElement? GetFirstElementByTagName(string tagName, StringComparison comparison = StringComparison.OrdinalIgnoreCase, bool processChildElements = false)
{
    if (Elements == null || Elements.Length == 0)
    {
        return null;
    }
    // walk the sibling chain that starts at the first child
    for (HtmlElement? node = Elements[0]; node is not null; node = node.NextSibling)
    {
        if (node is not TagElement tag)
        {
            continue;
        }
        if (tag.Tag.Equals(tagName, comparison))
        {
            return tag;
        }
        if (!processChildElements || tag.Elements == null)
        {
            continue;
        }
        TagElement? nested = tag.GetFirstElementByTagName(tagName, comparison, processChildElements);
        if (nested != null)
        {
            return nested;
        }
    }
    return null;
}
/// <summary>
/// Finds the first child element accepted by <paramref name="comp"/>.
/// </summary>
/// <param name="comp">Predicate applied to each candidate element.</param>
/// <param name="processChildElements">When <see langword="true" />, the children of rejected elements are searched depth-first.</param>
/// <returns>The first accepted element in document order, or <see langword="null" /> when there is none.</returns>
public TagElement? GetFirstElement(Func<TagElement, bool> comp, bool processChildElements = false)
{
    if (Elements == null || Elements.Length == 0)
    {
        return null;
    }
    HtmlElement? node = Elements[0];
    do
    {
        TagElement? tag = node as TagElement;
        if (tag != null)
        {
            if (comp(tag))
            {
                return tag;
            }
            bool descend = processChildElements && tag.Elements != null;
            TagElement? nested = descend ? tag.GetFirstElement(comp, processChildElements) : null;
            if (nested != null)
            {
                return nested;
            }
        }
        // advance along the sibling chain
        node = node.NextSibling;
    }
    while (node != null);
    return null;
}
/// <summary>
/// Lazily enumerates the child elements accepted by <paramref name="query"/>.
/// </summary>
/// <param name="query">Predicate applied to each candidate element.</param>
/// <param name="processChildElements">
/// When <see langword="true" />, the children of REJECTED elements are searched as well.
/// The children of an accepted element are not searched.
/// </param>
/// <returns>The accepted elements in document order.</returns>
public IEnumerable<TagElement> Query(Func<TagElement, bool> query, bool processChildElements = false)
{
    if (Elements == null || Elements.Length == 0)
    {
        yield break;
    }
    HtmlElement? node = Elements[0];
    do
    {
        TagElement? tag = node as TagElement;
        if (tag != null)
        {
            if (query(tag))
            {
                yield return tag;
            }
            else if (processChildElements)
            {
                foreach (TagElement nested in tag.Query(query, true))
                {
                    yield return nested;
                }
            }
        }
        // advance along the sibling chain
        node = node.NextSibling;
    }
    while (node != null);
}
/// <summary>
/// Tells whether any ancestor of this element has the tag <paramref name="tagName"/>
/// (case-sensitive comparison).
/// </summary>
/// <param name="tagName">Tag name to look for.</param>
/// <returns><see langword="true" /> when a matching ancestor exists; otherwise <see langword="false" />.</returns>
public bool HasParentByTagName(string tagName)
{
    // climb the Root chain until it ends
    for (TagElement? ancestor = (TagElement?)Root; ancestor != null; ancestor = (TagElement?)ancestor.Root)
    {
        if (ancestor.Tag.Equals(tagName))
        {
            return true;
        }
    }
    return false;
}
/// <summary>
/// Looks for the nearest ancestor with the tag <paramref name="tagName"/> (case-sensitive comparison).
/// </summary>
/// <param name="tagName">Tag name to look for.</param>
/// <param name="parent">The nearest matching ancestor, or <see langword="null" /> when there is none.</param>
/// <returns><see langword="true" /> when a matching ancestor was found; otherwise <see langword="false" />.</returns>
public bool TryGetParentByTagName(string tagName, out TagElement? parent)
{
    // climb the Root chain until it ends
    for (TagElement? ancestor = (TagElement?)Root; ancestor is not null; ancestor = (TagElement?)ancestor.Root)
    {
        if (ancestor.Tag.Equals(tagName))
        {
            parent = ancestor;
            return true;
        }
    }
    parent = null;
    return false;
}
/// <summary>
/// Concatenates the text of all text nodes below this element, in document order.
/// </summary>
/// <returns>The collected text; empty when there are no text nodes.</returns>
public string ToText()
{
    StringBuilder text = new StringBuilder();
    ToText(text);
    return text.ToString();
}
/// <summary>
/// Appends the text of all text nodes below this element to <paramref name="builder"/>, in document order.
/// </summary>
/// <param name="builder">Receives the text.</param>
public void ToText(StringBuilder builder)
{
    if (Elements is null || Elements.Length == 0)
    {
        return;
    }
    // walk the sibling chain that starts at the first child
    for (HtmlElement? node = Elements[0]; node is not null; node = node.NextSibling)
    {
        switch (node)
        {
            case TextElement text:
                builder.Append(text.Value);
                break;
            case TagElement tag:
                // nested element: collect its text recursively
                tag.ToText(builder);
                break;
        }
    }
}
/// <summary>
/// Serializes this element and its children to markup, invoking <paramref name="query"/> for
/// every node right before it is serialized.
/// </summary>
/// <param name="query">Callback invoked with each node.</param>
/// <returns>The markup of this element.</returns>
public override string ToHtml(Action<HtmlElement> query)
{
    query(this);
    return BuildMarkup(child => child.ToHtml(query));
}

/// <summary>
/// Serializes this element and its children to markup.
/// </summary>
/// <returns>The markup of this element.</returns>
public override string ToString()
{
    return BuildMarkup(child => child.ToString());
}

// Shared rendering of ToHtml and ToString: start tag with id, classes and attributes, then
// either the rendered children plus the end tag, or "/>" when there are no children.
private string BuildMarkup(Func<HtmlElement, string?> renderChild)
{
    StringBuilder markup = new StringBuilder();
    markup.Append('<').Append(Tag);
    if (Id != null)
    {
        markup.Append(" id=\"").Append(Id).Append('"');
    }
    if (Classes != null && Classes.Length > 0)
    {
        markup.Append(" class=\"").Append(string.Join(' ', Classes)).Append('"');
    }
    if (Attributes != null && Attributes.Length > 0)
    {
        markup.Append(' ').Append(string.Join(' ', Attributes.Select(x => x.ToString())));
    }
    if (Elements == null || Elements.Length == 0)
    {
        markup.Append("/>");
        return markup.ToString();
    }
    markup.Append('>');
    foreach (HtmlElement child in Elements)
    {
        markup.Append(renderChild(child));
    }
    markup.Append("</").Append(Tag).Append('>');
    return markup.ToString();
}
}
/// <summary>
/// A run of plain text between tags.
/// </summary>
public class TextElement : HtmlElement
{
    /// <summary>The text exactly as it was stored by the parser.</summary>
    public string Value { get; internal set; } = null!;

    /// <summary>
    /// Invokes <paramref name="query"/> with this node and returns the text.
    /// </summary>
    /// <param name="query">Callback invoked with this node.</param>
    /// <returns>The text, or an empty string when no value is set.</returns>
    public override string ToHtml(Action<HtmlElement> query)
    {
        query(this);
        return ToString();
    }

    /// <summary>Returns the text, or an empty string when no value is set.</summary>
    public override string ToString() => Value ?? string.Empty;
}
/// <summary>
/// A comment, or a DOCTYPE declaration (the parser stores both as this node type).
/// </summary>
public class CommentElement : HtmlElement
{
    /// <summary>
    /// For comments the text between the delimiters; for a DOCTYPE declaration the complete
    /// declaration including its angle brackets.
    /// </summary>
    public string Value { get; internal set; } = null!;

    /// <summary>
    /// Invokes <paramref name="query"/> with this node and returns its markup.
    /// </summary>
    /// <param name="query">Callback invoked with this node.</param>
    /// <returns>The comment wrapped in comment delimiters, or the DOCTYPE declaration as is.</returns>
    public override string ToHtml(Action<HtmlElement> query)
    {
        query(this);
        return Render();
    }

    /// <summary>Returns the markup of this node.</summary>
    public override string ToString()
    {
        return Render();
    }

    // A DOCTYPE node already carries its full markup. Wrapping it in comment delimiters
    // turned the declaration into a comment, so a serialized document lost its doctype.
    private string Render()
    {
        if (Value is not null && Value.StartsWith("<!DOCTYPE", StringComparison.OrdinalIgnoreCase))
        {
            return Value;
        }
        return $"<!-- {Value} -->";
    }
}
/// <summary>
/// Base class of all attribute kinds; carries the name and the recorded source offsets.
/// </summary>
public abstract class HtmlAttributeBase
{
/// <summary>Offset in the source text recorded for the start of the attribute.</summary>
public int SpanStart { get; internal set; }
/// <summary>Offset in the source text recorded for the end of the attribute.</summary>
public int SpanEnd { get; internal set; }
/// <summary>Name of the attribute.</summary>
public string Name { get; internal init; } = null!;
}
/// <summary>
/// An attribute without a value.
/// </summary>
public class HtmlAttribute : HtmlAttributeBase
{
    /// <summary>Returns the attribute name only.</summary>
    public override string ToString() => Name;
}
/// <summary>
/// Base class of attributes that carry a text value.
/// </summary>
public abstract class HtmlAttributeText : HtmlAttributeBase
{
/// <summary>Text value of the attribute.</summary>
public string Value { get; set; } = null!;
}
/// <summary>
/// A text attribute that is rendered with double quotes.
/// </summary>
public class HtmlAttributeDoubleQuotedText : HtmlAttributeText
{
    /// <summary>Returns the attribute as name="value".</summary>
    public override string ToString() => string.Concat(Name, "=\"", Value, "\"");
}
/// <summary>
/// A text attribute that is rendered with single quotes.
/// </summary>
public class HtmlAttributeSingleQuotedText : HtmlAttributeText
{
    /// <summary>Returns the attribute as name='value'.</summary>
    public override string ToString() => string.Concat(Name, "='", Value, "'");
}
/// <summary>
/// An attribute with an unquoted boolean value.
/// </summary>
public class HtmlAttributeBoolean : HtmlAttributeBase
{
    /// <summary>Boolean value of the attribute.</summary>
    public bool Value { get; internal init; }

    /// <summary>Returns the attribute as name=true or name=false.</summary>
    public override string ToString()
    {
        // bool.ToString() yields "True"/"False"; the literals are written in lower case in markup
        return Name + "=" + (Value ? "true" : "false");
    }
}
/// <summary>
/// An attribute with an unquoted unsigned integer value.
/// </summary>
public class HtmlAttributeNumeric : HtmlAttributeBase
{
    /// <summary>Numeric value of the attribute.</summary>
    public uint Value { get; internal init; } = 0u;

    /// <summary>Returns the attribute as name=value.</summary>
    public override string ToString() => Name + "=" + Value.ToString();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment