Created
April 26, 2015 16:51
-
-
Save mouhong/c09487502e261f7ce53d to your computer and use it in GitHub Desktop.
HtmlTagClosing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Collections.Generic; | |
using System.Text; | |
namespace HtmlTagClosing | |
{ | |
/// <summary>
/// String helpers for repairing HTML fragments that were cut off mid-document.
/// </summary>
public static class StringExtensions
{
    /// <summary>
    /// Appends the missing end tags to an HTML fragment and drops a trailing,
    /// incomplete start or end tag.
    /// </summary>
    /// <param name="html">The HTML fragment to repair. May be null or blank.</param>
    /// <returns>
    /// The repaired fragment, or <paramref name="html"/> unchanged when it is
    /// null, empty or whitespace only.
    /// </returns>
    public static string CloseTags(this string html)
    {
        if (String.IsNullOrWhiteSpace(html))
        {
            return html;
        }

        return new TagCloser().CloseTags(html);
    }

    /// <summary>
    /// Single-pass character scanner that keeps a stack of the currently open
    /// elements. This is a lightweight heuristic, not a full HTML parser:
    /// comments, CDATA and script/style content get no special treatment.
    /// </summary>
    private sealed class TagCloser
    {
        // Elements that never take an end tag; pushing them would emit
        // invalid markup such as a closing tag for "br".
        private static readonly HashSet<string> VoidElements =
            new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            {
                "area", "base", "br", "col", "embed", "hr", "img", "input",
                "link", "meta", "param", "source", "track", "wbr"
            };

        private readonly Stack<string> _openTags = new Stack<string>();
        private string _html;
        private int _currentCharIndex;
        private int _pendingTagNameStartIndex;
        private string _pendingTagName;
        private int _currentEndTagStartIndex;
        // Quote character of the attribute value being scanned, or '\0' when
        // the scanner is not inside a quoted value.
        private char _attributeQuote;
        private State _state;

        private bool Eof
        {
            get { return _currentCharIndex >= _html.Length - 1; }
        }

        /// <summary>
        /// Scans <paramref name="html"/> once, then returns it with any
        /// incomplete trailing tag removed and an end tag appended for every
        /// element that is still open (innermost first).
        /// </summary>
        /// <param name="html">A non-null HTML fragment.</param>
        /// <returns>The repaired fragment.</returns>
        public string CloseTags(string html)
        {
            _html = html;
            _currentCharIndex = -1;
            _attributeQuote = '\0';
            _state = State.None;
            _openTags.Clear();

            while (!Eof)
            {
                var ch = ReadNext();
                switch (_state)
                {
                    case State.InTagName:
                        ScanTagName(ch);
                        break;
                    case State.InAttributes:
                        ScanAttributes(ch);
                        break;
                    case State.InEndTag:
                        if (ch == '>')
                        {
                            CloseEndTag();
                            _state = TextState();
                        }
                        break;
                    default:
                        // None and InsideTag both mean "in text content".
                        ScanText(ch);
                        break;
                }
            }

            // Input ended inside a start tag: cut everything from its '<'.
            if (_state == State.InTagName || _state == State.InAttributes)
            {
                _html = _html.Substring(0, _pendingTagNameStartIndex - 1);
            }
            // Input ended inside an end tag: cut everything from its '<'.
            else if (_state == State.InEndTag)
            {
                _html = _html.Substring(0, _currentEndTagStartIndex);
            }

            var sb = new StringBuilder(_html);
            while (_openTags.Count > 0)
            {
                sb.Append("</").Append(_openTags.Pop()).Append('>');
            }
            return sb.ToString();
        }

        // Text content: look for the start of a start tag or an end tag.
        private void ScanText(char ch)
        {
            if (ch != '<')
            {
                return;
            }

            if (PeekNext() == '/')
            {
                // End tags are recognised even when nothing is open, so a
                // stray end tag is never mistaken for a start tag.
                _currentEndTagStartIndex = _currentCharIndex;
                ReadNext();
                _state = State.InEndTag;
                return;
            }

            _pendingTagNameStartIndex = _currentCharIndex + 1;
            _state = State.InTagName;
        }

        // Tag name: ends at whitespace, '>' or the self-closing "/>".
        private void ScanTagName(char ch)
        {
            if (ch == '/' && PeekNext() == '>')
            {
                ReadNext();
                _state = TextState();
                return;
            }

            if (ch == '>')
            {
                CapturePendingTagName();
                OpenPendingTag();
                return;
            }

            if (char.IsWhiteSpace(ch))
            {
                CapturePendingTagName();
                _attributeQuote = '\0';
                _state = State.InAttributes;
            }
        }

        // Attribute list: ends at '>' or "/>" that is outside a quoted value.
        private void ScanAttributes(char ch)
        {
            if (_attributeQuote != '\0')
            {
                if (ch == _attributeQuote)
                {
                    _attributeQuote = '\0';
                }
                return;
            }

            // Only a quote directly after '=' opens a value; this keeps an
            // apostrophe elsewhere in the tag from swallowing the rest.
            if ((ch == '"' || ch == '\'') && _html[_currentCharIndex - 1] == '=')
            {
                _attributeQuote = ch;
                return;
            }

            if (ch == '/' && PeekNext() == '>')
            {
                ReadNext();
                _state = TextState();
                return;
            }

            if (ch == '>')
            {
                OpenPendingTag();
            }
        }

        private void CapturePendingTagName()
        {
            _pendingTagName = _html.Substring(
                _pendingTagNameStartIndex,
                _currentCharIndex - _pendingTagNameStartIndex);
        }

        // Called on the '>' that completes a start tag.
        private void OpenPendingTag()
        {
            if (NeedsEndTag(_pendingTagName))
            {
                _openTags.Push(_pendingTagName);
            }
            _state = TextState();
        }

        // Called on the '>' that completes an end tag. Pops up to and
        // including the matching open element; an end tag that matches
        // nothing on the stack is ignored.
        private void CloseEndTag()
        {
            var nameStart = _currentEndTagStartIndex + 2;
            var name = _html.Substring(nameStart, _currentCharIndex - nameStart).Trim();
            if (!IsOpen(name))
            {
                return;
            }

            while (_openTags.Count > 0)
            {
                var popped = _openTags.Pop();
                if (string.Equals(popped, name, StringComparison.OrdinalIgnoreCase))
                {
                    break;
                }
            }
        }

        private bool IsOpen(string name)
        {
            foreach (var open in _openTags)
            {
                if (string.Equals(open, name, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }

        // Empty names, declarations ("!..."), processing instructions
        // ("?...") and void elements are never closed.
        private static bool NeedsEndTag(string name)
        {
            if (string.IsNullOrEmpty(name) || name[0] == '!' || name[0] == '?')
            {
                return false;
            }
            return !VoidElements.Contains(name);
        }

        private State TextState()
        {
            return _openTags.Count > 0 ? State.InsideTag : State.None;
        }

        private char ReadNext()
        {
            return Eof ? '\0' : _html[++_currentCharIndex];
        }

        private char PeekNext()
        {
            return Eof ? '\0' : _html[_currentCharIndex + 1];
        }

        private enum State
        {
            None, InTagName, InAttributes, InsideTag, InEndTag
        }
    }
}
/// <summary>
/// Console demo that prints the result of CloseTags for a few truncated fragments.
/// </summary>
class Program
{
    static void Main(string[] args)
    {
        var samples = new[]
        {
            // Missing end tag -> will be closed
            "<div>Hello World",
            "<div>Hello, <b>World",
            // Broken end tag -> will be closed
            "<div>Hello World</di",
            "<div>Hello, <b>World</",
            // Broken start tag -> will be ignored
            "<div>Hello World. <span"
        };

        foreach (var sample in samples)
        {
            Console.WriteLine(sample.CloseTags());
        }

        // Keep the console window open until the user presses a key.
        Console.WriteLine("Press any key to continue...");
        Console.ReadKey();
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi
Great code piece thanks :)
It looks like line 43 needs to be changed as shown
below:
if (_state == State.InTagName || _state == State.InAttributes)
I had an HTML snippet like the one below:
<P>We’re looking forward to seeing you at the ggg gg gg gg, please <A class=debt href="mailto: engage@ggg.ggg.gg?subject=gg%gg"><STRONG><B><U>engage@gg.ggg.gg</B></U></STRONG></A> if you have any questions.</P>|XXXX|<P>Happy paddling!</P>|XXXX|<P>ggg hhh dd& Sport</P>|XXXX|<P>hh hhh | Assistant Events Coordinator | dd& Sport</P>|XXXX|<P>hh 5,hhY h | gg Point | 2 gg Street | dfgdfg gdfg 4001 ||XXXX|e: s17.dfgdfg@fgdfg.gg.gg</P><P><BR>|XXXX||XXXX||XXXX||XXXX|<P><STRONG><B><input type="button" value="Yes, I have read the Guide to gg. Please close page." onclick="self.close()"></B></STRONG></P><BR><BR>