public
Last active

HtmlTruncator

  • Download Gist
HtmlTruncator.cs
C#
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136
/// <summary>
/// Truncates a fragment of well-formed (XML-parsable) markup to a maximum number of
/// text characters, preferring to cut on a space, and closes any elements that were
/// still open at the point of truncation.
/// </summary>
public static class HtmlTruncator
{
    /// <summary>
    /// Truncates <paramref name="str"/> so that its text content does not exceed
    /// <paramref name="maxLength"/> characters, and appends <paramref name="ellipses"/>
    /// when a text node had to be cut.
    /// </summary>
    /// <param name="str">Markup fragment; it is wrapped in a synthetic <c>&lt;xml&gt;</c> root before parsing, so it must be well-formed XML.</param>
    /// <param name="maxLength">Maximum number of text characters to keep. Markup itself is not counted.</param>
    /// <param name="ellipses">Suffix appended after the markup when truncation happened.</param>
    /// <returns>The (possibly truncated) markup, without the synthetic root element.</returns>
    public static string LimitOnWordBoundary(string str, int maxLength, string ellipses = "...")
    {
        XmlDocument doc = new XmlDocument();

        // XmlSpace.Preserve keeps whitespace between tags so it is reported and counted.
        XmlParserContext context = new XmlParserContext(doc.NameTable, new XmlNamespaceManager(doc.NameTable), null, XmlSpace.Preserve);
        bool shouldWriteEllipses;

        // Both the reader and the writer are disposable; release them deterministically.
        using (XmlTextReader reader = new XmlTextReader("<xml>" + str + "</xml>", XmlNodeType.Document, context))
        using (XmlWriter writer = doc.CreateNavigator().AppendChild())
        {
            LimitOnWordBoundary(writer, reader, maxLength, out shouldWriteEllipses);
            writer.Flush();
        }

        return doc.DocumentElement.InnerXml + (shouldWriteEllipses ? ellipses : "");
    }

    /// <summary>
    /// Copies nodes from <paramref name="reader"/> to <paramref name="writer"/> until the
    /// accumulated text length would exceed <paramref name="maxLength"/>, then stops and
    /// closes every element that is still open on the writer.
    /// </summary>
    /// <param name="writer">Destination for the copied nodes.</param>
    /// <param name="reader">Source of nodes; reading starts at the reader's current node.</param>
    /// <param name="maxLength">Maximum number of text characters to copy.</param>
    /// <param name="shouldWriteEllipses">Set to <c>true</c> when a text node was cut or dropped because of the limit.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> or <paramref name="writer"/> is <c>null</c>.</exception>
    public static void LimitOnWordBoundary(XmlWriter writer, XmlReader reader, int maxLength, out bool shouldWriteEllipses)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }
        if (writer == null)
        {
            throw new ArgumentNullException("writer");
        }

        // Number of elements opened on the writer that have not been closed yet.
        int elementCount = 0;

        // Number of text characters written so far (text, whitespace, one per entity reference).
        int currentLength = 0;
        shouldWriteEllipses = false;

        // Below this many characters we prefer a hard cut over returning (almost) nothing.
        int magicMinimumLength = Math.Min(5, (maxLength + 1) / 2);

        // Depth at which copying started; the loop ends once the reader leaves that subtree.
        int num = (reader.NodeType == XmlNodeType.None) ? -1 : reader.Depth;
        do
        {
            bool done = false;
            switch (reader.NodeType)
            {
                case XmlNodeType.Element:
                    elementCount++;
                    writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
                    writer.WriteAttributes(reader, false);
                    if (reader.IsEmptyElement)
                    {
                        // Self-closing element: no EndElement node will follow, so close it now.
                        elementCount--;
                        writer.WriteEndElement();
                    }
                    break;

                case XmlNodeType.Text:
                    string value = reader.Value;

                    // May be negative: whitespace and entity references are written
                    // without a limit check and can push currentLength past maxLength.
                    int remaining = maxLength - currentLength;

                    if (value.Length > remaining)
                    {
                        value = TruncateText(value, remaining, currentLength, magicMinimumLength);
                        shouldWriteEllipses = true;
                        done = true;
                    }
                    if (value != null)
                    {
                        writer.WriteString(value);
                        currentLength += value.Length;
                    }
                    break;

                case XmlNodeType.Whitespace:
                case XmlNodeType.SignificantWhitespace:
                    writer.WriteString(reader.Value);
                    currentLength += reader.Value.Length;
                    break;

                case XmlNodeType.EndElement:
                    elementCount--;
                    writer.WriteFullEndElement();
                    break;

                case XmlNodeType.EntityReference:
                    // An entity reference is counted as a single character.
                    writer.WriteEntityRef(reader.Name);
                    currentLength++;
                    break;

                // CDATA, processing instructions, XML declarations, comments and
                // document types are intentionally not copied to the output.
                case XmlNodeType.CDATA:
                case XmlNodeType.ProcessingInstruction:
                case XmlNodeType.XmlDeclaration:
                case XmlNodeType.Comment:
                case XmlNodeType.DocumentType:
                    break;
            }
            if (done)
            {
                break;
            }
        }
        while (reader.Read() && ((num < reader.Depth) || ((num == reader.Depth) && (reader.NodeType == XmlNodeType.EndElement))));

        // Close whatever was still open when we stopped early.
        while (elementCount > 0)
        {
            writer.WriteFullEndElement();
            elementCount--;
        }
    }

    /// <summary>
    /// Decides how much of a text node that does not fit should still be written.
    /// </summary>
    /// <param name="value">Text of the node; the caller guarantees <c>value.Length &gt; remaining</c>.</param>
    /// <param name="remaining">Characters still available; negative when the budget is already overspent.</param>
    /// <param name="currentLength">Characters written before this node.</param>
    /// <param name="minimumLength">Output length below which a hard cut is preferred over a word-boundary cut.</param>
    /// <returns>The part of <paramref name="value"/> to write, or <c>null</c> to write nothing.</returns>
    private static string TruncateText(string value, int remaining, int currentLength, int minimumLength)
    {
        if (remaining < 0)
        {
            // Budget already exhausted by earlier whitespace/entities: nothing more fits.
            // (Previously this path called Substring with a negative length and threw.)
            return null;
        }

        // Look one character past the limit so a space sitting exactly at the limit counts as a boundary.
        string almost = value.Substring(0, remaining + 1);
        int lastSpace = almost.LastIndexOf(' ');

        if (lastSpace < 0)
        {
            // No word boundary available: hard-cut only if we would otherwise output too little.
            return currentLength < minimumLength ? value.Substring(0, remaining) : null;
        }

        if (lastSpace + currentLength < minimumLength)
        {
            // Cutting at the space would leave too little text; cut mid-word instead.
            return value.Substring(0, remaining);
        }

        return value.Substring(0, lastSpace);
    }
}
HtmlTruncatorTest.cs
C#
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
/// <summary>
/// NUnit tests for <c>HtmlTruncator.LimitOnWordBoundary</c>, using the Dutch
/// sample words "aap noot mies" as input.
/// </summary>
[TestFixture] // nunit
public class HtmlTruncatorTests
{
    // Every test truncates with the same ellipses string.
    private static string Truncate(string html, int limit)
    {
        return HtmlTruncator.LimitOnWordBoundary(html, limit, ellipses: "...");
    }

    [Test]
    public void StringWithoutTagsShouldNotBreakIfShorter()
    {
        string actual = Truncate("aap noot mies", 100);

        Assert.AreEqual("aap noot mies", actual);
    }

    [Test]
    public void StringWithTagsShouldNotBreakIfShorter()
    {
        string actual = Truncate("aap <span>noot</span> mies", 100);

        Assert.AreEqual("aap <span>noot</span> mies", actual);
    }

    [Test]
    public void StringWithoutTagsCountsSpaces()
    {
        // Exactly fits at 13 characters ...
        string fits = Truncate("aap noot mies", 13);
        Assert.AreEqual("aap noot mies", fits);

        // ... and loses the last word at 12.
        string cut = Truncate("aap noot mies", 12);
        Assert.AreEqual("aap noot...", cut);
    }

    [Test]
    public void StringWithSplitInsideTag()
    {
        string actual = Truncate("<span>aap noot mies</span>", 6);

        Assert.AreEqual("<span>aap</span>...", actual);
    }

    [Test]
    public void StringWithNestedTags()
    {
        string actual = Truncate("<span><b>aap noot</b> mies</span>", 6);

        Assert.AreEqual("<span><b>aap</b></span>...", actual);
    }

    [Test]
    public void StringWithTwoTagsSeperatedBySpaceCountsSpaces()
    {
        // The space between the two spans counts towards the limit.
        string fits = Truncate("<span>aap</span> <span>noot</span> mies", 13);
        Assert.AreEqual("<span>aap</span> <span>noot</span> mies", fits);

        string cut = Truncate("<span>aap</span> <span>noot</span> mies", 12);
        Assert.AreEqual("<span>aap</span> <span>noot</span>...", cut);
    }

    [Test]
    public void StringWithTwoTagsSplitAfterTheFirst()
    {
        string actual = Truncate("<span>aap</span> noot mies <span>wim</span>", 11);

        Assert.AreEqual("<span>aap</span> noot...", actual);
    }

    [Test]
    public void StringWithoutTagsShouldBreakCorrectly()
    {
        string actual = Truncate("aap noot mies", 6);

        Assert.AreEqual("aap...", actual);
    }

    [Test]
    public void LongStringShouldJustTruncate()
    {
        // No space to break on, so the word is cut at the limit.
        string actual = Truncate("aapnootmies", 6);

        Assert.AreEqual("aapnoo...", actual);
    }

    [Test]
    public void LongStringWithoutSpaceAfterTagWithEnoughBeforeShouldIgnore()
    {
        string actual = Truncate("<span>aap noot</span>aapnootmies", 9);

        Assert.AreEqual("<span>aap noot</span>...", actual);
    }

    [Test]
    public void LongStringWithSpaceAfterTagWithEnoughBeforeShouldIgnore()
    {
        string actual = Truncate("<span>aap noot</span> aapnootmies", 9);

        Assert.AreEqual("<span>aap noot</span>...", actual);
    }

    [Test]
    public void LongStringAfterTagWithJustOneCharBeforeShouldBreak()
    {
        // Only one character precedes the long word, so it is cut mid-word.
        string actual = Truncate("<span>a</span> aapnootmies", 9);

        Assert.AreEqual("<span>a</span> aapnoot...", actual);
    }
}
description
1 2 3 4
This class truncates a string to a given number of characters, cutting only on word boundaries where possible.
It can handle HTML tags and closes them correctly. The tags must be well-formed XML, so <br> will not work and should be written as <br/>; if that is a problem, you can use SgmlReader.
 
I personally think this is better than the regex-based versions I have seen.

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.