Skip to content

Instantly share code, notes, and snippets.

@q42jaap
Created April 18, 2012 13:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save q42jaap/2413598 to your computer and use it in GitHub Desktop.
Save q42jaap/2413598 to your computer and use it in GitHub Desktop.
HtmlTruncator
This class truncates a string to a given number of characters, cutting on a word boundary whenever possible.
It can handle HTML tags and closes them correctly. The tags must be well-formed XML, so <br> will not work and should be written as <br/>; if that is a problem, you can use SgmlReader.
I personally think this is better than the regex-based versions of this that I have seen.
/// <summary>
/// Truncates text containing well-formed XML markup to a maximum number of characters,
/// preferring to cut on a space, and closes every element that is still open at the cut.
/// </summary>
/// <remarks>
/// Only text, whitespace and entity references (one character each) count towards the limit;
/// element and attribute markup does not. CDATA sections, comments, processing instructions,
/// XML declarations and document type nodes are not copied to the output.
/// </remarks>
public static class HtmlTruncator
{
    /// <summary>
    /// Truncates <paramref name="str"/> to at most <paramref name="maxLength"/> counted characters.
    /// </summary>
    /// <param name="str">Markup fragment; it is wrapped in a temporary root element and parsed as XML,
    /// so it must be well-formed (for example &lt;br/&gt; rather than &lt;br&gt;).</param>
    /// <param name="maxLength">Maximum number of counted characters to keep. Values below zero leave no room for any character.</param>
    /// <param name="ellipses">Text appended after the markup when something was cut off.</param>
    /// <returns>The truncated markup, followed by <paramref name="ellipses"/> if truncation happened.</returns>
    /// <exception cref="XmlException">The input is not well-formed XML.</exception>
    public static string LimitOnWordBoundary(string str, int maxLength, string ellipses = "...")
    {
        XmlDocument doc = new XmlDocument();
        XmlParserContext context = new XmlParserContext(doc.NameTable, new XmlNamespaceManager(doc.NameTable), null, XmlSpace.Preserve);
        bool shouldWriteEllipses;

        // The synthetic <xml> root lets a fragment with several top-level nodes be parsed as one document.
        using (XmlTextReader reader = new XmlTextReader("<xml>" + str + "</xml>", XmlNodeType.Document, context))
        using (XmlWriter writer = doc.CreateNavigator().AppendChild())
        {
            LimitOnWordBoundary(writer, reader, maxLength, out shouldWriteEllipses);
            writer.Flush();
        }

        // InnerXml drops the synthetic root again.
        return doc.DocumentElement.InnerXml + (shouldWriteEllipses ? ellipses : "");
    }

    /// <summary>
    /// Copies nodes from <paramref name="reader"/> to <paramref name="writer"/> until the next
    /// text, whitespace or entity reference would exceed <paramref name="maxLength"/> counted
    /// characters, then closes all elements that are still open.
    /// </summary>
    /// <param name="writer">Destination for the copied nodes.</param>
    /// <param name="reader">Source of nodes; reading starts at its current position.</param>
    /// <param name="maxLength">Maximum number of counted characters to write.</param>
    /// <param name="shouldWriteEllipses">Set to true when content was cut off, false otherwise.</param>
    /// <exception cref="ArgumentNullException"><paramref name="reader"/> or <paramref name="writer"/> is null.</exception>
    public static void LimitOnWordBoundary(XmlWriter writer, XmlReader reader, int maxLength, out bool shouldWriteEllipses)
    {
        if (reader == null)
        {
            throw new ArgumentNullException("reader");
        }
        if (writer == null)
        {
            throw new ArgumentNullException("writer");
        }

        int elementCount = 0;   // elements opened on the writer and not yet closed
        int currentLength = 0;  // counted characters written so far
        shouldWriteEllipses = false;

        // Below this many characters of output a hard cut inside a word is preferred over
        // backing up to a word boundary, so the result does not end up (nearly) empty.
        int magicMinimumLength = Math.Min(5, (maxLength + 1) / 2);
        int startDepth = (reader.NodeType == XmlNodeType.None) ? -1 : reader.Depth;

        do
        {
            bool done = false;
            switch (reader.NodeType)
            {
                case XmlNodeType.Element:
                    elementCount++;
                    writer.WriteStartElement(reader.Prefix, reader.LocalName, reader.NamespaceURI);
                    writer.WriteAttributes(reader, false);
                    if (reader.IsEmptyElement)
                    {
                        // Self-closing elements produce no EndElement node, so close them right away.
                        elementCount--;
                        writer.WriteEndElement();
                    }
                    break;

                case XmlNodeType.Text:
                    string value = reader.Value;
                    if (currentLength + value.Length > maxLength)
                    {
                        value = TruncateText(value, currentLength, maxLength, magicMinimumLength);
                        shouldWriteEllipses = true;
                        done = true;
                    }
                    if (value != null)
                    {
                        writer.WriteString(value);
                        currentLength += value.Length;
                    }
                    break;

                case XmlNodeType.Whitespace:
                case XmlNodeType.SignificantWhitespace:
                    string whitespace = reader.Value;
                    if (currentLength + whitespace.Length > maxLength)
                    {
                        // Whitespace counts towards the limit, so it must also be able to end the
                        // output; otherwise currentLength could overshoot maxLength.
                        shouldWriteEllipses = true;
                        done = true;
                    }
                    else
                    {
                        writer.WriteString(whitespace);
                        currentLength += whitespace.Length;
                    }
                    break;

                case XmlNodeType.EndElement:
                    elementCount--;
                    writer.WriteFullEndElement();
                    break;

                case XmlNodeType.EntityReference:
                    // An unexpanded entity reference is counted as a single character.
                    if (currentLength + 1 > maxLength)
                    {
                        shouldWriteEllipses = true;
                        done = true;
                    }
                    else
                    {
                        writer.WriteEntityRef(reader.Name);
                        currentLength++;
                    }
                    break;

                case XmlNodeType.CDATA:
                case XmlNodeType.ProcessingInstruction:
                case XmlNodeType.XmlDeclaration:
                case XmlNodeType.Comment:
                case XmlNodeType.DocumentType:
                    // Not copied to the output and not counted.
                    break;
            }

            if (done)
            {
                break;
            }
        }
        while (reader.Read() && ((startDepth < reader.Depth) || ((startDepth == reader.Depth) && (reader.NodeType == XmlNodeType.EndElement))));

        // Close whatever was still open when the limit was reached.
        while (elementCount > 0)
        {
            writer.WriteFullEndElement();
            elementCount--;
        }
    }

    /// <summary>
    /// Decides how much of a text node that does not fit is kept.
    /// Returns null when the whole node should be dropped.
    /// </summary>
    private static string TruncateText(string value, int currentLength, int maxLength, int magicMinimumLength)
    {
        // Clamped so a negative limit can never produce a negative substring length.
        int remaining = Math.Max(0, maxLength - currentLength);

        // Look one character past the limit so a space sitting exactly at the limit still counts as a boundary.
        string window = value.Substring(0, Math.Min(value.Length, remaining + 1));
        int lastSpace = window.LastIndexOf(' ');

        if (lastSpace < 0)
        {
            // No boundary in range: cut mid-word only if too little has been written so far.
            return currentLength < magicMinimumLength ? HardCut(value, remaining) : null;
        }
        if (lastSpace + currentLength < magicMinimumLength)
        {
            // Cutting at the boundary would leave too little text; cut mid-word instead.
            return HardCut(value, remaining);
        }
        return value.Substring(0, lastSpace);
    }

    /// <summary>
    /// Returns the first <paramref name="length"/> characters of <paramref name="value"/>,
    /// backing off by one when the cut would fall between the two halves of a surrogate pair.
    /// </summary>
    private static string HardCut(string value, int length)
    {
        if (length > 0 && length < value.Length
            && char.IsHighSurrogate(value[length - 1]) && char.IsLowSurrogate(value[length]))
        {
            length--;
        }
        return value.Substring(0, length);
    }
}
/// <summary>
/// NUnit tests pinning the truncation behaviour of HtmlTruncator.LimitOnWordBoundary
/// for plain text, text inside tags, and words too long to break on a space.
/// </summary>
[TestFixture] // nunit
public class HtmlTruncatorTests
{
    private const string Ellipses = "...";

    /// <summary>Truncates <paramref name="input"/> and compares the result with <paramref name="expected"/>.</summary>
    private static void AssertTruncated(string expected, string input, int maxLength)
    {
        string actual = HtmlTruncator.LimitOnWordBoundary(input, maxLength, ellipses: Ellipses);
        Assert.AreEqual(expected, actual);
    }

    // Input shorter than the limit comes back unchanged, without ellipses.
    [Test]
    public void StringWithoutTagsShouldNotBreakIfShorter()
    {
        AssertTruncated("aap noot mies", "aap noot mies", 100);
    }

    // Markup is preserved when nothing has to be cut.
    [Test]
    public void StringWithTagsShouldNotBreakIfShorter()
    {
        AssertTruncated("aap <span>noot</span> mies", "aap <span>noot</span> mies", 100);
    }

    // Spaces count towards the limit: 13 characters fit exactly, 12 do not.
    [Test]
    public void StringWithoutTagsCountsSpaces()
    {
        const string input = "aap noot mies";

        AssertTruncated("aap noot mies", input, 13);
        AssertTruncated("aap noot...", input, 12);
    }

    // A cut inside an element closes that element before the ellipses.
    [Test]
    public void StringWithSplitInsideTag()
    {
        AssertTruncated("<span>aap</span>...", "<span>aap noot mies</span>", 6);
    }

    // Every open ancestor is closed when the cut happens inside nested elements.
    [Test]
    public void StringWithNestedTags()
    {
        AssertTruncated("<span><b>aap</b></span>...", "<span><b>aap noot</b> mies</span>", 6);
    }

    // Whitespace between two elements counts towards the limit as well.
    [Test]
    public void StringWithTwoTagsSeperatedBySpaceCountsSpaces()
    {
        const string input = "<span>aap</span> <span>noot</span> mies";

        AssertTruncated("<span>aap</span> <span>noot</span> mies", input, 13);
        AssertTruncated("<span>aap</span> <span>noot</span>...", input, 12);
    }

    // Elements after the cut point are dropped entirely.
    [Test]
    public void StringWithTwoTagsSplitAfterTheFirst()
    {
        AssertTruncated("<span>aap</span> noot...", "<span>aap</span> noot mies <span>wim</span>", 11);
    }

    // Plain text is cut at the last space that fits.
    [Test]
    public void StringWithoutTagsShouldBreakCorrectly()
    {
        AssertTruncated("aap...", "aap noot mies", 6);
    }

    // A single word longer than the limit is cut mid-word.
    [Test]
    public void LongStringShouldJustTruncate()
    {
        AssertTruncated("aapnoo...", "aapnootmies", 6);
    }

    // With enough text already written, an unbreakable word that does not fit is dropped.
    [Test]
    public void LongStringWithoutSpaceAfterTagWithEnoughBeforeShouldIgnore()
    {
        AssertTruncated("<span>aap noot</span>...", "<span>aap noot</span>aapnootmies", 9);
    }

    // Same as above, with a leading space before the unbreakable word.
    [Test]
    public void LongStringWithSpaceAfterTagWithEnoughBeforeShouldIgnore()
    {
        AssertTruncated("<span>aap noot</span>...", "<span>aap noot</span> aapnootmies", 9);
    }

    // With only one character written so far, the long word is cut mid-word instead of dropped.
    [Test]
    public void LongStringAfterTagWithJustOneCharBeforeShouldBreak()
    {
        AssertTruncated("<span>a</span> aapnoot...", "<span>a</span> aapnootmies", 9);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment