Skip to content

Instantly share code, notes, and snippets.

@ufuk
Created October 6, 2021 13:13
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ufuk/badf3b18f099a54a20a6bb6e9f8d5f1a to your computer and use it in GitHub Desktop.
Save ufuk/badf3b18f099a54a20a6bb6e9f8d5f1a to your computer and use it in GitHub Desktop.
Utility method to convert HTML text to plain text while preserving newlines (using jsoup as main dependency)
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Static helper that converts HTML markup to plain text while keeping the line
 * breaks implied by {@code <p>}, {@code <br>}, {@code <li>} and {@code <h1>}-{@code <h6>} tags.
 *
 * <p>The class only holds static members and cannot be instantiated.
 */
public final class HtmlToTextUtils {

    /** Tag names after whose content a newline is emitted. */
    private static final Set<String> LINE_BREAKING_TAGS =
            Set.of("p", "br", "li", "h1", "h2", "h3", "h4", "h5", "h6");

    /** One or more consecutive space characters. */
    private static final Pattern MULTIPLE_SPACES = Pattern.compile(" +");

    /** Two or more consecutive newlines, i.e. one or more blank lines. */
    private static final Pattern MULTIPLE_NEWLINES = Pattern.compile("\n{2,}");

    private HtmlToTextUtils() {
        // Utility class: static access only.
    }

    /**
     * Converts the given HTML to plain text, preserving newlines.
     *
     * <p>After the text is extracted, the result is normalized: runs of spaces are
     * collapsed to a single space, every line is trimmed, and runs of consecutive
     * newlines are reduced to at most one blank line.
     *
     * @param htmlText the HTML to convert; {@code null} is treated as an empty string
     * @return the extracted plain text, trimmed; empty when the input is {@code null},
     *         empty or blank
     */
    public static String htmlToTextWhilePreservingNewlines(String htmlText) {
        final Document document = Jsoup.parse(StringUtils.trimToEmpty(htmlText));

        // One accumulator for the whole traversal, so text is not re-copied per nesting level.
        final StringBuilder rawText = new StringBuilder();
        appendTextWhilePreservingNewlines(document.body(), rawText);

        String plainText = rawText.toString().trim();
        plainText = MULTIPLE_SPACES.matcher(plainText).replaceAll(" ");
        // Trim each line individually: spaces next to a newline survive the collapse above.
        plainText = plainText.lines().map(String::trim).collect(Collectors.joining("\n"));
        return MULTIPLE_NEWLINES.matcher(plainText).replaceAll("\n\n");
    }

    /**
     * Appends the text of {@code node} and of all its descendants to {@code output},
     * in document order.
     *
     * @param node   the node to walk
     * @param output the accumulator receiving the text
     */
    private static void appendTextWhilePreservingNewlines(Node node, StringBuilder output) {
        if (node instanceof TextNode) {
            output.append(((TextNode) node).text());
        }
        for (Node childNode : node.childNodes()) {
            appendTextWhilePreservingNewlines(childNode, output);
        }
        // The newline goes after the element's children, so it terminates the element's own text.
        if (node instanceof Element && LINE_BREAKING_TAGS.contains(((Element) node).tagName())) {
            output.append("\n");
        }
    }
}
import org.junit.jupiter.api.Test;
import static org.assertj.core.api.Assertions.assertThat;
/**
 * Tests for {@link HtmlToTextUtils#htmlToTextWhilePreservingNewlines(String)}.
 */
class HtmlToTextUtilsTests {

    /** Runs the conversion under test on the given input. */
    private static String convert(String input) {
        return HtmlToTextUtils.htmlToTextWhilePreservingNewlines(input);
    }

    @Test
    void shouldCleanHtmlToTextWhilePreservingNewlines() {
        // given: headings, a list, empty paragraphs and inline formatting
        final String productHtml = "<h5>Description</h5><ul><li>%100 cotton</li><li>Flat pattern</li></ul><p><br></p><p><br></p><h5>Model Information</h5><p><strong>Chest:</strong> 90cm <strong>Waist:</strong> 60cm <strong>Width:</strong> 90cm <strong>Length:</strong> 180cm </p><p> The model is wearing <strong>L</strong> sized product.</p>";
        final String expectedText = "Description\n" +
                "%100 cotton\n" +
                "Flat pattern\n" +
                "\n" +
                "Model Information\n" +
                "Chest: 90cm Waist: 60cm Width: 90cm Length: 180cm\n" +
                "The model is wearing L sized product.";

        // when
        final String actualText = convert(productHtml);

        // then
        assertThat(actualText).isEqualTo(expectedText);
    }

    @Test
    void shouldCleanHtmlToTextWhilePreservingNewlinesWhenNullHtml() {
        final String nullHtml = null;

        final String actualText = convert(nullHtml);

        assertThat(actualText).isEqualTo("");
    }

    @Test
    void shouldCleanHtmlToTextWhilePreservingNewlinesWhenEmptyHtml() {
        final String emptyHtml = "";

        final String actualText = convert(emptyHtml);

        assertThat(actualText).isEqualTo("");
    }

    @Test
    void shouldCleanHtmlToTextWhilePreservingNewlinesWhenBlankHtml() {
        final String blankHtml = " ";

        final String actualText = convert(blankHtml);

        assertThat(actualText).isEqualTo("");
    }

    @Test
    void shouldCleanHtmlToTextWhilePreservingNewlinesWhenInvalidHtml() {
        // given: paragraph tags that are never closed
        final String malformedHtml = "<p>invalid<br>tags<p>";
        final String expectedText = "invalid\n" +
                "tags";

        final String actualText = convert(malformedHtml);

        assertThat(actualText).isEqualTo(expectedText);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment