-
-
Save dmsnell/11f7165f5c7cd3ca69e4b9d751ff093e to your computer and use it in GitHub Desktop.
Semantic-aware truncation of HTML with or without tags.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Returns a portion of an HTML document including up to a given number of code points.
 *
 * The limit is measured against decoded text: a character reference such as
 * `&amp;` counts as the character(s) it decodes to, and when the document
 * contains markup only `#text` tokens contribute to the count.
 *
 * The return value is always HTML. When a document without any tags has to be
 * cut, the kept text is re-escaped, so an encoded `&lt;` never comes back as a
 * raw `<`.
 *
 * Example (intended results):
 *
 *     'Just <a href="#">a link</a>' = truncate_html( 'Just <a href="#">a link</a> to content.', 11 );
 *     'Just <a href="#">a l</a>'    = truncate_html( 'Just <a href="#">a link</a> to content.', 8 );
 *     'Just'                        = truncate_html( 'Just <a href="#">a link</a> to content.', 4 );
 *
 * @param string $document        Input HTML document to truncate.
 * @param int    $codepoint_count Max code points in textContent of output.
 * @return string Portion of HTML document containing up to max code points, including original
 *                HTML formatting. Empty string when `$codepoint_count` is not a positive integer.
 */
function truncate_html( $document, $codepoint_count ) {
	// Invalid calls. Should these raise an error somewhere?
	if ( ! is_int( $codepoint_count ) || $codepoint_count < 1 ) {
		return '';
	}

	// A document can never hold more code points than it has bytes, so when
	// the byte length already fits within the limit the whole document fits
	// and no further processing is needed.
	if ( strlen( $document ) <= $codepoint_count ) {
		return $document;
	}

	$entity_flags = ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5;

	// Without a "<" there can be no tags, so the input is plain text with
	// optional character references.
	if ( ! str_contains( $document, '<' ) ) {
		$plaintext = html_entity_decode( $document, $entity_flags, 'UTF-8' );

		// Nothing needs cutting: hand back the input exactly as written.
		if ( mb_strlen( $plaintext, 'UTF-8' ) <= $codepoint_count ) {
			return $document;
		}

		// Cut on decoded code points, then escape again. Returning the decoded
		// text directly would turn "&lt;script&gt;" into a live "<script>".
		$truncated = mb_substr( $plaintext, 0, $codepoint_count, 'UTF-8' );
		return htmlspecialchars( $truncated, $entity_flags, 'UTF-8' );
	}

	$text_length = 0;

	// NOTE(review): WP_HTML_Processor is defined outside this file. Confirm that
	// direct construction is supported (WordPress core appears to expect the
	// static create_fragment() factory), and that get_node_name(),
	// get_node_text() and serializeUntil() exist on the build in use.
	$processor = new WP_HTML_Processor( $document );
	while ( $processor->next_token() ) {
		// Only text nodes count towards the limit.
		if ( '#text' !== $processor->get_node_name() ) {
			continue;
		}

		$text_chunk   = $processor->get_node_text();
		$chunk_length = mb_strlen( html_entity_decode( $text_chunk, $entity_flags, 'UTF-8' ), 'UTF-8' );

		if ( $text_length + $chunk_length >= $codepoint_count ) {
			// NOTE(review): the bookmark sits on the text token that reaches or
			// crosses the limit. Nothing here trims that token to the remaining
			// ( $codepoint_count - $text_length ) code points, so the partial
			// chunk shown in the docblock examples presumably has to come from
			// serializeUntil() - TODO confirm.
			$processor->set_bookmark( 'here' );
			return $processor->serializeUntil( 'here' );
		}

		$text_length += $chunk_length;
	}

	// By now the document fits within the requested limit,
	// otherwise it would have returned already.
	return $document;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment