Created
October 10, 2019 15:38
-
-
Save barryhughes/e2b1d374eaa2ec434b2e914954f8335c to your computer and use it in GitHub Desktop.
Inelegant but handy and regex-free way of swapping out HTML links with a plain text equivalent.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Given a body of text, transforms HTML links to a comprehensible
 * plain text alternative.
 *
 * For example, given:
 *
 *     Today <a href="http://foo.bar">this important article</a> was published
 *     about <a class="baz" href="ftp://bar.baz" target="xyz">Jim</a>. Go read
 *     it now!
 *
 * The output will be:
 *
 *     Today this important article (http://foo.bar) was published about
 *     Jim (ftp://bar.baz). Go read it now!
 *
 * The examples are simplified for readability, so it's worth noting that
 * whitespace is not impacted and no other tags will be removed or modified,
 * including tags nested inside the link text (though it could certainly be
 * combined with the native strip_tags() function).
 *
 * This is a deliberately simple, regex-free scanner rather than an HTML
 * parser. Matching is case-sensitive and only recognises:
 *
 *   - an opening tag written as `<a` followed by whitespace (so `<abbr>`,
 *     `<area>`, `<article>` etc. are left alone);
 *   - a double-quoted `href="..."` attribute inside that opening tag
 *     (`data-href="..."` and similar are not mistaken for it);
 *   - a closing tag written exactly as `</a>`.
 *
 * Anchors without an href attribute, and links that are never closed, are
 * copied to the output unchanged.
 *
 * @param string $source Text that may contain HTML links.
 *
 * @return string The text with each recognised link replaced by "text (url)".
 */
function transform_links_to_plain_text( string $source ): string {
    $output = '';
    // Offset of the first byte of $source that has not yet been copied to $output.
    $cursor = 0;

    // Offsets are always compared against false (never tested for truthiness)
    // so that a link sitting at offset 0 is handled like any other.
    while ( false !== ( $start_of_link = strpos( $source, '<a', $cursor ) ) ) {
        $after_tag_name = $start_of_link + 2;

        // '<a' only opens an anchor that can carry an href when whitespace
        // follows it; trim() yields '' for a single whitespace character.
        $is_anchor = isset( $source[ $after_tag_name ] ) && '' === trim( $source[ $after_tag_name ] );

        $start_of_href = false;

        if ( $is_anchor ) {
            // Find an href attribute that is preceded by whitespace, skipping
            // lookalikes such as data-href="...".
            $start_of_href = strpos( $source, 'href="', $after_tag_name );

            while ( false !== $start_of_href && '' !== trim( $source[ $start_of_href - 1 ] ) ) {
                $start_of_href = strpos( $source, 'href="', $start_of_href + 1 );
            }

            // If the tag is closed before the href is reached, the href
            // belongs to something else further along in the text.
            $first_tag_end = strpos( $source, '>', $after_tag_name );

            if ( false !== $start_of_href && false !== $first_tag_end && $first_tag_end < $start_of_href ) {
                $start_of_href = false;
            }
        }

        // Not a link we can transform: copy the '<a' through and keep scanning.
        if ( false === $start_of_href ) {
            $output .= substr( $source, $cursor, $after_tag_name - $cursor );
            $cursor  = $after_tag_name;
            continue;
        }

        // 6 is the length of 'href="', so this is the first byte of the URL.
        $start_of_url = $start_of_href + 6;

        // The closing quote may immediately follow the opening one (href="").
        $end_of_href = strpos( $source, '"', $start_of_url );

        // Look for '>' only after the href value, so a '>' inside the URL is
        // not mistaken for the end of the opening tag.
        $end_of_opening_link_tag = false === $end_of_href
            ? false
            : strpos( $source, '>', $end_of_href );

        // Search for the literal closing tag rather than the next '<', so
        // markup nested inside the link text is preserved intact.
        $start_of_closing_link_tag = false === $end_of_opening_link_tag
            ? false
            : strpos( $source, '</a>', $end_of_opening_link_tag );

        // A missing quote, '>' or '</a>' means no complete link can follow
        // either, so the remainder is emitted untouched below.
        if ( false === $start_of_closing_link_tag ) {
            break;
        }

        $text = substr( $source, $end_of_opening_link_tag + 1, $start_of_closing_link_tag - $end_of_opening_link_tag - 1 );
        $url  = substr( $source, $start_of_url, $end_of_href - $start_of_url );

        // Emit everything before the link, then its plain text equivalent.
        $output .= substr( $source, $cursor, $start_of_link - $cursor ) . "{$text} ({$url})";

        // 4 is the length of '</a>'.
        $cursor = $start_of_closing_link_tag + 4;
    }

    // Whatever follows the last transformed link is copied through verbatim.
    return $output . substr( $source, $cursor );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment