barryhughes/plain-textify-html-links.php

## plain-textify-html-links.php
<?php
/**
 * Given a body of text, transforms HTML links to a comprehensible
 * plain text alternative.
 *
 * For example, given:
 *
 *     Today <a href="http://foo.bar">this important article</a> was published
 *     about <a class="baz" href="ftp://bar.baz" target="xyz">Jim</a>. Go read
 *     it now!
 *
 * The output will be:
 *
 *     Today this important article (http://foo.bar) was published about
 *     Jim (ftp://bar.baz). Go read it now!
 *
 * The examples are simplified for readability, so it's worth noting that
 * whitespace is not impacted and no other tags will be removed or modified
 * (though it could certainly be combined with the native strip_tags()
 * function). Markup nested inside a link is kept as part of the link text.
 *
 * Only complete links are transformed: an <a> tag without a quoted href
 * attribute, or without a closing </a>, is left exactly as it was found.
 * Tag and attribute names are matched case-insensitively and the href value
 * may be wrapped in either double or single quotes.
 *
 * This is still a quick, regex-free scanner rather than a real HTML parser:
 * the text "href=" appearing inside another attribute's value, or links
 * nested inside other links, are not handled specially.
 *
 * @param string $source Text which may contain HTML links.
 *
 * @return string The text with each complete link replaced by "text (url)".
 */
function transform_links_to_plain_text( string $source ): string {
	$whitespace    = " \t\r\n\f";
	$source_length = strlen( $source );
	$output        = '';

	// Offset of the first byte of $source not yet copied into $output.
	$copied_up_to = 0;

	// Offset from which to look for the next candidate link.
	$search_from = 0;

	while ( $search_from < $source_length ) {
		$start_of_link = stripos( $source, '<a', $search_from );

		// Compare against false explicitly: offset 0 is a valid match.
		if ( false === $start_of_link ) {
			break;
		}

		$search_from = $start_of_link + 2;

		// A link carrying an href must have whitespace after the tag name;
		// this also rejects <abbr>, <article>, <address> and friends.
		if ( 1 !== strspn( $source, $whitespace, $search_from, 1 ) ) {
			continue;
		}

		// Find the '>' closing the opening tag, ignoring any '>' that sits
		// inside a quoted attribute value.
		$end_of_opening_link_tag = false;
		$open_quote              = '';

		for ( $position = $search_from; $position < $source_length; $position++ ) {
			$character = $source[ $position ];

			if ( '' !== $open_quote ) {
				if ( $character === $open_quote ) {
					$open_quote = '';
				}
			} elseif ( '"' === $character || "'" === $character ) {
				$open_quote = $character;
			} elseif ( '>' === $character ) {
				$end_of_opening_link_tag = $position;
				break;
			}
		}

		if ( false === $end_of_opening_link_tag ) {
			// Unterminated tag; a later '<a' may still be well formed.
			continue;
		}

		$search_from = $end_of_opening_link_tag + 1;
		$opening_tag = substr( $source, $start_of_link, $end_of_opening_link_tag - $start_of_link + 1 );

		// Look for a quoted href attribute inside the opening tag only, so
		// that an "href=" appearing later in the text is never picked up.
		$url         = null;
		$href_search = 0;

		while ( false !== ( $start_of_href = stripos( $opening_tag, 'href=', $href_search ) ) ) {
			$href_search = $start_of_href + 5;
			$quote       = substr( $opening_tag, $href_search, 1 );

			// Must be a standalone attribute (not data-href= etc).
			if ( 1 !== strspn( $opening_tag, $whitespace, $start_of_href - 1, 1 ) ) {
				continue;
			}

			if ( '"' !== $quote && "'" !== $quote ) {
				continue;
			}

			$end_of_href = strpos( $opening_tag, $quote, $href_search + 1 );

			if ( false === $end_of_href ) {
				continue;
			}

			$url = substr( $opening_tag, $href_search + 1, $end_of_href - $href_search - 1 );
			break;
		}

		if ( null === $url ) {
			// An anchor without a usable href: leave it untouched.
			continue;
		}

		// Search for the literal closing tag rather than the next '<', so
		// markup nested in the link text does not end the link early.
		$start_of_closing_link_tag = stripos( $source, '</a>', $search_from );

		if ( false === $start_of_closing_link_tag ) {
			// No closing tag remains, so no later link can be complete either.
			break;
		}

		$text = substr( $source, $search_from, $start_of_closing_link_tag - $search_from );

		// Copy everything before the link verbatim, then the replacement.
		$output .= substr( $source, $copied_up_to, $start_of_link - $copied_up_to );
		$output .= "{$text} ({$url})";

		$copied_up_to = $start_of_closing_link_tag + 4;
		$search_from  = $copied_up_to;
	}

	// Append whatever follows the final transformed link.
	return $output . substr( $source, $copied_up_to );
}
	<?php
	/**
	 * Given a body of text, transforms HTML links to a comprehensible
	 * plain text alternative.
	 *
	 * For example, given:
	 *
	 *     Today <a href="http://foo.bar">this important article</a> was published
	 *     about <a class="baz" href="ftp://bar.baz" target="xyz">Jim</a>. Go read
	 *     it now!
	 *
	 * The output will be:
	 *
	 *     Today this important article (http://foo.bar) was published about
	 *     Jim (ftp://bar.baz). Go read it now!
	 *
	 * The examples are simplified for readability, so it's worth noting that
	 * whitespace is not impacted and no other tags will be removed or modified
	 * (though it could certainly be combined with the native strip_tags()
	 * function). Markup nested inside a link is kept as part of the link text.
	 *
	 * Only complete links are transformed: an <a> tag without a quoted href
	 * attribute, or without a closing </a>, is left exactly as it was found.
	 * Tag and attribute names are matched case-insensitively and the href value
	 * may be wrapped in either double or single quotes.
	 *
	 * This is still a quick, regex-free scanner rather than a real HTML parser:
	 * the text "href=" appearing inside another attribute's value, or links
	 * nested inside other links, are not handled specially.
	 *
	 * @param string $source Text which may contain HTML links.
	 *
	 * @return string The text with each complete link replaced by "text (url)".
	 */
	function transform_links_to_plain_text( string $source ): string {
		$whitespace    = " \t\r\n\f";
		$source_length = strlen( $source );
		$output        = '';

		// Offset of the first byte of $source not yet copied into $output.
		$copied_up_to = 0;

		// Offset from which to look for the next candidate link.
		$search_from = 0;

		while ( $search_from < $source_length ) {
			$start_of_link = stripos( $source, '<a', $search_from );

			// Compare against false explicitly: offset 0 is a valid match.
			if ( false === $start_of_link ) {
				break;
			}

			$search_from = $start_of_link + 2;

			// A link carrying an href must have whitespace after the tag name;
			// this also rejects <abbr>, <article>, <address> and friends.
			if ( 1 !== strspn( $source, $whitespace, $search_from, 1 ) ) {
				continue;
			}

			// Find the '>' closing the opening tag, ignoring any '>' that sits
			// inside a quoted attribute value.
			$end_of_opening_link_tag = false;
			$open_quote              = '';

			for ( $position = $search_from; $position < $source_length; $position++ ) {
				$character = $source[ $position ];

				if ( '' !== $open_quote ) {
					if ( $character === $open_quote ) {
						$open_quote = '';
					}
				} elseif ( '"' === $character || "'" === $character ) {
					$open_quote = $character;
				} elseif ( '>' === $character ) {
					$end_of_opening_link_tag = $position;
					break;
				}
			}

			if ( false === $end_of_opening_link_tag ) {
				// Unterminated tag; a later '<a' may still be well formed.
				continue;
			}

			$search_from = $end_of_opening_link_tag + 1;
			$opening_tag = substr( $source, $start_of_link, $end_of_opening_link_tag - $start_of_link + 1 );

			// Look for a quoted href attribute inside the opening tag only, so
			// that an "href=" appearing later in the text is never picked up.
			$url         = null;
			$href_search = 0;

			while ( false !== ( $start_of_href = stripos( $opening_tag, 'href=', $href_search ) ) ) {
				$href_search = $start_of_href + 5;
				$quote       = substr( $opening_tag, $href_search, 1 );

				// Must be a standalone attribute (not data-href= etc).
				if ( 1 !== strspn( $opening_tag, $whitespace, $start_of_href - 1, 1 ) ) {
					continue;
				}

				if ( '"' !== $quote && "'" !== $quote ) {
					continue;
				}

				$end_of_href = strpos( $opening_tag, $quote, $href_search + 1 );

				if ( false === $end_of_href ) {
					continue;
				}

				$url = substr( $opening_tag, $href_search + 1, $end_of_href - $href_search - 1 );
				break;
			}

			if ( null === $url ) {
				// An anchor without a usable href: leave it untouched.
				continue;
			}

			// Search for the literal closing tag rather than the next '<', so
			// markup nested in the link text does not end the link early.
			$start_of_closing_link_tag = stripos( $source, '</a>', $search_from );

			if ( false === $start_of_closing_link_tag ) {
				// No closing tag remains, so no later link can be complete either.
				break;
			}

			$text = substr( $source, $search_from, $start_of_closing_link_tag - $search_from );

			// Copy everything before the link verbatim, then the replacement.
			$output .= substr( $source, $copied_up_to, $start_of_link - $copied_up_to );
			$output .= "{$text} ({$url})";

			$copied_up_to = $start_of_closing_link_tag + 4;
			$search_from  = $copied_up_to;
		}

		// Append whatever follows the final transformed link.
		return $output . substr( $source, $copied_up_to );
	}