Skip to content

Instantly share code, notes, and snippets.

@rdohms
Created June 30, 2015 09:32
Show Gist options
  • Save rdohms/73afce744f7e8b894174 to your computer and use it in GitHub Desktop.
Save rdohms/73afce744f7e8b894174 to your computer and use it in GitHub Desktop.
A simple HTML to email-friendly text converter
<?php
namespace Symbid\Library\Bundle\MessagingBundle\Twig;
use Twig_Extension;
class TextTransformer extends Twig_Extension
{
    /**
     * Returns the name of the extension.
     *
     * @return string The extension name
     */
    public function getName()
    {
        return 'symbid_messaging.twig.text_transformer';
    }

    /**
     * Registers the `textSafe` filter, which is backed by makeTextSafe().
     *
     * @return \Twig_SimpleFilter[]
     */
    public function getFilters()
    {
        return [
            new \Twig_SimpleFilter('textSafe', [$this, 'makeTextSafe'])
        ];
    }

    /**
     * Converts an HTML fragment into plain text.
     *
     * Rules, applied in this order:
     *  - the fragment is normalised through the DOM parser (purifyHTML);
     *  - <a href="url">label</a> becomes "label (url)";
     *  - <p>text</p> becomes "text" followed by a line break;
     *  - every remaining tag is stripped;
     *  - character references are decoded once, at the very end.
     *
     * While tags are still being replaced, the text that is put back into the HTML string is kept
     * HTML-escaped. Otherwise a decoded "&" or "<" would stop the next replacement from matching
     * and could be mistaken for markup by strip_tags().
     *
     * NOTE(review): no explicit charset hint is given to DOMDocument::loadHTML(); confirm how
     * non-ASCII (UTF-8) input is decoded by the libxml version in use.
     *
     * @param string $content HTML fragment
     * @return string Plain-text rendition of the fragment
     */
    public function makeTextSafe($content)
    {
        $content = (string) $content;

        // loadHTML() cannot parse an empty string, so there is nothing to transform.
        if (trim($content) === '') {
            return '';
        }

        // Keep the caller's libxml error mode so it can be restored instead of forced to false.
        $previousSetting = libxml_use_internal_errors(true);

        // Clean up
        $content = $this->purifyHTML($content);
        // a[href] => content (href)
        $content = $this->linksToText($content);
        // <p>text</p> => text <line break>
        $content = $this->paragraphsToText($content, true);
        // all else => strip
        $content = strip_tags($content);
        // Single decoding pass for everything that was kept escaped above.
        $content = html_entity_decode($content, ENT_QUOTES, 'UTF-8');

        // Drop the errors collected while parsing and hand the error mode back.
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        return $content;
    }

    /**
     * Replaces every link with "label (href)", or with just the label when there is no href.
     *
     * The replacement text is HTML-escaped because the returned string is still HTML that gets
     * parsed again afterwards.
     *
     * @param string $content HTML fragment
     * @return string HTML fragment with the links replaced by escaped text
     */
    protected function linksToText($content)
    {
        if (trim((string) $content) === '') {
            return '';
        }

        $dom = new \DOMDocument();
        $dom->loadHTML($content, LIBXML_NOERROR | LIBXML_NOENT);

        foreach ($dom->getElementsByTagName('a') as $tag) {
            /** @var \DOMElement $tag */
            $href = $tag->getAttribute('href');
            // Compare against '' rather than empty(): an href of "0" is a usable relative URL.
            $output = ($href !== '') ? "{$tag->textContent} ($href)" : $tag->textContent;
            $content = $this->replaceTagInContent($tag, $this->escapeText($output), $content);
        }

        return trim($content);
    }

    /**
     * Trim and Tidy's up HTML so that DOM Parser can handle it
     *
     * Allowing it to wrap with body/html allows it to properly handle segments of html with paragraphs without getting
     * lost.
     *
     * @param string $content
     * @return string
     */
    public function purifyHTML($content)
    {
        if (trim((string) $content) === '') {
            return '';
        }

        $dom = new \DOMDocument();
        $dom->loadHTML($content, LIBXML_HTML_NODEFDTD | LIBXML_NOENT);
        $html = $dom->saveHTML();

        // The "s" modifier lets "." match line breaks, so multi-line markup is unwrapped as well.
        $unwrapped = preg_replace('/<html><body>(.*)<\/body><\/html>/s', '$1', $html);

        // preg_replace() returns null on a PCRE error; fall back to the wrapped markup in that case.
        return trim($unwrapped === null ? $html : $unwrapped);
    }

    /**
     * Replaces all paragraphs with text and line breaks.
     *
     * @param string $content    HTML fragment
     * @param bool   $escapeText When true, the paragraph text is HTML-escaped before it is put back,
     *                           so the result can be decoded in one pass later on. Defaults to false,
     *                           which inserts the decoded text as-is.
     * @return string
     */
    public function paragraphsToText($content, $escapeText = false)
    {
        if (trim((string) $content) === '') {
            return '';
        }

        $dom = new \DOMDocument();
        $dom->loadHTML($content, LIBXML_NOERROR | LIBXML_NOENT);

        foreach ($dom->getElementsByTagName('p') as $tag) {
            $text = $escapeText ? $this->escapeText($tag->textContent) : $tag->textContent;
            $content = $this->replaceTagInContent($tag, "{$text}\n", $content);
        }

        return trim($content);
    }

    /**
     * Replaces the serialised form of $tag inside $content with $output.
     *
     * The tag is copied into a scratch document and serialised on its own; every occurrence of
     * that exact string in $content is then substituted. If the serialised form does not appear
     * in $content verbatim, $content is returned unchanged.
     *
     * @param \DOMElement $tag
     * @param string $output
     * @param string $content
     * @return string
     */
    protected function replaceTagInContent(\DOMElement $tag, $output, $content)
    {
        $tmpDoc = new \DOMDocument();
        $tmpDoc->appendChild($tmpDoc->importNode($tag, true));
        $htmlRepresentation = $tmpDoc->saveHTML();

        return str_replace(trim($htmlRepresentation), $output, $content);
    }

    /**
     * Escapes "&", "<" and ">" in text that is about to be placed back into an HTML string.
     *
     * Quotes are left untouched (ENT_NOQUOTES).
     * NOTE(review): this assumes the DOM serialiser also leaves quotes in text nodes unescaped,
     * which the string matching in replaceTagInContent() relies on - confirm.
     *
     * @param string $text Decoded text taken from the DOM
     * @return string
     */
    private function escapeText($text)
    {
        return htmlspecialchars($text, ENT_NOQUOTES, 'UTF-8');
    }
}
@rdohms
Copy link
Author

rdohms commented Jun 30, 2015

Obviously needed to point out here:

  • all HTML that feeds this is internally generated, so we have control of source
  • not security-aware, mostly focused on presentation
  • libxml is flaky as hell.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment