Skip to content

Instantly share code, notes, and snippets.

@xeoncross
Created March 8, 2013 17:28
Show Gist options
  • Star 14 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save xeoncross/5118201 to your computer and use it in GitHub Desktop.
Save xeoncross/5118201 to your computer and use it in GitHub Desktop.
Sanitize HTML using PHP and the DOMDocument
<?php
/**
 * Clean an HTML string by stripping every attribute and unwrapping every
 * element whose node name is not in the whitelist. The children of an
 * unwrapped element stay in its place (after being cleaned themselves).
 *
 * The function recurses on itself: given a string it parses the markup and
 * returns the cleaned HTML; given a DOMNode it cleans that node in place.
 *
 * Whitelisted elements are kept together with their content, so the
 * whitelist should only name elements whose content is harmless.
 *
 * NOTE(review): DOMDocument::loadHTML() falls back to ISO-8859-1 when the
 * markup declares no charset, so non-ASCII UTF-8 input may be mis-decoded --
 * confirm against the callers' input.
 *
 * @see https://github.com/alixaxel/phunction/blob/master/phunction/HTML.php
 * @param string|DOMNode $html      HTML to clean, or (on recursive calls) the node to clean in place
 * @param array          $whitelist Node names to keep, e.g. array('#text', 'p', 'b');
 *                                  '#text' must be listed or text nodes are removed too
 * @return string|DOMNode|null For string input: the cleaned HTML ('' when there is nothing
 *                             to clean, null when the parser rejects the input).
 *                             For node input: the detached node when it was unwrapped,
 *                             otherwise null.
 */
function clean_html($html, array $whitelist)
{
    if ($html instanceof DOMNode) {
        // Walk the children last-to-first: unwrapping child $i only shifts
        // the positions after $i, so the indexes still to be visited stay valid.
        for ($i = $html->childNodes->length - 1; $i >= 0; $i--) {
            clean_html($html->childNodes->item($i), $whitelist);
        }

        // Strict comparison so that non-string whitelist entries (0, true, ...)
        // can never match a node name by type juggling.
        if ( ! in_array($html->nodeName, $whitelist, true)) {
            // Move the (already cleaned) children into a fragment and put the
            // fragment where the disallowed node was.
            $fragment = $html->ownerDocument->createDocumentFragment();
            while ($html->firstChild !== null) {
                $fragment->appendChild($html->firstChild);
            }

            return $html->parentNode->replaceChild($fragment, $html);
        }

        // The node itself is allowed, but none of its attributes are.
        while ($html->hasAttributes()) {
            $html->removeAttributeNode($html->attributes->item(0));
        }

        return null;
    }

    $html = (string) $html;
    if ($html === '') {
        // loadHTML() refuses empty input (ValueError on PHP 8, warning before).
        return '';
    }

    // Collect parser warnings internally instead of emitting them, and give
    // the caller's libxml error-handling mode back when we are done.
    $previous = libxml_use_internal_errors(true);

    try {
        // loadHTML() is an instance method; the static call form is a fatal
        // error on PHP 8.
        $dom = new DOMDocument();
        if ( ! $dom->loadHTML($html)) {
            return null;
        }
        if ($dom->documentElement === null) {
            return '';
        }

        clean_html($dom->documentElement, $whitelist);

        // Drop the doctype and any <html>/<body> wrapper tags from the serialized output.
        return preg_replace('~<(?:!DOCTYPE|/?(?:html|body))[^>]*>\s*~i', '', $dom->saveHTML());
    } finally {
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }
}
<?php
// Node names that survive sanitizing; everything else is unwrapped.
$whitelist = [
    '#text',
    'h3', 'h4', 'h5', 'h6',
    'blockquote', 'q', 'p',
    'pre', 'code', // Code
    'ul', 'ol', 'li',
    'b', 'em', 'i', 'u', 'strike', 'sup', 'sub',
    // 'a' (href, title) and 'img' (src, alt, title) are left out on purpose:
    // clean_html() removes every attribute, so they could not keep theirs.
];

// Sample markup containing two XSS attempts (a javascript: link and a CSS expression).
$string = <<<'END'
<div id="hello">
Hello World!
<div>
<p><span>text</span> goes here</p>
<a href="javascript:alert(document.location);">XSS</a> and normal text.
<b style="width: expression(alert(document.location));">XSS</b> is bad.
</div>
<br>
</div>
END;

echo clean_html($string, $whitelist);

/* Output:
Hello World!
<p>text goes here</p>
XSS and normal text.
<b>XSS</b> is bad.
*/
@u01jmg3
Copy link

u01jmg3 commented Nov 11, 2016

Could the final part of the function be rewritten not to require a regex?

e.g.

$html = '';
// Serialize each direct child of the first <body> element and concatenate the results.
foreach ($dom->getElementsByTagName('body')->item(0)->childNodes as $node) {
    $html .= $node->ownerDocument->saveHTML($node);
}

@Marcin-J-PL
Copy link

You saved my day.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment