Skip to content

Instantly share code, notes, and snippets.

@Scroller
Created January 7, 2012 17:51
Show Gist options
  • Save Scroller/1575452 to your computer and use it in GitHub Desktop.
Save Scroller/1575452 to your computer and use it in GitHub Desktop.
Sanitizing HTML 'parser'
<?php
/**
 * Whitelist-based HTML sanitizer.
 *
 * The input is split into approved tags (see $goodTagsRE) and everything else.
 * Everything else is HTML-escaped. Approved tags are re-serialized from their
 * parsed attributes, and are escaped as well when they carry an attribute that
 * is not whitelisted or when they are not properly balanced.
 */
class HtmlFilter{ //Version: 3.1
	const OPENS = 1;  //opening tag, e.g. <b>
	const CLOSES = 2; //closing tag, e.g. </b>
	const SINGLE = 3; //self-closing tag, e.g. <hr />

	public $goodTagsRE = 'i|b|a|em|strong|hr|ul|ol|li|img'; //regex alternation of allowed tag names
	public $goodAttrs = array('href', 'src', 'class', 'alt', 'title'); //allowed attribute names
	public $goodProtocolsRE = 'https?|s?ftp|mailto|torrent'; //regex alternation of allowed URL schemes
	public $defaultProtocol = 'http://'; //prepended to URLs without above protocols
	public $linkAttrs = array('href', 'src'); //these are checked for protocols
	public $encoding = 'UTF-8'; //used for htmlspecialchars
	public $wrapLength = NULL; //stored by the constructor; not read anywhere in this class

	/**
	 * @param array|null $goodTags      allowed tag names (joined with '|' into $goodTagsRE)
	 * @param array|null $goodAttrs     allowed attribute names
	 * @param array|null $goodProtocols allowed URL schemes (joined with '|' into $goodProtocolsRE)
	 * @param int|null   $wrapLength    stored in $wrapLength
	 */
	public function __construct($goodTags = NULL, $goodAttrs = NULL, $goodProtocols = NULL, $wrapLength = NULL){
		if (isset($goodTags)) { $this->goodTagsRE = join('|', $goodTags); }
		if (isset($goodAttrs)) { $this->goodAttrs = $goodAttrs; }
		if (isset($goodProtocols)) { $this->goodProtocolsRE = join('|', $goodProtocols); }
		if (isset($wrapLength)) { $this->wrapLength = $wrapLength; }
	}

	/**
	 * preg_replace_callback adapter used by clean(): escapes the whole match.
	 *
	 * @param array $htmls regex match array; only element 0 is used
	 * @return string
	 */
	protected function escape($htmls){
		return htmlspecialchars($htmls[0], ENT_QUOTES, $this->encoding, FALSE);
	}

	/**
	 * Sanitizes an HTML fragment.
	 *
	 * @param string $html untrusted HTML
	 * @return string HTML in which only approved, balanced tags with approved
	 *                attributes remain active; everything else is escaped
	 */
	public function clean($html){
		$html = (string)$html;
		$escapedComments = preg_replace_callback('#(<!--.*?-->)#s', array($this, 'escape'), $html); //escape comments
		if ($escapedComments === NULL) {
			//PCRE failure (e.g. backtrack limit): fail closed by escaping everything
			return htmlspecialchars($html, ENT_QUOTES, $this->encoding, FALSE);
		}
		$html = $escapedComments;

		//match approved tags in all possible forms
		$tagOpenRE = "<(?:{$this->goodTagsRE})(?:\s.*?)?/?>"; //opening or empty
		$tagCloseRE = "</(?:{$this->goodTagsRE})\s*?>"; //closing
		$parts = preg_split("#($tagOpenRE|$tagCloseRE)#si", $html, -1, PREG_SPLIT_DELIM_CAPTURE);
		if ($parts === FALSE) {
			return htmlspecialchars($html, ENT_QUOTES, $this->encoding, FALSE);
		}

		//the alternation must be grouped, otherwise '^' binds only to the first
		//scheme and ':' only to the last one, letting e.g. "javascript:...ftp..." through
		$protocolRE = "#^(?:{$this->goodProtocolsRE}):#i";

		$openTags = array(); //a stack to find mismatched tags
		foreach ($parts as $key => $part) {
			if ($key % 2 == 0) { //even - text, possibly with bad tags
				$parts[$key] = htmlspecialchars($part, ENT_QUOTES, $this->encoding, FALSE);
				continue;
			}

			//odd - an approved tag
			$tagInfo = $this->parseTag($part);
			$worthyTag = TRUE;
			foreach ($tagInfo['attrs'] as $attrName => $attrValue) {
				$attrName = (string)$attrName; //numeric names become int keys; compare as strings
				if (!in_array($attrName, $this->goodAttrs, TRUE)) {
					$worthyTag = FALSE;
					break;
				}
				//if attribute is a link, enforce a good protocol
				if (in_array($attrName, $this->linkAttrs, TRUE) && !preg_match($protocolRE, $attrValue)) {
					$tagInfo['attrs'][$attrName] = $this->defaultProtocol.$attrValue;
				}
			}

			if ($worthyTag && $tagInfo['type'] == self::CLOSES) {
				$open = end($openTags); //FALSE when nothing is open
				//tags are matched case-insensitively by the split above, so compare likewise
				if ($open !== FALSE && strcasecmp($open['name'], $tagInfo['name']) === 0) {
					array_pop($openTags); //this tag closes an open one
				} else {
					$worthyTag = FALSE; //we expect another tag (or none at all)
				}
			}

			if ($worthyTag) {
				if ($tagInfo['type'] == self::OPENS) {
					//only tags that are actually emitted may later be closed
					$openTags[] = array('name' => $tagInfo['name'], 'key' => $key);
				}
				$parts[$key] = $this->unparseTag($tagInfo);
			} else {
				$parts[$key] = htmlspecialchars($part, ENT_QUOTES, $this->encoding, FALSE);
			}
		}

		foreach ($openTags as $open) { //escape all unclosed tags
			$key = $open['key'];
			$parts[$key] = htmlspecialchars($parts[$key], ENT_QUOTES, $this->encoding, FALSE);
		}
		return join($parts);
	}

	/**
	 * Serializes a parsed tag back to markup.
	 * Escapes only attribute content, the rest is handled in clean().
	 *
	 * @param array $tagInfo structure as returned by parseTag()
	 * @return string
	 */
	public function unparseTag($tagInfo){
		$attrs = '';
		foreach ($tagInfo['attrs'] as $name => $value) {
			$value = htmlspecialchars($value, ENT_QUOTES, $this->encoding, FALSE);
			$attrs .= ' '.$name.'="'.$value.'"';
		}
		if ($tagInfo['type'] == self::OPENS) {
			return "<{$tagInfo['name']}$attrs>";
		} elseif ($tagInfo['type'] == self::CLOSES) {
			return "</{$tagInfo['name']}>";
		} else {
			return "<{$tagInfo['name']}$attrs />";
		}
	}

	/**
	 * Parses a single tag string such as '<a href="x">', '</a>' or '<hr />'.
	 *
	 * Only quoted attributes (name="value" or name='value') are recognized;
	 * anything else inside the tag is ignored.
	 *
	 * @param string $tagStr the tag including its angle brackets
	 * @return array array('type' => OPENS|CLOSES|SINGLE, 'name' => string, 'attrs' => array(name => value));
	 *               'name' stays empty if $tagStr does not start like a tag
	 */
	public function parseTag($tagStr){
		$tagInfo = array('type' => self::OPENS, 'name' => '', 'attrs' => array());
		$tagStr = (string)$tagStr;
		$length = strlen($tagStr);
		if ($length > 1 && $tagStr[1] == '/') {
			$tagInfo['type'] = self::CLOSES;
		} elseif ($length > 2 && $tagStr[$length - 2] == '/') {
			$tagInfo['type'] = self::SINGLE;
		}
		$matches = array();
		if (!preg_match('#^</?([\w\d_:\-]+)#', $tagStr, $matches)) {
			return $tagInfo; //not a tag at all
		}
		$tagInfo['name'] = $matches[1];
		if ($tagInfo['type'] != self::CLOSES) {
			//name, optional whitespace around '=', then a single- or double-quoted value
			$attrsRE = '#\s+([\w\d_:\-]+)\s*=\s*(["\'])(.*?)\2#s';
			$offset = strlen($tagInfo['name']) + 1; //skip '<' and the tag name
			preg_match_all($attrsRE, $tagStr, $matches, PREG_SET_ORDER, $offset);
			foreach ($matches as $match) {
				$tagInfo['attrs'][ $match[1] ] = $match[3];
			}
		}
		return $tagInfo;
	}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment