Created
January 7, 2012 17:51
-
-
Save Scroller/1575452 to your computer and use it in GitHub Desktop.
Sanitizing HTML 'parser'
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Whitelist-based HTML sanitizer. Version: 3.2
 *
 * Approved tags are parsed and re-serialized from scratch (so only approved,
 * quoted attributes survive); everything else -- unapproved tags, tags with
 * unapproved attributes, mismatched or unclosed tags, comments and plain
 * text -- is passed through htmlspecialchars() and shown literally.
 */
class HtmlFilter
{
    /** Tag kinds reported by parseTag() in the 'type' key. */
    const OPENS = 1;
    const CLOSES = 2;
    const SINGLE = 3;

    /** @var string Regex alternation of approved tag names. */
    public $goodTagsRE = 'i|b|a|em|strong|hr|ul|ol|li|img';

    /** @var string[] Approved attribute names; any other attribute rejects the whole tag. */
    public $goodAttrs = array('href', 'src', 'class', 'alt', 'title');

    /** @var string Regex alternation of approved URL schemes (without the colon). */
    public $goodProtocolsRE = 'https?|s?ftp|mailto|torrent';

    /** @var string Prepended to link values that do not start with an approved scheme. */
    public $defaultProtocol = 'http://';

    /** @var string[] Attributes whose values are checked against the approved schemes. */
    public $linkAttrs = array('href', 'src');

    /** @var string Character encoding handed to htmlspecialchars(). */
    public $encoding = 'UTF-8';

    /** @var string[] Elements that never take a closing tag; always emitted in the "<tag />" form. */
    public $voidTags = array(
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'source', 'track', 'wbr',
    );

    /** @var int|null Stored by the constructor; not read anywhere inside this class. */
    public $wrapLength = NULL;

    /**
     * Every argument is optional; NULL keeps the corresponding default.
     *
     * @param string[]|null $goodTags      Approved tag names (joined into a regex alternation).
     * @param string[]|null $goodAttrs     Approved attribute names.
     * @param string[]|null $goodProtocols Approved URL schemes (joined into a regex alternation).
     * @param int|null      $wrapLength    Stored in $this->wrapLength.
     */
    public function __construct($goodTags = NULL, $goodAttrs = NULL, $goodProtocols = NULL, $wrapLength = NULL)
    {
        if (isset($goodTags)) {
            $this->goodTagsRE = implode('|', $goodTags);
        }
        if (isset($goodAttrs)) {
            $this->goodAttrs = $goodAttrs;
        }
        if (isset($goodProtocols)) {
            $this->goodProtocolsRE = implode('|', $goodProtocols);
        }
        if (isset($wrapLength)) {
            $this->wrapLength = $wrapLength;
        }
    }

    /**
     * preg_replace_callback() adapter used by clean(): escapes the whole match.
     *
     * @param string[] $htmls Match array; element 0 is the full match.
     * @return string
     */
    protected function escape($htmls)
    {
        return $this->escapeText($htmls[0]);
    }

    /**
     * Escapes HTML special characters (both quote styles) without
     * double-encoding entities that are already present.
     *
     * @param string $text
     * @return string
     */
    protected function escapeText($text)
    {
        return htmlspecialchars((string)$text, ENT_QUOTES, $this->encoding, FALSE);
    }

    /**
     * Sanitizes an HTML fragment.
     *
     * @param string $html Untrusted markup.
     * @return string Markup containing only approved, balanced tags; the rest is escaped.
     */
    public function clean($html)
    {
        $html = (string)$html;

        // Escape comments first so that tags inside them are never approved.
        $withoutComments = preg_replace_callback('#(<!--.*?-->)#s', array($this, 'escape'), $html);
        if ($withoutComments === NULL) {
            // PCRE failure (e.g. backtrack limit): fail closed by escaping everything.
            return $this->escapeText($html);
        }

        // Match approved tags in all possible forms.
        $tagOpenRE = "<(?:{$this->goodTagsRE})(?:\s.*?)?/?>"; // opening or empty
        $tagCloseRE = "</(?:{$this->goodTagsRE})\s*?>";       // closing
        $parts = preg_split("#($tagOpenRE|$tagCloseRE)#si", $withoutComments, -1, PREG_SPLIT_DELIM_CAPTURE);
        if ($parts === FALSE) {
            return $this->escapeText($withoutComments);
        }

        // The alternation must be grouped: without the group, "^" applies only
        // to the first scheme and ":" only to the last, so any URL merely
        // containing e.g. "ftp" would be accepted.
        $protocolRE = "#^(?:{$this->goodProtocolsRE}):#i";
        $allowedAttrs = array_map('strtolower', $this->goodAttrs);
        $linkAttrs = array_map('strtolower', $this->linkAttrs);

        $openTags = array(); // stack of approved, still-open tags
        foreach ($parts as $key => $part) {
            if ($key % 2 === 0) { // even - text, possibly with unapproved tags
                $parts[$key] = $this->escapeText($part);
                continue;
            }

            // odd - a tag whose name is approved
            $tagInfo = $this->parseTag($part);
            $worthyTag = TRUE;
            foreach ($tagInfo['attrs'] as $attrName => $attrValue) {
                // Numeric-looking names come back as int keys; compare as strings, strictly.
                if (!in_array((string)$attrName, $allowedAttrs, TRUE)) {
                    $worthyTag = FALSE;
                    break;
                }
                // If the attribute is a link, enforce an approved scheme.
                if (in_array((string)$attrName, $linkAttrs, TRUE) && !preg_match($protocolRE, $attrValue)) {
                    $tagInfo['attrs'][$attrName] = $this->defaultProtocol . $attrValue;
                }
            }

            // Only approved tags take part in balancing; a rejected opening tag
            // must not let its closing tag through.
            if ($worthyTag) {
                if ($tagInfo['type'] === self::OPENS) {
                    $openTags[] = array('name' => $tagInfo['name'], 'key' => $key);
                } elseif ($tagInfo['type'] === self::CLOSES) {
                    $open = end($openTags); // FALSE when nothing is open
                    if ($open !== FALSE && $open['name'] === $tagInfo['name']) {
                        array_pop($openTags); // this tag closes the innermost open one
                    } else {
                        $worthyTag = FALSE; // stray or out-of-order closing tag
                    }
                }
            }

            $parts[$key] = $worthyTag ? $this->unparseTag($tagInfo) : $this->escapeText($part);
        }

        // Escape every tag that was opened but never closed.
        foreach ($openTags as $open) {
            $parts[$open['key']] = $this->escapeText($parts[$open['key']]);
        }

        return implode('', $parts);
    }

    /**
     * Serializes a parsed tag back to markup. Only attribute values are
     * escaped here; everything else is handled in clean().
     *
     * @param array $tagInfo Structure produced by parseTag(): 'type', 'name', 'attrs'.
     * @return string
     */
    public function unparseTag($tagInfo)
    {
        $attrs = '';
        foreach ($tagInfo['attrs'] as $name => $value) {
            $attrs .= ' ' . $name . '="' . $this->escapeText($value) . '"';
        }

        if ($tagInfo['type'] == self::OPENS) {
            return "<{$tagInfo['name']}{$attrs}>";
        }
        if ($tagInfo['type'] == self::CLOSES) {
            return "</{$tagInfo['name']}>";
        }
        return "<{$tagInfo['name']}{$attrs} />";
    }

    /**
     * Splits a single tag string into its kind, lower-cased name and quoted attributes.
     * Unquoted or malformed attributes are not captured and are therefore
     * dropped when the tag is re-serialized.
     *
     * @param string $tagStr A complete tag, e.g. '<a href="x">' or '</a>'.
     * @return array array('type' => self::OPENS|CLOSES|SINGLE, 'name' => string, 'attrs' => array(name => value))
     */
    public function parseTag($tagStr)
    {
        $tagStr = (string)$tagStr;
        $tagInfo = array('type' => self::OPENS, 'name' => '', 'attrs' => array());

        $length = strlen($tagStr);
        if ($length > 1 && $tagStr[1] === '/') {
            $tagInfo['type'] = self::CLOSES;
        } elseif ($length > 2 && $tagStr[$length - 2] === '/') {
            $tagInfo['type'] = self::SINGLE;
        }

        $matches = array();
        if (preg_match('#^</?([\w:\-]+)#', $tagStr, $matches)) {
            // Tags are matched case-insensitively, so normalise the name to
            // let "<B>" be closed by "</b>".
            $tagInfo['name'] = strtolower($matches[1]);
        }

        // "<hr>" / "<img ...>" written without the trailing slash are still void.
        if ($tagInfo['type'] === self::OPENS && in_array($tagInfo['name'], $this->voidTags, TRUE)) {
            $tagInfo['type'] = self::SINGLE;
        }

        if ($tagInfo['type'] !== self::CLOSES) {
            // name, optional whitespace around "=", then a single- or double-quoted value
            $attrsRE = '#\s+([\w:\-]+)\s*=\s*(["\'])(.*?)\2#';
            $offset = strlen($tagInfo['name']) + 1; // skip "<name"
            preg_match_all($attrsRE, $tagStr, $matches, PREG_SET_ORDER, $offset);
            foreach ($matches as $match) {
                $tagInfo['attrs'][strtolower($match[1])] = $match[3];
            }
        }

        return $tagInfo;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment