Created
June 24, 2015 17:05
-
-
Save mrubinsk/83b4d8fd0ffbf8ef9f6d to your computer and use it in GitHub Desktop.
Text_Filter_xss
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * This filter attempts to make HTML safe for viewing. IT IS NOT PERFECT. If
 * you enable HTML viewing, you are opening a security hole. With the current
 * state of the web, I believe that the best we can do is to make sure that
 * people *KNOW* HTML is a security hole, clean up what we can, and leave it
 * at that.
 *
 * $Horde: framework/Text_Filter/Filter/xss.php,v 1.18 2008/12/10 15:11:58 chuck Exp $
 *
 * Copyright 2004-2008 The Horde Project (http://www.horde.org/)
 *
 * See the enclosed file COPYING for license information (LGPL). If you
 * did not receive this file, see http://www.fsf.org/copyleft/lgpl.html.
 *
 * @author  Jan Schneider <jan@horde.org>
 * @since   Horde 3.1
 * @package Horde_Text
 */
class Text_Filter_xss extends Text_Filter {

    /**
     * Filter parameters.
     *
     * - 'body_only': strip everything outside of (and including) the
     *   <html> and <body> tags.
     * - 'replace': marker text inserted in place of neutralized constructs.
     * - 'strip_styles': remove HTML comments and comment out <style> and
     *   <link> tags.
     * - 'strip_style_attributes': additionally rename style="" attributes;
     *   only honoured when 'strip_styles' is enabled.
     *
     * @var array
     */
    var $_params = array('body_only' => true,
                         'replace' => 'XSSCleaned',
                         'strip_styles' => true,
                         'strip_style_attributes' => true);

    /**
     * The pcre.backtrack_limit value that was in effect before preProcess()
     * raised it, or null if preProcess() left the setting untouched.
     *
     * @var string
     */
    var $_backtrackLimit = null;

    /**
     * Returns a hash with replace patterns.
     *
     * The patterns are listed in the order in which they are added to the
     * hash: entity normalization first, then the attribute, script and tag
     * rules that rely on that normalization.
     *
     * @return array  Patterns hash; the 'regexp' key maps each PCRE pattern
     *                to its replacement string.
     */
    function getPatterns()
    {
        $patterns = array();

        /* Remove all control characters. */
        $patterns['/[\x00-\x08\x0e-\x1f]/'] = '';

        /* Removes HTML comments (including some scripts & styles). */
        if ($this->_params['strip_styles']) {
            $patterns['/<!--.*?-->/s'] = '';
        }

        /* Change space entities to space characters. */
        $patterns['/&#(?:x0*20|0*32);?/i'] = ' ';

        /* If we have a semicolon, it is deterministically detectable and
         * fixable, without introducing collateral damage. */
        $patterns['/&#x?0*(?:[9A-D]|1[0-3]);/i'] = ' ';

        /* Hex numbers (usually having an x prefix) are also deterministic,
         * even if we don't have the semi. Note that some browsers will treat
         * &#a or &#0a as a hex number even without the x prefix; hence /x?/
         * which will cover those cases in this rule. */
        $patterns['/&#x?0*[9A-D]([^0-9A-F]|$)/i'] = ' \\1';

        /* Decimal numbers without trailing semicolons. The problem is that
         * some browsers will interpret &#10a as "\na", some as "&#x10a;" so
         * we have to clean the &#10 to be safe for the "\na" case at the
         * expense of mangling a valid entity in other cases. (Solution for
         * valid HTML authors: always use the semicolon.) */
        $patterns['/&#0*(?:9|1[0-3])([^0-9]|$)/i'] = ' \\1';

        /* Remove overly long numeric entities. */
        $patterns['/&#x?0*[0-9A-F]{6,};?/i'] = ' ';

        /* Remove everything outside of and including the <html> and <body>
         * tags. */
        if ($this->_params['body_only']) {
            $patterns['/^.*<(?:body|html)[^>]*>/si'] = '';
            $patterns['/<\/(?:body|html)>.*$/si'] = '';
        }

        /* Get all attribute="javascript:foo()" tags. This is essentially the
         * regex /(=|url\()("?)[^>]*script:/ but expanded to catch camouflage
         * with spaces and entities.
         *
         * Each character is matched as the literal itself, its decimal
         * entity (&#NN), its hex entity (&#xHH) - upper and lower case
         * variants - and, where present, its backslash escape (\HH). */
        $preg = '/((=|&#0*61;?|&#x0*3D;?)|' .
            '((u|&#0*85;?|&#x0*55;?|&#0*117;?|&#x0*75;?|\\\\0*75)\s*' .
            '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?|\\\\0*72)\s*' .
            '(l|&#0*76;?|&#x0*4c;?|&#0*108;?|&#x0*6c;?|\\\\0*6c)\s*' .
            '(\(|\\\\0*28)))\s*' .
            '(\'|&#0*34;?|&#x0*22;?|"|&#0*39;?|&#x0*27;?)?' .
            '[^>]*\s*' .
            '(s|&#0*83;?|&#x0*53;?|&#0*115;?|&#x0*73;?|\\\\0*73)\s*' .
            '(c|&#0*67;?|&#x0*43;?|&#0*99;?|&#x0*63;?|\\\\0*63)\s*' .
            '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?|\\\\0*72)\s*' .
            '(i|&#0*73;?|&#x0*49;?|&#0*105;?|&#x0*69;?|\\\\0*69)\s*' .
            '(p|&#0*80;?|&#x0*50;?|&#0*112;?|&#x0*70;?|\\\\0*70)\s*' .
            '(t|&#0*84;?|&#x0*54;?|&#0*116;?|&#x0*74;?|\\\\0*74)\s*' .
            '(:|&#0*58;?|&#x0*3a;?|\\\\0*3a)/i';
        /* Group 1 is the "=" or "url(" prefix, group 8 the optional opening
         * quote; everything up to and including "script:" is replaced. */
        $patterns[$preg] = '\1\8' . $this->_params['replace'];

        /* Get all on<foo>="bar()". NEVER allow these. */
        $patterns['/([\s"\'\/]+' .
                  '(o|&#0*79;?|&#x0*4f;?|&#0*111;?|&#x0*6f;?)' .
                  '(n|&#0*78;?|&#x0*4e;?|&#0*110;?|&#x0*6e;?)' .
                  '\w+)[^=a-z0-9"\'>]*=/i'] = '\1' . $this->_params['replace'] . '=';

        /* Remove all scripts since they might introduce garbage if they are
         * not quoted properly. */
        $patterns['|<script[^>]*>.*?</script>|is'] = '<' . $this->_params['replace'] . '_script />';

        /* Get all tags that might cause trouble - <object>, <embed>,
         * <applet>, <base>, etc. Meta refreshes and iframes, too. */
        $malicious = array(
            /* <script */
            '/<([^>a-z]*)' .
            '(s|&#0*83;?|&#x0*53;?|&#0*115;?|&#x0*73;?)\s*' .
            '(c|&#0*67;?|&#x0*43;?|&#0*99;?|&#x0*63;?)\s*' .
            '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?)\s*' .
            '(i|&#0*73;?|&#x0*49;?|&#0*105;?|&#x0*69;?)\s*' .
            '(p|&#0*80;?|&#x0*50;?|&#0*112;?|&#x0*70;?)\s*' .
            '(t|&#0*84;?|&#x0*54;?|&#0*116;?|&#x0*74;?)\s*/i',

            /* <embed */
            '/<([^>a-z]*)' .
            '(e|&#0*69;?|&#x0*45;?|&#0*101;?|&#x0*65;?)\s*' .
            '(m|&#0*77;?|&#x0*4d;?|&#0*109;?|&#x0*6d;?)\s*' .
            '(b|&#0*66;?|&#x0*42;?|&#0*98;?|&#x0*62;?)\s*' .
            '(e|&#0*69;?|&#x0*45;?|&#0*101;?|&#x0*65;?)\s*' .
            '(d|&#0*68;?|&#x0*44;?|&#0*100;?|&#x0*64;?)\s*/i',

            /* <xml */
            '/<([^>a-z]*)' .
            '(x|&#0*88;?|&#x0*58;?|&#0*120;?|&#x0*78;?)\s*' .
            '(m|&#0*77;?|&#x0*4d;?|&#0*109;?|&#x0*6d;?)\s*' .
            '(l|&#0*76;?|&#x0*4c;?|&#0*108;?|&#x0*6c;?)\s*/i',

            /* <?import */
            '/<([^>a-z]*)\?([^>a-z]*)' .
            '(i|&#0*73;?|&#x0*49;?|&#0*105;?|&#x0*69;?)\s*' .
            '(m|&#0*77;?|&#x0*4d;?|&#0*109;?|&#x0*6d;?)\s*' .
            '(p|&#0*80;?|&#x0*50;?|&#0*112;?|&#x0*70;?)\s*' .
            '(o|&#0*79;?|&#x0*4f;?|&#0*111;?|&#x0*6f;?)\s*' .
            '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?)\s*' .
            '(t|&#0*84;?|&#x0*54;?|&#0*116;?|&#x0*74;?)\s*/i',

            /* <base
             * NOTE(review): "[^line]" is a character class (one character
             * other than l, i, n or e), not a negative lookahead. It is kept
             * exactly as in the original; confirm the intent before
             * changing it. */
            '/<([^>a-z]*)' .
            '(b|&#0*66;?|&#x0*42;?|&#0*98;?|&#x0*62;?)\s*' .
            '(a|&#0*65;?|&#x0*41;?|&#0*97;?|&#x0*61;?)\s*' .
            '(s|&#0*83;?|&#x0*53;?|&#0*115;?|&#x0*73;?)\s*' .
            '(e|&#0*69;?|&#x0*45;?|&#0*101;?|&#x0*65;?)\s*' .
            '[^line]/i',

            /* <meta */
            '/<([^>a-z]*)' .
            '(m|&#0*77;?|&#x0*4d;?|&#0*109;?|&#x0*6d;?)\s*' .
            '(e|&#0*69;?|&#x0*45;?|&#0*101;?|&#x0*65;?)\s*' .
            '(t|&#0*84;?|&#x0*54;?|&#0*116;?|&#x0*74;?)\s*' .
            '(a|&#0*65;?|&#x0*41;?|&#0*97;?|&#x0*61;?)\s*/i',

            /* <java */
            '/<([^>a-z]*)' .
            '(j|&#0*74;?|&#x0*4a;?|&#0*106;?|&#x0*6a;?)\s*' .
            '(a|&#0*65;?|&#x0*41;?|&#0*97;?|&#x0*61;?)\s*' .
            '(v|&#0*86;?|&#x0*56;?|&#0*118;?|&#x0*76;?)\s*' .
            '(a|&#0*65;?|&#x0*41;?|&#0*97;?|&#x0*61;?)\s*/i',

            /* <object */
            '/<([^>a-z]*)' .
            '(o|&#0*79;?|&#x0*4f;?|&#0*111;?|&#x0*6f;?)\s*' .
            '(b|&#0*66;?|&#x0*42;?|&#0*98;?|&#x0*62;?)\s*' .
            '(j|&#0*74;?|&#x0*4a;?|&#0*106;?|&#x0*6a;?)\s*' .
            '(e|&#0*69;?|&#x0*45;?|&#0*101;?|&#x0*65;?)\s*' .
            '(c|&#0*67;?|&#x0*43;?|&#0*99;?|&#x0*63;?)\s*' .
            '(t|&#0*84;?|&#x0*54;?|&#0*116;?|&#x0*74;?)\s*/i',

            /* <applet */
            '/<([^>a-z]*)' .
            '(a|&#0*65;?|&#x0*41;?|&#0*97;?|&#x0*61;?)\s*' .
            '(p|&#0*80;?|&#x0*50;?|&#0*112;?|&#x0*70;?)\s*' .
            '(p|&#0*80;?|&#x0*50;?|&#0*112;?|&#x0*70;?)\s*' .
            '(l|&#0*76;?|&#x0*4c;?|&#0*108;?|&#x0*6c;?)\s*' .
            '(e|&#0*69;?|&#x0*45;?|&#0*101;?|&#x0*65;?)\s*' .
            '(t|&#0*84;?|&#x0*54;?|&#0*116;?|&#x0*74;?)\s*/i',

            /* <layer */
            '/<([^>a-z]*)' .
            '(l|&#0*76;?|&#x0*4c;?|&#0*108;?|&#x0*6c;?)\s*' .
            '(a|&#0*65;?|&#x0*41;?|&#0*97;?|&#x0*61;?)\s*' .
            '(y|&#0*89;?|&#x0*59;?|&#0*121;?|&#x0*79;?)\s*' .
            '(e|&#0*69;?|&#x0*45;?|&#0*101;?|&#x0*65;?)\s*' .
            '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?)\s*/i',

            /* <frame and <iframe (the leading "i" is optional) */
            '/<([^>a-z]*)' .
            '(i|&#0*73;?|&#x0*49;?|&#0*105;?|&#x0*69;?)?\s*' .
            '(f|&#0*70;?|&#x0*46;?|&#0*102;?|&#x0*66;?)\s*' .
            '(r|&#0*82;?|&#x0*52;?|&#0*114;?|&#x0*72;?)\s*' .
            '(a|&#0*65;?|&#x0*41;?|&#0*97;?|&#x0*61;?)\s*' .
            '(m|&#0*77;?|&#x0*4d;?|&#0*109;?|&#x0*6d;?)\s*' .
            '(e|&#0*69;?|&#x0*45;?|&#0*101;?|&#x0*65;?)\s*/i');

        foreach ($malicious as $pattern) {
            $patterns[$pattern] = '<' . $this->_params['replace'] . '_tag';
        }

        /* Comment out style/link tags. */
        if ($this->_params['strip_styles']) {
            if ($this->_params['strip_style_attributes']) {
                $patterns['/(\s+|([\'"]))style\s*=/i'] = '$2 ' . $this->_params['replace'] . '=';
            }
            $patterns['|<style[^>]*>(?:\s*<\!--)*|i'] = '<!--';
            $patterns['|(?:-->\s*)*</style>|i'] = '-->';
            $patterns['|(<link[^>]*>)|i'] = '<!-- $1 -->';
        }

        /* A few other matches. */
        $patterns['|<([^>]*)&{.*}([^>]*)>|'] = '<\1&{;}\2>';
        $patterns['|<([^>]*)mocha:([^>]*)>|i'] = '<\1' . $this->_params['replace'] . ':\2>';
        $patterns['/<(([^>]*)|(style[^>]*>[^<]*))binding:((?(3)[^<]*<\/style)[^>]*)>/i'] = '<\1' . $this->_params['replace'] . ':\4>';

        return array('regexp' => $patterns);
    }

    /**
     * Executes any code necessary before applying the filter patterns.
     *
     * Raises pcre.backtrack_limit when the 'body_only' patterns are going to
     * be used, and remembers the previous value so that postProcess() can
     * put it back. The text itself is returned unmodified.
     *
     * @param string $text  The text before the filtering.
     *
     * @return string  The modified text.
     */
    function preProcess($text)
    {
        // As of PHP 5.2, backtrack limits have been set to an unreasonably
        // low number. The body check will often times trigger backtrack
        // errors so up the backtrack limit if we are doing this match.
        $this->_backtrackLimit = null;
        if ($this->_params['body_only']) {
            $wanted = '5000000';
            $current = ini_get('pcre.backtrack_limit');
            // ini_get() yields false when the setting does not exist. Never
            // lower a limit that is already at least as high as we need.
            if ($current && (int)$current < (int)$wanted &&
                ini_set('pcre.backtrack_limit', $wanted) !== false) {
                $this->_backtrackLimit = $current;
            }
        }

        return $text;
    }

    /**
     * Executes any code necessary after applying the filter patterns.
     *
     * Puts pcre.backtrack_limit back to the value it had before preProcess()
     * raised it. ini_restore() is deliberately not used: it resets the
     * setting to its startup value and would discard a limit that the
     * application configured at runtime. The text itself is returned
     * unmodified.
     *
     * @param string $text  The text after the filtering.
     *
     * @return string  The modified text.
     */
    function postProcess($text)
    {
        if ($this->_backtrackLimit !== null) {
            ini_set('pcre.backtrack_limit', $this->_backtrackLimit);
            $this->_backtrackLimit = null;
        }

        return $text;
    }

}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment