Skip to content

Instantly share code, notes, and snippets.

@amad
Created February 8, 2013 13:03
Show Gist options
  • Save amad/4738933 to your computer and use it in GitHub Desktop.
Save amad/4738933 to your computer and use it in GitHub Desktop.
Purge is standalone version of well-known Codeigniter xss_clean. just copy and pasted from CI to make it work out of box. Purge::xss_clean($data);
<?php
Class Purge {
static $instance = false;
/**
* IP address of the current user
*
* @var string
*/
var $ip_address = FALSE;
/**
* user agent (web browser) being used by the current user
*
* @var string
*/
var $user_agent = FALSE;
/**
* List of all HTTP request headers
*
* @var array
*/
protected $headers = array();
/**
* List of never allowed strings
*
* @var array
* @access protected
*/
/**
* Random Hash for protecting URLs
*
* @var string
* @access protected
*/
static $_xss_hash = '';
static protected $_never_allowed_str = array(
'document.cookie' => '[removed]',
'document.write' => '[removed]',
'.parentNode' => '[removed]',
'.innerHTML' => '[removed]',
'window.location' => '[removed]',
'-moz-binding' => '[removed]',
'<!--' => '&lt;!--',
'-->' => '--&gt;',
'<![CDATA[' => '&lt;![CDATA[',
'<comment>' => '&lt;comment&gt;'
);
/* never allowed, regex replacement */
/**
* List of never allowed regex replacement
*
* @var array
* @access protected
*/
static protected $_never_allowed_regex = array(
'javascript\s*:',
'expression\s*(\(|&\#40;)', // CSS and IE
'vbscript\s*:', // IE, surprise!
'Redirect\s+302',
"([\"'])?data\s*:[^\\1]*?base64[^\\1]*?,[^\\1]*?\\1?"
);
public function &init()
{
if (self::$instance === false) {
self::$instance = new self();
}
return self::$instance;
}
/**
* Fetch from array
*
* This is a helper function to retrieve values from global arrays
*
* @param array $array
* @param string $index
* @return string
*/
private function _fetch_from_array(&$array, $index='')
{
if ( ! isset($array[$index]))
{
return FALSE;
}
return self::xss_clean($array[$index]);
}
/**
* Fetch an item from the GET array
*
* @param string $index
* @return string
*/
public function get($index = NULL)
{
// Check if a field has been provided
if ($index === NULL AND ! empty($_GET))
{
$get = array();
// loop through the full _GET array
foreach (array_keys($_GET) as $key)
{
$get[$key] = self::_fetch_from_array($_GET, $key);
}
return $get;
}
return self::_fetch_from_array($_GET, $index);
}
/**
* Fetch an item from the POST array
*
* @param string $string
* @return string
*/
public function post($index = NULL)
{
// Check if a field has been provided
if ($index === NULL AND ! empty($_POST))
{
$post = array();
// Loop through the full _POST array and return it
foreach (array_keys($_POST) as $key)
{
$post[$key] = self::_fetch_from_array($_POST, $key);
}
return $post;
}
return self::_fetch_from_array($_POST, $index);
}
/**
* Fetch an item from either the GET array or the POST
*
* @param string $index
* @return string
*/
public function get_post($index = '')
{
if ( ! isset($_POST[$index]) )
{
return self::get($index);
}
else
{
return self::post($index);
}
}
/**
* Fetch an item from the COOKIE array
*
* @param string $index
* @return string
*/
public function cookie($index = '')
{
return self::_fetch_from_array($_COOKIE, $index);
}
/**
* Fetch an item from the SERVER array
*
* @param string $index
* @return string
*/
public function server($index = '')
{
return self::_fetch_from_array($_SERVER, $index);
}
/**
* User Agent
*
* @return string
*/
public function user_agent()
{
if (self::$user_agent !== FALSE)
{
return self::$user_agent;
}
self::$user_agent = ( ! isset($_SERVER['HTTP_USER_AGENT'])) ? FALSE : $_SERVER['HTTP_USER_AGENT'];
return self::$user_agent;
}
/**
* Sanitize Globals
*
* This function does the following:
*
* Unsets $_GET data (if query strings are not enabled)
*
* Unsets all globals if register_globals is enabled
*
* Standardizes newline characters to \n
*
* @access private
* @return void
*/
function _sanitize_globals()
{
// Clean $_GET Data
if (is_array($_GET) AND count($_GET) > 0)
{
foreach ($_GET as $key => $val)
{
$_GET[self::_clean_input_keys($key)] = self::_clean_input_data($val);
}
}
// Clean $_POST Data
if (is_array($_POST) AND count($_POST) > 0)
{
foreach ($_POST as $key => $val)
{
$_POST[self::_clean_input_keys($key)] = self::_clean_input_data($val);
}
}
// Sanitize PHP_SELF
$_SERVER['PHP_SELF'] = strip_tags($_SERVER['PHP_SELF']);
}
/**
* XSS Clean
*
* Sanitizes data so that Cross Site Scripting Hacks can be
* prevented. This function does a fair amount of work but
* it is extremely thorough, designed to prevent even the
* most obscure XSS attempts. Nothing is ever 100% foolproof,
* of course, but I haven't been able to get anything passed
* the filter.
*
* Note: This function should only be used to deal with data
* upon submission. It's not something that should
* be used for general runtime processing.
*
* This function was based in part on some code and ideas I
* got from Bitflux: http://channel.bitflux.ch/wiki/XSS_Prevention
*
* To help develop this script I used this great list of
* vulnerabilities along with a few other hacks I've
* harvested from examining vulnerabilities in other programs:
* http://ha.ckers.org/xss.html
*
* @param mixed string or array
* @param bool
* @return string
*/
public function xss_clean($str, $is_image = FALSE)
{
if (self::$instance === false) {
self::init();
}
/*
* Is the string an array?
*
*/
if (is_array($str))
{
while (list($key) = each($str))
{
$str[$key] = self::xss_clean($str[$key]);
}
return $str;
}
/*
* Remove Invisible Characters
*/
$str = self::remove_invisible_characters($str);
// Validate Entities in URLs
$str = self::_validate_entities($str);
/*
* URL Decode
*
* Just in case stuff like this is submitted:
*
* <a href="http://%77%77%77%2E%67%6F%6F%67%6C%65%2E%63%6F%6D">Google</a>
*
* Note: Use rawurldecode() so it does not remove plus signs
*
*/
$str = rawurldecode($str);
/*
* Convert character entities to ASCII
*
* This permits our tests below to work reliably.
* We only convert entities that are within tags since
* these are the ones that will pose security problems.
*
*/
$str = preg_replace_callback("/[a-z]+=([\'\"]).*?\\1/si", array(self::$instance, '_convert_attribute'), $str);
$str = preg_replace_callback("/<\w+.*?(?=>|<|$)/si", array(self::$instance, '_decode_entity'), $str);
/*
* Remove Invisible Characters Again!
*/
$str = self::remove_invisible_characters($str);
/*
* Convert all tabs to spaces
*
* This prevents strings like this: ja vascript
* NOTE: we deal with spaces between characters later.
* NOTE: preg_replace was found to be amazingly slow here on
* large blocks of data, so we use str_replace.
*/
if (strpos($str, "\t") !== FALSE)
{
$str = str_replace("\t", ' ', $str);
}
/*
* Capture converted string for later comparison
*/
$converted_string = $str;
// Remove Strings that are never allowed
$str = self::_do_never_allowed($str);
/*
* Makes PHP tags safe
*
* Note: XML tags are inadvertently replaced too:
*
* <?xml
*
* But it doesn't seem to pose a problem.
*/
if ($is_image === TRUE)
{
// Images have a tendency to have the PHP short opening and
// closing tags every so often so we skip those and only
// do the long opening tags.
$str = preg_replace('/<\?(php)/i', "&lt;?\\1", $str);
}
else
{
$str = str_replace(array('<?', '?'.'>'), array('&lt;?', '?&gt;'), $str);
}
/*
* Compact any exploded words
*
* This corrects words like: j a v a s c r i p t
* These words are compacted back to their correct state.
*/
$words = array(
'javascript', 'expression', 'vbscript', 'script', 'base64',
'applet', 'alert', 'document', 'write', 'cookie', 'window'
);
foreach ($words as $word)
{
$temp = '';
for ($i = 0, $wordlen = strlen($word); $i < $wordlen; $i++)
{
$temp .= substr($word, $i, 1)."\s*";
}
// We only want to do this when it is followed by a non-word character
// That way valid stuff like "dealer to" does not become "dealerto"
$str = preg_replace_callback('#('.substr($temp, 0, -3).')(\W)#is', array(self::$instance, '_compact_exploded_words'), $str);
}
/*
* Remove disallowed Javascript in links or img tags
* We used to do some version comparisons and use of stripos for PHP5,
* but it is dog slow compared to these simplified non-capturing
* preg_match(), especially if the pattern exists in the string
*/
do
{
$original = $str;
if (preg_match("/<a/i", $str))
{
$str = preg_replace_callback("#<a\s+([^>]*?)(>|$)#si", array(self::$instance, '_js_link_removal'), $str);
}
if (preg_match("/<img/i", $str))
{
$str = preg_replace_callback("#<img\s+([^>]*?)(\s?/?>|$)#si", array(self::$instance, '_js_img_removal'), $str);
}
if (preg_match("/script/i", $str) OR preg_match("/xss/i", $str))
{
$str = preg_replace("#<(/*)(script|xss)(.*?)\>#si", '[removed]', $str);
}
}
while($original != $str);
unset($original);
// Remove evil attributes such as style, onclick and xmlns
$str = self::_remove_evil_attributes($str, $is_image);
/*
* Sanitize naughty HTML elements
*
* If a tag containing any of the words in the list
* below is found, the tag gets converted to entities.
*
* So this: <blink>
* Becomes: &lt;blink&gt;
*/
$naughty = 'alert|applet|audio|basefont|base|behavior|bgsound|blink|body|embed|expression|form|frameset|frame|head|html|ilayer|iframe|input|isindex|layer|link|meta|object|plaintext|style|script|textarea|title|video|xml|xss';
$str = preg_replace_callback('#<(/*\s*)('.$naughty.')([^><]*)([><]*)#is', array(self::$instance, '_sanitize_naughty_html'), $str);
/*
* Sanitize naughty scripting elements
*
* Similar to above, only instead of looking for
* tags it looks for PHP and JavaScript commands
* that are disallowed. Rather than removing the
* code, it simply converts the parenthesis to entities
* rendering the code un-executable.
*
* For example: eval('some code')
* Becomes: eval&#40;'some code'&#41;
*/
$str = preg_replace('#(alert|cmd|passthru|eval|exec|expression|system|fopen|fsockopen|file|file_get_contents|readfile|unlink)(\s*)\((.*?)\)#si', "\\1\\2&#40;\\3&#41;", $str);
// Final clean up
// This adds a bit of extra precaution in case
// something got through the above filters
$str = self::_do_never_allowed($str);
/*
* Images are Handled in a Special Way
* - Essentially, we want to know that after all of the character
* conversion is done whether any unwanted, likely XSS, code was found.
* If not, we return TRUE, as the image is clean.
* However, if the string post-conversion does not matched the
* string post-removal of XSS, then it fails, as there was unwanted XSS
* code found and removed/changed during processing.
*/
if ($is_image === TRUE)
{
return ($str == $converted_string) ? TRUE: FALSE;
}
return $str;
}
/**
* Random Hash for protecting URLs
*
* @return string
*/
public function xss_hash()
{
mt_srand();
self::$_xss_hash = md5(time() + mt_rand(0, 1999999999));
return self::$_xss_hash;
}
/**
* Remove Invisible Characters
*
* This prevents sandwiching null characters
* between ascii characters, like Java\0script.
*
* @access public
* @param string
* @return string
*/
public function remove_invisible_characters($str, $url_encoded = TRUE)
{
$non_displayables = array();
// every control character except newline (dec 10)
// carriage return (dec 13), and horizontal tab (dec 09)
if ($url_encoded)
{
$non_displayables[] = '/%0[0-8bcef]/'; // url encoded 00-08, 11, 12, 14, 15
$non_displayables[] = '/%1[0-9a-f]/'; // url encoded 16-31
}
$non_displayables[] = '/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/S'; // 00-08, 11, 12, 14-31, 127
do
{
$str = preg_replace($non_displayables, '', $str, -1, $count);
}
while ($count);
return $str;
}
/**
* Validate URL entities
*
* Called by xss_clean()
*
* @param string
* @return string
*/
protected function _validate_entities($str)
{
/*
* Protect GET variables in URLs
*/
// 901119URL5918AMP18930PROTECT8198
$str = preg_replace('|\&([a-z\_0-9\-]+)\=([a-z\_0-9\-]+)|i', self::xss_hash()."\\1=\\2", $str);
/*
* Validate standard character entities
*
* Add a semicolon if missing. We do this to enable
* the conversion of entities to ASCII later.
*
*/
$str = preg_replace('#(&\#?[0-9a-z]{2,})([\x00-\x20])*;?#i', "\\1;\\2", $str);
/*
* Validate UTF16 two byte encoding (x00)
*
* Just as above, adds a semicolon if missing.
*
*/
$str = preg_replace('#(&\#x?)([0-9A-F]+);?#i',"\\1\\2;",$str);
/*
* Un-Protect GET variables in URLs
*/
$str = str_replace(self::xss_hash(), '&', $str);
return $str;
}
/**
* Attribute Conversion
*
* Used as a callback for XSS Clean
*
* @param array
* @return string
*/
protected function _convert_attribute($match)
{
return str_replace(array('>', '<', '\\'), array('&gt;', '&lt;', '\\\\'), $match[0]);
}
/**
* HTML Entity Decode Callback
*
* Used as a callback for XSS Clean
*
* @param array
* @return string
*/
protected function _decode_entity($match)
{
return self::entity_decode($match[0], 'UTF-8');
}
/**
* HTML Entities Decode
*
* This function is a replacement for html_entity_decode()
*
* The reason we are not using html_entity_decode() by itself is because
* while it is not technically correct to leave out the semicolon
* at the end of an entity most browsers will still interpret the entity
* correctly. html_entity_decode() does not convert entities without
* semicolons, so we are left with our own little solution here. Bummer.
*
* @param string
* @param string
* @return string
*/
public function entity_decode($str, $charset='UTF-8')
{
if (stristr($str, '&') === FALSE)
{
return $str;
}
$str = html_entity_decode($str, ENT_COMPAT, $charset);
$str = preg_replace('~&#x(0*[0-9a-f]{2,5})~ei', 'chr(hexdec("\\1"))', $str);
return preg_replace('~&#([0-9]{2,4})~e', 'chr(\\1)', $str);
}
/**
* Do Never Allowed
*
* A utility function for xss_clean()
*
* @param string
* @return string
*/
protected function _do_never_allowed($str)
{
$str = str_replace(array_keys(self::$_never_allowed_str), self::$_never_allowed_str, $str);
foreach (self::$_never_allowed_regex as $regex)
{
$str = preg_replace('#'.$regex.'#is', '[removed]', $str);
}
return $str;
}
/**
* Compact Exploded Words
*
* Callback function for xss_clean() to remove whitespace from
* things like j a v a s c r i p t
*
* @param type
* @return type
*/
protected function _compact_exploded_words($matches)
{
return preg_replace('/\s+/s', '', $matches[1]).$matches[2];
}
/**
* Sanitize Naughty HTML
*
* Callback function for xss_clean() to remove naughty HTML elements
*
* @param array
* @return string
*/
protected function _sanitize_naughty_html($matches)
{
// encode opening brace
$str = '&lt;'.$matches[1].$matches[2].$matches[3];
// encode captured opening or closing brace to prevent recursive vectors
$str .= str_replace(array('>', '<'), array('&gt;', '&lt;'),
$matches[4]);
return $str;
}
/**
* JS Link Removal
*
* Callback function for xss_clean() to sanitize links
* This limits the PCRE backtracks, making it more performance friendly
* and prevents PREG_BACKTRACK_LIMIT_ERROR from being triggered in
* PHP 5.2+ on link-heavy strings
*
* @param array
* @return string
*/
protected function _js_link_removal($match)
{
return str_replace(
$match[1],
preg_replace(
'#href=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|data\s*:)#si',
'',
self::_filter_attributes(str_replace(array('<', '>'), '', $match[1]))
),
$match[0]
);
}
/**
* JS Image Removal
*
* Callback function for xss_clean() to sanitize image tags
* This limits the PCRE backtracks, making it more performance friendly
* and prevents PREG_BACKTRACK_LIMIT_ERROR from being triggered in
* PHP 5.2+ on image tag heavy strings
*
* @param array
* @return string
*/
protected function _js_img_removal($match)
{
return str_replace(
$match[1],
preg_replace(
'#src=.*?(alert\(|alert&\#40;|javascript\:|livescript\:|mocha\:|charset\=|window\.|document\.|\.cookie|<script|<xss|base64\s*,)#si',
'',
self::_filter_attributes(str_replace(array('<', '>'), '', $match[1]))
),
$match[0]
);
}
/**
* Filter Attributes
*
* Filters tag attributes for consistency and safety
*
* @param string
* @return string
*/
protected function _filter_attributes($str)
{
$out = '';
if (preg_match_all('#\s*[a-z\-]+\s*=\s*(\042|\047)([^\\1]*?)\\1#is', $str, $matches))
{
foreach ($matches[0] as $match)
{
$out .= preg_replace("#/\*.*?\*/#s", '', $match);
}
}
return $out;
}
/*
* Remove Evil HTML Attributes (like evenhandlers and style)
*
* It removes the evil attribute and either:
* - Everything up until a space
* For example, everything between the pipes:
* <a |style=document.write('hello');alert('world');| class=link>
* - Everything inside the quotes
* For example, everything between the pipes:
* <a |style="document.write('hello'); alert('world');"| class="link">
*
* @param string $str The string to check
* @param boolean $is_image TRUE if this is an image
* @return string The string with the evil attributes removed
*/
protected function _remove_evil_attributes($str, $is_image)
{
// All javascript event handlers (e.g. onload, onclick, onmouseover), style, and xmlns
$evil_attributes = array('on\w*', 'style', 'xmlns', 'formaction');
if ($is_image === TRUE)
{
/*
* Adobe Photoshop puts XML metadata into JFIF images,
* including namespacing, so we have to allow this for images.
*/
unset($evil_attributes[array_search('xmlns', $evil_attributes)]);
}
do {
$count = 0;
$attribs = array();
// find occurrences of illegal attribute strings without quotes
preg_match_all('/('.implode('|', $evil_attributes).')\s*=\s*([^\s>]*)/is', $str, $matches, PREG_SET_ORDER);
foreach ($matches as $attr)
{
$attribs[] = preg_quote($attr[0], '/');
}
// find occurrences of illegal attribute strings with quotes (042 and 047 are octal quotes)
preg_match_all("/(".implode('|', $evil_attributes).")\s*=\s*(\042|\047)([^\\2]*?)(\\2)/is", $str, $matches, PREG_SET_ORDER);
foreach ($matches as $attr)
{
$attribs[] = preg_quote($attr[0], '/');
}
// replace illegal attribute strings that are inside an html tag
if (count($attribs) > 0)
{
$str = preg_replace("/<(\/?[^><]+?)([^A-Za-z<>\-])(.*?)(".implode('|', $attribs).")(.*?)([\s><])([><]*)/i", '<$1 $3$5$6$7', $str, -1, $count);
}
} while ($count);
return $str;
}
/**
* Is ajax Request?
*
* Test to see if a request contains the HTTP_X_REQUESTED_WITH header
*
* @return boolean
*/
public function is_ajax_request()
{
return (self::server('HTTP_X_REQUESTED_WITH') === 'XMLHttpRequest');
}
/**
* Is cli Request?
*
* Test to see if a request was made from the command line
*
* @return bool
*/
public function is_cli_request()
{
return (php_sapi_name() === 'cli' OR defined('STDIN'));
}
}
@amad
Copy link
Author

amad commented Feb 8, 2013

Purge::xss_clean($data);

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment