Skip to content

Instantly share code, notes, and snippets.

@ivuorinen
Created September 20, 2015 07:56
Show Gist options
  • Save ivuorinen/bce25b37d3d3aadd634e to your computer and use it in GitHub Desktop.
Save ivuorinen/bce25b37d3d3aadd634e to your computer and use it in GitHub Desktop.
psr2-fixed, docblocked and 80-char width safe version
<?php
/**
 * makeSafeEntities()
 *
 * Convert str to UTF-8 (if not already), then convert every non-ASCII
 * character to an HTML named entity or numbered reference.
 * Compare to native htmlentities() function. Unlike that function,
 * this will skip any already existing entities in the string.
 *
 * - mb_convert_encoding() doesn't encode ampersands, so
 *   makeAmpersandEntities() is used to convert those.
 * - mb_convert_encoding() won't usually convert to illegal
 *   numbered entities (128-159) unless there's a charset discrepancy,
 *   but just in case, they are corrected with correctIllegalEntities().
 *
 * Arrays are processed element by element (recursively); keys are kept
 * and $convertTags / $encoding are applied to every element.
 *
 * @author Cameron Clark <cameron@prolifique.com>
 * @see http://www.prolifique.com/entities.php.txt
 *
 * @param string|array $str         String, or array of strings, to convert
 * @param bool|int     $convertTags True to also convert ' " < >
 * @param string       $encoding    Source encoding; "" to auto-detect
 *
 * @return string|array '' for blank input (null, '', false, empty array)
 */
function makeSafeEntities($str, $convertTags = 0, $encoding = "")
{
    // empty() is deliberately avoided: it also treats the string "0" as
    // blank, which would silently discard real content.
    if ($str === null || $str === '' || $str === false || $str === array()) {
        return '';
    }

    if (is_array($str)) {
        $arrOutput = array();
        foreach ($str as $key => $value) {
            // Forward both options in their proper positions.
            $arrOutput[$key] = makeSafeEntities(
                $value,
                $convertTags,
                $encoding
            );
        }
        return $arrOutput;
    }

    $str = makeUTF8($str, $encoding);
    // NOTE(review): the "HTML-ENTITIES" pseudo-encoding is deprecated as
    // of PHP 8.2; it still works but emits a deprecation notice there.
    $str = mb_convert_encoding(
        $str,
        "HTML-ENTITIES",
        "UTF-8"
    );
    $str = makeAmpersandEntities($str);
    if ($convertTags) {
        $str = makeTagEntities($str);
    }

    return correctIllegalEntities($str);
}
/**
 * makeAllEntities()
 *
 * Convert str to UTF-8 (if not already), then convert every
 * non-whitespace character to an HTML numbered decimal entity.
 * If $useNamedEntities is set, non-ASCII characters are first converted
 * to named (and numbered) entities as in makeSafeEntities().
 * Unlike mb_convert_encoding(), mb_encode_numericentity() will NOT skip
 * any already existing entities in the string, so a regex is used to
 * leave them untouched.
 *
 * Arrays are processed element by element (recursively); keys are kept
 * and $useNamedEntities / $encoding are applied to every element.
 *
 * @author Cameron Clark <cameron@prolifique.com>
 * @see http://www.prolifique.com/entities.php.txt
 *
 * @param string|array $str              String(s) to convert
 * @param bool|int     $useNamedEntities True to use named entities
 * @param string       $encoding         Source encoding; "" to auto-detect
 *
 * @return string|array '' for blank input (null, '', false, empty array)
 */
function makeAllEntities($str, $useNamedEntities = 0, $encoding = "")
{
    // empty() is deliberately avoided: it also treats the string "0" as
    // blank, which would silently discard real content.
    if ($str === null || $str === '' || $str === false || $str === array()) {
        return '';
    }

    if (is_array($str)) {
        $arrOutput = array();
        foreach ($str as $key => $row) {
            // Forward both options in their proper positions.
            $arrOutput[$key] = makeAllEntities(
                $row,
                $useNamedEntities,
                $encoding
            );
        }
        return $arrOutput;
    }

    $str = makeUTF8($str, $encoding);
    if ($useNamedEntities) {
        // NOTE(review): the "HTML-ENTITIES" pseudo-encoding is deprecated
        // as of PHP 8.2; it still works but emits a deprecation notice.
        $str = mb_convert_encoding(
            $str,
            "HTML-ENTITIES",
            "UTF-8"
        );
    }
    $str = makeTagEntities($str, $useNamedEntities);

    // Encode the whole Unicode range. The mask must be wide enough for
    // 21-bit code points; a 0xFFFF mask would truncate anything above
    // U+FFFF to the wrong number.
    $convmap = array(0x0, 0x10FFFF, 0, 0x1FFFFF);

    mb_regex_encoding("UTF-8");
    // A callback replaces the former "e" (eval) option, which newer PHP
    // versions no longer accept. Because nothing is evaluated as code
    // any more, backslashes need no pre-escaping: they are encoded as
    // &#92; by the callback like every other character.
    $str = mb_ereg_replace_callback(
        "(?>(&(?:[a-z]{0,4}\w{2,3};|#\d{2,5};)))|(\S+?)",
        function ($match) use ($convmap) {
            // Group 1: an entity that already exists - keep verbatim.
            $entity = isset($match[1]) ? (string) $match[1] : '';
            if ($entity !== '') {
                return $entity;
            }
            // Group 2: one non-whitespace character to encode.
            $char = isset($match[2]) ? (string) $match[2] : '';
            if ($char === '') {
                return '';
            }
            return mb_encode_numericentity($char, $convmap, 'UTF-8');
        },
        $str,
        "im"
    );

    return correctIllegalEntities($str);
}
/**
 * makeTagEntities()
 *
 * Replace the four markup-significant characters ' " < > with entities.
 * The single quote always becomes the numbered entity &#39;; the other
 * three become named entities (&quot; &lt; &gt;) or numbered entities
 * (&#34; &#60; &#62;) depending on $useNamedEntities.
 *
 * @author Cameron Clark <cameron@prolifique.com>
 * @see http://www.prolifique.com/entities.php.txt
 *
 * @param string   $str              String to convert
 * @param bool|int $useNamedEntities True for named, false for numbered
 * @return string
 */
function makeTagEntities($str = '', $useNamedEntities = 1)
{
    // &apos; would be the proper named entity for the single quote, but
    // IE doesn't like it, so both variants use the numbered form.
    if ($useNamedEntities) {
        $entityFor = array(
            "'" => '&#39;',
            '"' => '&quot;',
            '<' => '&lt;',
            '>' => '&gt;'
        );
    } else {
        $entityFor = array(
            "'" => '&#39;',
            '"' => '&#34;',
            '<' => '&#60;',
            '>' => '&#62;'
        );
    }

    $search = array_keys($entityFor);
    $replace = array_values($entityFor);

    return str_replace($search, $replace, $str);
}
/**
 * makeAmpersandEntities()
 *
 * Replace bare ampersands with &amp; (named) or &#38; (numbered).
 * An ampersand that already looks like the start of an entity is left
 * alone so existing entities are not double-encoded.
 *
 * @author Cameron Clark <cameron@prolifique.com>
 * @see http://www.prolifique.com/entities.php.txt
 *
 * @param string   $str              String to convert
 * @param bool|int $useNamedEntities True for "&amp;", false for "&#38;"
 * @return string
 */
function makeAmpersandEntities($str = '', $useNamedEntities = 1)
{
    // The negative lookahead skips an "&" that is followed either by a
    // short entity name and ";" or by "#", two to five digits and ";".
    $bareAmpersand = "/&(?![A-Za-z]{0,4}\w{2,3};|#[0-9]{2,5};)/m";

    if ($useNamedEntities) {
        $entity = "&amp;";
    } else {
        $entity = "&#38;";
    }

    return preg_replace($bareAmpersand, $entity, $str);
}
/**
 * correctIllegalEntities()
 *
 * Convert illegal HTML numbered entities in the
 * range 128 - 159 to their legal counterparts.
 * Codes in that range without an entry in the table below
 * (129, 141, 143, 144, 157) are left untouched.
 *
 * @author Cameron Clark <cameron@prolifique.com>
 * @see http://www.prolifique.com/entities.php.txt
 *
 * @param string $str String to correct
 * @return string '' for blank input (null, '', false, empty array)
 */
function correctIllegalEntities($str = '')
{
    // empty() is deliberately avoided: it also treats the string "0" as
    // blank, which would turn "0" into ''.
    if ($str === null || $str === '' || $str === false || $str === array()) {
        return '';
    }

    // Illegal reference => legal reference.
    $legalFor = array(
        '&#128;' => '&#8364;',
        '&#130;' => '&#8218;',
        '&#131;' => '&#402;',
        '&#132;' => '&#8222;',
        '&#133;' => '&#8230;',
        '&#134;' => '&#8224;',
        '&#135;' => '&#8225;',
        '&#136;' => '&#710;',
        '&#137;' => '&#8240;',
        '&#138;' => '&#352;',
        '&#139;' => '&#8249;',
        '&#140;' => '&#338;',
        '&#142;' => '&#381;',
        '&#145;' => '&#8216;',
        '&#146;' => '&#8217;',
        '&#147;' => '&#8220;',
        '&#148;' => '&#8221;',
        '&#149;' => '&#8226;',
        '&#150;' => '&#8211;',
        '&#151;' => '&#8212;',
        '&#152;' => '&#732;',
        '&#153;' => '&#8482;',
        '&#154;' => '&#353;',
        '&#155;' => '&#8250;',
        '&#156;' => '&#339;',
        '&#158;' => '&#382;',
        '&#159;' => '&#376;'
    );

    // One call handles every pair; no replacement value is itself a key,
    // so the result equals replacing the pairs one at a time.
    return str_replace(
        array_keys($legalFor),
        array_values($legalFor),
        $str
    );
}
/**
 * makeUTF8()
 *
 * Return $str as UTF-8, converting it only when it is not UTF-8 already.
 * Compare to native utf8_encode function, which will re-encode text that
 * is already UTF-8.
 *
 * The source encoding is determined in this order:
 *  1. $encoding, when given;
 *  2. "UTF-8", when isUTF8() accepts the string;
 *  3. mb_detect_encoding() over UTF-8 and ISO-8859-1;
 *  4. "ISO-8859-1" as the last resort.
 *
 * @author Cameron Clark <cameron@prolifique.com>
 * @see http://www.prolifique.com/entities.php.txt
 *
 * @param string $str      String to convert
 * @param string $encoding Encoding $str is currently in; "" to detect
 *
 * @return string '' for blank input (null, '', false, empty array)
 */
function makeUTF8($str = '', $encoding = "")
{
    // empty() is deliberately avoided: it also treats the string "0" as
    // blank, which would turn "0" into ''.
    if ($str === null || $str === '' || $str === false || $str === array()) {
        return '';
    }

    if (empty($encoding) && isUTF8($str)) {
        $encoding = "UTF-8";
    }
    if (empty($encoding)) {
        // Strict mode: without it, malformed byte sequences can still be
        // reported as UTF-8 and the string would be returned unconverted.
        $encoding = mb_detect_encoding(
            $str,
            'UTF-8, ISO-8859-1',
            true
        );
    }
    if (empty($encoding)) {
        // If charset can't be detected, default to ISO-8859-1
        $encoding = "ISO-8859-1";
    }

    if ((string) $encoding === "UTF-8") {
        return $str;
    }

    // Warnings stay suppressed on purpose: conversion is best effort.
    return @mb_convert_encoding(
        $str,
        "UTF-8",
        $encoding
    );
}
/**
 * isUTF8()
 *
 * Much simpler UTF-8-ness checker using a regular expression created
 * by the W3C: Returns true if $str is valid UTF-8 and false otherwise.
 * From http://w3.org/International/questions/qa-forms-utf-8.html
 *
 * Note that the pattern is stricter than "well-formed UTF-8": of the
 * single-byte characters it only accepts tab, LF, CR and 0x20-0x7E, so
 * strings containing other ASCII control characters are rejected.
 *
 * @author Cameron Clark <cameron@prolifique.com>
 * @see http://www.prolifique.com/entities.php.txt
 *
 * @param string $str String to test
 * @return boolean True on a match; false on no match or a regex error
 */
function isUTF8($str = '')
{
    // preg_match() yields 1, 0 or false; comparing against 1 gives the
    // documented boolean and reports a regex failure as "not UTF-8".
    return 1 === preg_match(
        '%^(?:
        [\x09\x0A\x0D\x20-\x7E] # ASCII
        | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
        | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte
        | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
        | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
        | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
        )*$%xs',
        $str
    );
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment