Last active
December 15, 2015 03:29
-
-
Save hakre/5194634 to your computer and use it in GitHub Desktop.
Inspect and modify character encoding of an XML document based on XML Declaration and BOM.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * XMLRecoder
 *
 * Utility class to extract and change the encoding information stored
 * inside an XML declaration and to recode an XML string.
 *
 * Requires: iconv for string recoding (iconv is available by default)
 * <http://php.net/iconv>
 *
 * For character set names valid in the XML Declaration see:
 * <http://www.iana.org/assignments/character-sets/character-sets.xml>
 *
 * For character set names valid in iconv see:
 * <http://www.gnu.org/software/libiconv/>
 *
 * When re-encoding an XML string that carries an XML declaration, the
 * target encoding name must be valid in both of the lists above.
 *
 * @author hakre <http://hakre.wordpress.com>
 * @license AGPL-3.0 <http://spdx.org/licenses/AGPL-3.0>
 */
class XMLRecoder
{
    const BOM_UTF_8    = "\xEF\xBB\xBF";
    const BOM_UTF_32LE = "\xFF\xFE\x00\x00";
    const BOM_UTF_16LE = "\xFF\xFE";
    const BOM_UTF_32BE = "\x00\x00\xFE\xFF";
    const BOM_UTF_16BE = "\xFE\xFF";

    /**
     * pcre pattern to access EncodingDecl, see <http://www.w3.org/TR/REC-xml/#sec-prolog-dtd>
     *
     * The outer parentheses are the pattern delimiters, not a capture group.
     */
    const DECL_PATTERN = '(^<\?xml\s+version\s*=\s*(["\'])(1\.\d+)\1\s+encoding\s*=\s*(["\'])(((?!\3).)*)\3)';

    /**
     * capture group of DECL_PATTERN holding the encoding name (EncName)
     */
    const DECL_ENC_GROUP = 4;

    /**
     * pcre pattern for a declaration that has a version and nothing else; group DECL_VER_GROUP is
     * the whitespace in front of "?>". No longer used by this class, kept for backward compatibility.
     */
    const DECL_VERSION_ONLY = '(^<\?xml\s+version\s*=\s*(["\'])(1\.\d+)\1(\s*)\?>)';
    const DECL_VER_GROUP    = 3;

    /**
     * pcre pattern matching a declaration up to and including its version attribute, regardless of
     * what follows (nothing, whitespace or e.g. standalone="yes")
     */
    const DECL_VERSION_INFO = '(^<\?xml\s+version\s*=\s*(["\'])1\.\d+\1(?=\s|\?>))';

    /**
     * pcre pattern of a valid EncName; the D modifier stops "$" from accepting a trailing newline
     */
    const ENC_PATTERN = '(^[A-Za-z][A-Za-z0-9._-]*$)D';

    /**
     * Recode an XML string and keep the encoding named in its XML declaration in sync.
     *
     * The string is converted to UTF-8 first, the declaration is rewritten on that ASCII-compatible
     * form and only then the final conversion is done. This way the declaration is also found and
     * updated when the source or the target encoding is UTF-16 or UTF-32.
     *
     * A byte order mark at the start of the input is dropped; whether the output starts with one
     * is left to iconv and the target encoding (e.g. "UTF-16" adds a BOM, "UTF-16LE" does not).
     *
     * A declaration is only rewritten if it already names an encoding, none is added.
     *
     * @param string|null $fromEncoding encoding to recode from. if not specified (NULL) it is taken from the
     *                                  BOM, then from the XML declaration, and defaults to UTF-8
     * @param string      $toEncoding   encoding to recode into
     * @param string      $string       XML document
     *
     * @throws UnexpectedValueException if iconv can not convert the string
     * @throws InvalidArgumentException if a declaration has to be rewritten but $toEncoding is not a valid EncName
     * @return string
     */
    public function recodeXMLString($fromEncoding, $toEncoding, $string)
    {
        $buffer = (string) $string;

        if (null === $fromEncoding) {
            $fromEncoding = $this->guessSourceEncoding($buffer);
        }

        $failure = sprintf('Can not recode string from "%s" to "%s".', $fromEncoding, $toEncoding);

        // hand iconv flags like //IGNORE or //TRANSLIT to the first pass as well, so that input
        // which converts with these flags still does
        $flagsAt = strpos($toEncoding, '//');
        $flags   = false === $flagsAt ? '' : substr($toEncoding, $flagsAt);

        $utf8 = iconv($fromEncoding, 'UTF-8' . $flags, $buffer);
        if (false === $utf8) {
            throw new UnexpectedValueException($failure);
        }

        // a BOM that survived the conversion would hide the declaration from the anchored pattern
        // and can not be represented in non-Unicode target encodings
        $utf8 = $this->removeUTF8Bom($utf8);

        $hasEncodingDecl = 1 === preg_match(self::DECL_PATTERN, $utf8, $matches, PREG_OFFSET_CAPTURE);
        $nameIsValid     = 1 === preg_match(self::ENC_PATTERN, $toEncoding);

        if ($hasEncodingDecl && $nameIsValid) {
            list($declared, $offset) = $matches[self::DECL_ENC_GROUP];
            $utf8 = substr_replace($utf8, strtoupper($toEncoding), $offset, strlen($declared));
        }

        $recoded = 0 === strcasecmp('UTF-8', $toEncoding) ? $utf8 : iconv('UTF-8', $toEncoding, $utf8);
        if (false === $recoded) {
            throw new UnexpectedValueException($failure);
        }

        // reported last so that a failing conversion keeps precedence over an invalid EncName
        if ($hasEncodingDecl && !$nameIsValid) {
            throw new InvalidArgumentException(sprintf('Invalid target encoding for XML declaration: "%s"', $toEncoding));
        }

        return $recoded;
    }

    /**
     * Detect the encoding of a string by the byte order mark it starts with.
     *
     * @param string $string  string, at least four octets long (longer strings are fine, only the start is inspected)
     * @param string $default (optional) what to return if no BOM is found
     *
     * @return string Encoding, if it can not be detected $default (NULL)
     * @throws InvalidArgumentException if $string is shorter than four octets
     */
    public function detectEncodingViaBom($string, $default = NULL)
    {
        $len = strlen($string);
        if ($len < 4) {
            throw new InvalidArgumentException(sprintf("Need at least four characters, %d given.", $len));
        }

        list($encoding) = $this->sniffBom($string);

        return null === $encoding ? $default : $encoding;
    }

    /**
     * Remove a leading UTF-8 byte order mark, if there is one.
     *
     * @param string $buffer
     *
     * @return string
     */
    public function removeUTF8Bom($buffer)
    {
        if (self::BOM_UTF_8 === substr($buffer, 0, 3)) {
            // cast: before PHP 7 substr() gives FALSE if the buffer is nothing but the BOM
            $buffer = (string) substr($buffer, 3);
        }

        return $buffer;
    }

    /**
     * get encoding attribute value from XML Declaration
     *
     * A leading UTF-8 BOM is skipped. The string has to be in an ASCII-compatible encoding,
     * a declaration inside an UTF-16 or UTF-32 encoded string is not found.
     *
     * @param string      $string
     * @param string|null $default (optional) default value to return if no encoding is set
     *
     * @return string|null null ($default) if encoding does not exist in the XML declaration
     */
    public function getEncodingDeclaration($string, $default = NULL)
    {
        if (preg_match(self::DECL_PATTERN, $this->removeUTF8Bom($string), $matches)) {
            return $matches[self::DECL_ENC_GROUP];
        }

        return $default;
    }

    /**
     * sets the XML Declaration encoding attribute value (EncName of EncodingDecl)
     *
     * An existing encoding is replaced. If the declaration has none, it is inserted right after
     * the version (in front of standalone, if present). If there is no declaration at all, one
     * is prepended. A leading UTF-8 BOM stays the very first thing in the string.
     *
     * The bytes of the string are not recoded, and the string has to be in an ASCII-compatible
     * encoding for an existing declaration to be found.
     *
     * @param string $string
     * @param string $toEncoding written in upper-case
     *
     * @return string
     * @throws InvalidArgumentException if $toEncoding is not a valid EncName
     */
    public function setEncodingDeclaration($string, $toEncoding)
    {
        if (!preg_match(self::ENC_PATTERN, $toEncoding)) {
            throw new InvalidArgumentException(sprintf('Invalid target encoding for XML declaration: "%s"', $toEncoding));
        }

        $encName = strtoupper($toEncoding);
        $body    = $this->removeUTF8Bom($string);
        $bom     = strlen($body) < strlen($string) ? self::BOM_UTF_8 : '';

        if (preg_match(self::DECL_PATTERN, $body, $matches, PREG_OFFSET_CAPTURE)) {
            list($declared, $offset) = $matches[self::DECL_ENC_GROUP];

            return $bom . substr_replace($body, $encName, $offset, strlen($declared));
        }

        if (preg_match(self::DECL_VERSION_INFO, $body, $matches)) {
            // EncodingDecl has to follow VersionInfo directly
            return $bom . substr_replace($body, sprintf(' encoding="%s"', $encName), strlen($matches[0]), 0);
        }

        return $bom . sprintf('<?xml version="1.0" encoding="%s"?>', $encName) . $body;
    }

    /**
     * Pick the source encoding for a string: BOM first, then XML declaration, then UTF-8.
     *
     * @param string $buffer
     *
     * @return string
     */
    private function guessSourceEncoding($buffer)
    {
        list($encoding) = $this->sniffBom($buffer);
        if (null !== $encoding) {
            return $encoding;
        }

        return $this->getEncodingDeclaration($buffer, 'UTF-8');
    }

    /**
     * Look for a byte order mark at the start of a string of any length.
     *
     * @param string $string
     *
     * @return array encoding name (NULL if there is no BOM) and length of the BOM in octets
     */
    private function sniffBom($string)
    {
        // UTF-32LE has to be tested before UTF-16LE, its BOM starts with the UTF-16LE one
        $signatures = array(
            'UTF-32LE' => self::BOM_UTF_32LE,
            'UTF-32BE' => self::BOM_UTF_32BE,
            'UTF-8'    => self::BOM_UTF_8,
            'UTF-16LE' => self::BOM_UTF_16LE,
            'UTF-16BE' => self::BOM_UTF_16BE,
        );

        foreach ($signatures as $encoding => $bom) {
            if (0 === strncmp($string, $bom, strlen($bom))) {
                return array($encoding, strlen($bom));
            }
        }

        return array(null, 0);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment