Skip to content

Instantly share code, notes, and snippets.

@hakre
Last active December 15, 2015 03:29
Show Gist options
  • Save hakre/5194634 to your computer and use it in GitHub Desktop.
Save hakre/5194634 to your computer and use it in GitHub Desktop.
Inspect and modify character encoding of an XML document based on XML Declaration and BOM.
<?php
/**
* XMLRecoder
*
* Utility class to extract and change encoding information stored
* inside an XML declaration and to recode an XML string
*
* Requires: iconv for string recoding (iconv is available by default)
* <http://php.net/iconv>
*
* For character set names valid in the XML Declaration see:
* <http://www.iana.org/assignments/character-sets/character-sets.xml>
*
* For character set names valid in iconv see:
* <http://www.gnu.org/software/libiconv/>
*
* When re-encoding an XML string that carries an XML declaration, use an
* encoding name that is valid in both lists: the same name is passed to
* iconv and written into the declaration.
*
* @author hakre <http://hakre.wordpress.com>
* @license AGPL-3.0 <http://spdx.org/licenses/AGPL-3.0>
*/
class XMLRecoder
{
    const BOM_UTF_8 = "\xEF\xBB\xBF";
    const BOM_UTF_32LE = "\xFF\xFE\x00\x00";
    const BOM_UTF_16LE = "\xFF\xFE";
    const BOM_UTF_32BE = "\x00\x00\xFE\xFF";
    const BOM_UTF_16BE = "\xFE\xFF";

    /**
     * PCRE pattern to access EncodingDecl, see <http://www.w3.org/TR/REC-xml/#sec-prolog-dtd>
     *
     * The outer "(" and ")" are the pattern delimiters, not a capture group.
     * Group 2 is the version number, group 4 (DECL_ENC_GROUP) the encoding name.
     * The pattern is anchored at byte 0, so a string that starts with a BOM
     * (or is stored in UTF-16/UTF-32) does not match.
     */
    const DECL_PATTERN = '(^<\?xml\s+version\s*=\s*(["\'])(1\.\d+)\1\s+encoding\s*=\s*(["\'])(((?!\3).)*)\3)';
    const DECL_ENC_GROUP = 4;

    /**
     * Declaration consisting of VersionInfo only. Group 3 (DECL_VER_GROUP) is the
     * optional whitespace in front of "?>".
     *
     * No longer used by this class (superseded by DECL_VERSION_INFO); kept so
     * that code referring to the constants continues to work.
     */
    const DECL_VERSION_ONLY = '(^<\?xml\s+version\s*=\s*(["\'])(1\.\d+)\1(\s*)\?>)';
    const DECL_VER_GROUP = 3;

    /**
     * Syntax check for an encoding name (EncName) written into the declaration.
     */
    const ENC_PATTERN = '(^[A-Za-z][A-Za-z0-9._-]*$)';

    /**
     * Matches "<?xml" plus VersionInfo of a declaration that has no EncodingDecl,
     * i.e. VersionInfo is followed either by the standalone declaration or
     * directly by "?>". The end of the match is where an EncodingDecl belongs.
     */
    const DECL_VERSION_INFO = '(^<\?xml\s+version\s*=\s*(["\'])(1\.\d+)\1(?=\s+standalone\s*=|\s*\?>))';

    /**
     * Recode an XML string with iconv and, if it starts with an XML declaration
     * that contains an encoding, rewrite that encoding name to the (upper-cased)
     * target encoding.
     *
     * The declaration is searched byte-wise at the very start of the string, so
     * it is only found when the data is ASCII-compatible and not preceded by a
     * BOM. A missing declaration is never added here; use
     * setEncodingDeclaration() for that.
     *
     * @param string|null $fromEncoding encoding to recode from. If NULL it is taken from the
     *                                  XML declaration, then from a BOM, and finally defaults to UTF-8
     * @param string      $toEncoding   encoding to recode into
     * @param string      $string       XML string to recode (may be empty or shorter than a BOM)
     *
     * @throws InvalidArgumentException if a declaration has to be rewritten and $toEncoding
     *                                  does not match ENC_PATTERN (e.g. "UTF-8//TRANSLIT")
     * @throws UnexpectedValueException if iconv fails to convert the string
     * @return string recoded string
     */
    public function recodeXMLString($fromEncoding, $toEncoding, $string)
    {
        if (null === $fromEncoding) {
            // only inspect the document when the caller did not name the source encoding
            $fromEncoding = $this->getEncodingDeclaration($string);
            if (null === $fromEncoding) {
                // prefix based probe: works for inputs shorter than four bytes as well
                $fromEncoding = $this->encodingFromBomPrefix((string) $string, 'UTF-8');
            }
        }

        $buffer = iconv($fromEncoding, $toEncoding, $string);
        if (false === $buffer) {
            throw new UnexpectedValueException(
                sprintf('Can not recode string from "%s" to "%s".', $fromEncoding, $toEncoding)
            );
        }

        // look at the recoded data: this is the text the declaration has to describe
        if (!preg_match(self::DECL_PATTERN, $buffer)) {
            return $buffer;
        }

        if (!preg_match(self::ENC_PATTERN, $toEncoding)) {
            throw new InvalidArgumentException(sprintf('Invalid target encoding for XML declaration: "%s"', $toEncoding));
        }

        $replaced = $this->replaceDeclaredEncoding($buffer, strtoupper($toEncoding));

        return null === $replaced ? $buffer : $replaced;
    }

    /**
     * Detect the encoding from a Byte Order Mark at the start of a string.
     *
     * @param string $string  string of at least four octets; only the first four are inspected
     * @param string $default (optional) what to return if no BOM is found
     *
     * @return string|null encoding name ("UTF-8", "UTF-16LE", "UTF-16BE", "UTF-32LE", "UTF-32BE")
     *                     or $default (NULL) if none is detected
     * @throws InvalidArgumentException if $string is shorter than four octets
     */
    public function detectEncodingViaBom($string, $default = null)
    {
        $len = strlen($string);
        if ($len < 4) {
            throw new InvalidArgumentException(sprintf("Need at least four characters, %d given.", $len));
        }

        return $this->encodingFromBomPrefix($string, $default);
    }

    /**
     * Remove a leading UTF-8 BOM, if there is one.
     *
     * @param string $buffer
     *
     * @return string $buffer without the leading UTF-8 BOM
     */
    public function removeUTF8Bom($buffer)
    {
        if (self::BOM_UTF_8 === substr($buffer, 0, 3)) {
            // cast: before PHP 8 substr() returns FALSE when the buffer is nothing but the BOM
            $buffer = (string) substr($buffer, 3);
        }

        return $buffer;
    }

    /**
     * get encoding attribute value from XML Declaration
     *
     * @param string      $string
     * @param string|null $default (optional) default value to return if no encoding is set
     *
     * @return string|null null ($default) if the string does not start with an XML
     *                     declaration that contains an encoding
     */
    public function getEncodingDeclaration($string, $default = null)
    {
        return preg_match(self::DECL_PATTERN, $string, $matches) ? $matches[self::DECL_ENC_GROUP] : $default;
    }

    /**
     * sets the XML Declaration encoding attribute value (EncName of EncodingDecl)
     *
     * - an existing encoding name is replaced,
     * - a declaration without encoding gets one inserted directly after the
     *   version (also when a standalone declaration follows),
     * - a string without declaration gets a version 1.0 declaration prepended.
     *
     * The string itself is not recoded. The name is written in upper-case.
     *
     * @param string $string
     * @param string $toEncoding
     *
     * @return string
     * @throws InvalidArgumentException if $toEncoding does not match ENC_PATTERN
     */
    public function setEncodingDeclaration($string, $toEncoding)
    {
        if (!preg_match(self::ENC_PATTERN, $toEncoding)) {
            throw new InvalidArgumentException(sprintf('Invalid target encoding for XML declaration: "%s"', $toEncoding));
        }

        $encName = strtoupper($toEncoding);

        $replaced = $this->replaceDeclaredEncoding($string, $encName);
        if (null !== $replaced) {
            return $replaced;
        }

        if (preg_match(self::DECL_VERSION_INFO, $string, $matches)) {
            // XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
            // the pattern is anchored at offset 0, so the match length is the insert position
            return substr_replace($string, sprintf(' encoding="%s"', $encName), strlen($matches[0]), 0);
        }

        return sprintf('<?xml version="1.0" encoding="%s"?>', $encName) . $string;
    }

    /**
     * Replace the encoding name of an existing EncodingDecl.
     *
     * @param string $string
     * @param string $encName name to write, taken as is
     *
     * @return string|null the changed string, NULL if $string has no declaration with an encoding
     */
    private function replaceDeclaredEncoding($string, $encName)
    {
        if (!preg_match(self::DECL_PATTERN, $string, $matches, PREG_OFFSET_CAPTURE)) {
            return null;
        }

        list($current, $offset) = $matches[self::DECL_ENC_GROUP];

        return substr_replace($string, $encName, $offset, strlen($current));
    }

    /**
     * Compare the start of a string against the known BOMs, without any
     * requirement on the length of the string.
     *
     * @param string      $string
     * @param string|null $default returned if the string does not start with a BOM
     *
     * @return string|null
     */
    private function encodingFromBomPrefix($string, $default)
    {
        // order matters: the UTF-32LE BOM starts with the UTF-16LE BOM and has to be probed first
        $boms = array(
            'UTF-16BE' => self::BOM_UTF_16BE,
            'UTF-8'    => self::BOM_UTF_8,
            'UTF-32LE' => self::BOM_UTF_32LE,
            'UTF-16LE' => self::BOM_UTF_16LE,
            'UTF-32BE' => self::BOM_UTF_32BE,
        );

        foreach ($boms as $encoding => $bom) {
            // strncmp() is binary safe and non-zero if $string is shorter than the BOM
            if (0 === strncmp($string, $bom, strlen($bom))) {
                return $encoding;
            }
        }

        return $default;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment