Created
June 26, 2014 10:41
-
-
Save jasny/4ee1f046400e672e2d39 to your computer and use it in GitHub Desktop.
Stream filter to convert HTML5 to XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Stream filter to convert HTML5 to XML.
 *
 * Supported filter names (see onCreate()):
 *  - `htmltoxml`            decode HTML entities and self close void elements
 *  - `htmltoxml.entities`   decode HTML entities only
 *  - `htmltoxml.selfclose`  self close void elements only
 *
 * <code>
 * $dsn = "php://filter/read=htmltoxml/resource=" . $url;
 * $xml = XMLReader::open($dsn);
 * </code>
 *
 * The conversion is regex based, not a real HTML parser: a `>` inside an
 * attribute value, or markup inside comments / script / CDATA sections, is
 * not treated specially.
 */
class HTMLToXMLFilter extends php_user_filter
{
    /** Decode HTML entities */
    const ENTITIES = 1;

    /** Self close elements */
    const SELFCLOSE = 2;

    /**
     * Entities predefined by XML (entity => character).
     * References to these are left untouched by decode(), because replacing
     * them with the literal character would break the markup.
     *
     * @var array
     */
    public static $entitylist = array(
        '&quot;' => '"',
        '&amp;' => '&',
        '&lt;' => '<',
        '&gt;' => '>',
        '&apos;' => "'",
    );

    /**
     * HTML elements that should self close
     * @var array
     */
    public static $closeTags = array(
        'area',
        'base',
        'br',
        'col',
        'command',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
    );

    /**
     * Bitmask of self::ENTITIES and self::SELFCLOSE
     * @var int
     */
    protected $mode;

    /**
     * Trailing part of the previous bucket that ended in the middle of a tag
     * or entity. It is prepended to the next bucket (or flushed on close).
     *
     * @var string
     */
    protected $pending = '';

    /**
     * Called when the filter is created. Selects the mode from the filter name.
     *
     * @return boolean  false for an unknown filter name
     */
    #[\ReturnTypeWillChange]
    public function onCreate()
    {
        $this->pending = '';

        switch ($this->filtername) {
            case 'htmltoxml':
                $this->mode = self::ENTITIES | self::SELFCLOSE;
                break;
            case 'htmltoxml.entities':
                $this->mode = self::ENTITIES;
                break;
            case 'htmltoxml.selfclose':
                $this->mode = self::SELFCLOSE;
                break;
            default:
                return false;
        }

        return true;
    }

    /**
     * Filter function.
     *
     * A tag or entity that is cut off at the end of a bucket is held back and
     * converted together with the next bucket, so a construct that spans two
     * buckets is not mangled. Held back data is flushed when the stream closes.
     *
     * @param resource $in
     * @param resource $out
     * @param int      $consumed
     * @param bool     $closing
     * @return int  PSFS_PASS_ON if data was passed on, PSFS_FEED_ME otherwise
     */
    #[\ReturnTypeWillChange]
    public function filter($in, $out, &$consumed, $closing)
    {
        $passedOn = false;

        while ($bucket = stream_bucket_make_writeable($in)) {
            // Count the raw input length, before the data is rewritten
            $consumed += $bucket->datalen;

            $data = $this->pending . $bucket->data;
            $this->pending = '';

            // On close there is no next bucket to wait for
            if (!$closing) {
                list($data, $this->pending) = $this->splitIncomplete($data);
            }

            if ($data === '') {
                continue;
            }

            $bucket->data = $this->convert($data);
            stream_bucket_append($out, $bucket);
            $passedOn = true;
        }

        // Flush what is still held back when the stream ends
        if ($closing && $this->pending !== '') {
            $bucket = stream_bucket_new($this->stream, $this->convert($this->pending));
            $this->pending = '';

            stream_bucket_append($out, $bucket);
            $passedOn = true;
        }

        return $passedOn ? PSFS_PASS_ON : PSFS_FEED_ME;
    }

    /**
     * Split off a tag or entity that is cut off at the end of the data.
     *
     * @param string $data
     * @return array  array(complete data, incomplete tail)
     */
    protected function splitIncomplete($data)
    {
        $tails = array();

        // An opening '<' without a closing '>'
        if ($this->mode & self::SELFCLOSE) {
            $tails[] = '<[^>]*+';
        }

        // An '&' followed by name characters only, without the ';'.
        // Length is capped so a stray '&' in front of a long word isn't held back.
        if ($this->mode & self::ENTITIES) {
            $tails[] = '&#?[0-9A-Za-z]{0,40}+';
        }

        if (!$tails || !preg_match('~(?:' . join('|', $tails) . ')\z~', $data, $match, PREG_OFFSET_CAPTURE)) {
            return array($data, '');
        }

        $offset = $match[0][1];

        return array((string)substr($data, 0, $offset), (string)substr($data, $offset));
    }

    /**
     * Apply the conversions selected by the mode to a piece of HTML.
     *
     * @param string $data
     * @return string
     */
    protected function convert($data)
    {
        // Decode HTML entities
        if ($this->mode & self::ENTITIES) {
            // Matches a numeric reference, a named entity or a bare ampersand
            $entity_regex = '/&(?:#[0-9]+;|#[xX][0-9A-Fa-f]+;|[A-Za-z][A-Za-z0-9]*;)?/';
            $result = preg_replace_callback($entity_regex, array($this, 'decode'), $data);

            // preg_replace_callback() returns null on failure; keep the data in that case
            if ($result !== null) {
                $data = $result;
            }
        }

        // Self close HTML elements
        if ($this->mode & self::SELFCLOSE) {
            $close_regex = $this->selfCloseRegex();

            if ($close_regex !== null) {
                $result = preg_replace_callback($close_regex, array($this, 'selfClose'), $data);

                if ($result !== null) {
                    $data = $result;
                }
            }
        }

        return $data;
    }

    /**
     * Build the regex that matches an open tag of one of the $closeTags
     * elements that is not self closed yet.
     *
     * @return string|null  null if there are no elements to self close
     */
    protected function selfCloseRegex()
    {
        if (empty(static::$closeTags)) {
            return null;
        }

        $names = array();
        foreach (static::$closeTags as $tag) {
            $names[] = preg_quote($tag, '~');
        }

        // The names are grouped so '<' applies to each of them. The lookahead
        // stops 'base' from matching '<basefont'. Attributes are optional and
        // the lookbehind skips tags that already end in '/>'.
        return '~<(?:' . join('|', $names) . ')(?=[\s/>])[^>]*+(?<!/)>~i';
    }

    /**
     * Replace callback to decode HTML entities
     *
     * @param array $matches
     * @return string
     */
    public function decode($matches)
    {
        $entity = $matches[0];

        // A bare ampersand is not allowed in XML
        if ($entity === '&') {
            return '&amp;';
        }

        // Numeric character references are valid XML as they are
        if (strncmp($entity, '&#', 2) === 0) {
            return $entity;
        }

        // Entities that XML knows must stay encoded
        if (isset(static::$entitylist[$entity])) {
            return $entity;
        }

        // Output is assumed to be UTF-8
        $decoded = html_entity_decode($entity, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Not a known HTML entity; XML would reject the reference, so escape it
        if ($decoded === $entity) {
            return '&amp;' . substr($entity, 1);
        }

        // Aliases like &AMP; or &LT; decode to a character that needs escaping in XML
        return htmlspecialchars($decoded, ENT_QUOTES, 'UTF-8');
    }

    /**
     * Self close element
     *
     * @param array $matches
     * @return string
     */
    public function selfClose($matches)
    {
        // Replace the trailing '>' by '/>'
        return substr($matches[0], 0, -1) . '/>';
    }
}
// Register the filter under its plain name and under the "htmltoxml.*"
// wildcard, so "htmltoxml.entities" and "htmltoxml.selfclose" resolve to the
// same class (the class picks its mode from the filter name).
stream_filter_register("htmltoxml", "HTMLToXMLFilter");
stream_filter_register("htmltoxml.*", "HTMLToXMLFilter");
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment