Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@jasny
Created June 26, 2014 10:41
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jasny/4ee1f046400e672e2d39 to your computer and use it in GitHub Desktop.
Save jasny/4ee1f046400e672e2d39 to your computer and use it in GitHub Desktop.
Stream filter to convert HTML5 to XML
<?php
/**
 * Stream filter to convert HTML5 to XML.
 *
 * The filter can be attached under three names:
 *  - "htmltoxml"           decode entities and self close void elements
 *  - "htmltoxml.entities"  only decode entities
 *  - "htmltoxml.selfclose" only self close void elements
 *
 * <code>
 * $dsn = "php://filter/read=htmltoxml/resource=" . $url;
 * $xml = XMLReader::open($dsn);
 * </code>
 *
 * A tag or entity that is cut in half by a bucket boundary is held back and
 * converted once the rest of it has arrived (or when the stream is closing).
 */
class HTMLToXMLFilter extends php_user_filter
{
    /** Decode HTML entities */
    const ENTITIES = 1;

    /** Self close elements */
    const SELFCLOSE = 2;

    /**
     * XML entity list (character => predefined XML entity).
     * These entities are valid in XML and are never decoded.
     *
     * @var array
     */
    public static $entitylist = array(
        '"' => '&quot;',
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        "'" => '&apos;',
    );

    /**
     * HTML elements that should self close
     *
     * @var array
     */
    public static $closeTags = array(
        'area',
        'base',
        'br',
        'col',
        'command',
        'embed',
        'hr',
        'img',
        'input',
        'keygen',
        'link',
        'meta',
        'param',
        'source',
        'track',
        'wbr'
    );

    /**
     * Bitmask of self::ENTITIES and self::SELFCLOSE
     *
     * @var int
     */
    protected $mode;

    /**
     * Trailing data of the previous bucket that ended in the middle of a tag or entity
     *
     * @var string
     */
    protected $buffer = '';

    /**
     * Called when the filter is created.
     * Selects the conversion mode based on the name the filter was attached with.
     *
     * @return boolean  false if the filter name is not supported
     */
    #[\ReturnTypeWillChange]
    public function onCreate()
    {
        switch ($this->filtername) {
            case 'htmltoxml':
                $this->mode = self::ENTITIES | self::SELFCLOSE;
                break;
            case 'htmltoxml.entities':
                $this->mode = self::ENTITIES;
                break;
            case 'htmltoxml.selfclose':
                $this->mode = self::SELFCLOSE;
                break;
            default:
                return false;
        }

        return true;
    }

    /**
     * Filter function
     *
     * @param resource $in        Incoming bucket brigade
     * @param resource $out       Outgoing bucket brigade
     * @param int      $consumed  Incremented with the number of bytes taken from $in
     * @param bool     $closing   True when the stream is flushing for the last time
     * @return int  PSFS_PASS_ON if data was passed on, PSFS_FEED_ME if more data is needed first
     */
    #[\ReturnTypeWillChange]
    public function filter($in, $out, &$consumed, $closing)
    {
        while ($bucket = stream_bucket_make_writeable($in)) {
            $this->buffer .= $bucket->data;
            $consumed += $bucket->datalen;
        }

        if ($closing) {
            // Nothing more will follow, so whatever is left has to be converted as is
            $ready = $this->buffer;
            $this->buffer = '';
        } else {
            list($ready, $this->buffer) = $this->splitIncomplete($this->buffer);
        }

        if ($ready === '') {
            return $closing ? PSFS_PASS_ON : PSFS_FEED_ME;
        }

        stream_bucket_append($out, stream_bucket_new($this->stream, $this->convert($ready)));

        return PSFS_PASS_ON;
    }

    /**
     * Split data into the part that can be converted now and an incomplete trailing tag or entity.
     *
     * @param string $data
     * @return array  array(convertible data, held back remainder)
     */
    protected function splitIncomplete($data)
    {
        $cut = strlen($data);
        $open = strrpos($data, '<');

        if ($open !== false && strpos($data, '>', $open) === false) {
            // The last tag has not been closed yet
            $cut = $open;
        } elseif (preg_match('/&#?\w{0,32}\z/', $data, $match)) {
            // Data ends with what may be the start of an entity
            $cut -= strlen($match[0]);
        }

        return array((string)substr($data, 0, $cut), (string)substr($data, $cut));
    }

    /**
     * Apply the conversions selected by the filter mode.
     *
     * @param string $data
     * @return string
     */
    protected function convert($data)
    {
        // Decode HTML entities
        if ($this->mode & self::ENTITIES) {
            $entity_regex = '/&(?:#\d+;|#x[0-9a-f]+;|\w+;)?/i';
            $result = preg_replace_callback($entity_regex, array($this, 'decode'), $data);

            // preg_replace_callback returns null on error; keep the data untouched in that case
            if ($result !== null) {
                $data = $result;
            }
        }

        // Self close HTML elements
        if ($this->mode & self::SELFCLOSE) {
            $names = array();
            foreach (static::$closeTags as $tag) {
                $names[] = preg_quote($tag, '~');
            }

            // The tag names must be grouped, otherwise '<' only belongs to the first alternative.
            // Attributes are optional and a tag that already ends with '/>' is left alone.
            $close_regex = '~<(?:' . join('|', $names) . ')(?:\s[^>]*+)?(?<!/)>~i';
            $result = preg_replace_callback($close_regex, array($this, 'selfClose'), $data);

            if ($result !== null) {
                $data = $result;
            }
        }

        return $data;
    }

    /**
     * Replace callback to decode HTML entities
     *
     * @param array $matches
     * @return string
     */
    public function decode($matches)
    {
        $entity = $matches[0];

        // A bare ampersand is not allowed in XML
        if ($entity === '&') {
            return '&amp;';
        }

        // Numeric character references are valid XML as they are
        if ($entity[1] === '#') {
            return $entity;
        }

        // Predefined XML entities must stay encoded
        if (in_array($entity, static::$entitylist, true)) {
            return $entity;
        }

        $decoded = html_entity_decode($entity, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Not a known HTML entity; XML would reject it, so turn it into literal text
        if ($decoded === $entity) {
            return '&amp;' . substr($entity, 1);
        }

        // HTML5 has aliases like &LT; that decode to characters which need escaping in XML
        return strtr($decoded, static::$entitylist);
    }

    /**
     * Self close element
     *
     * @param array $matches
     * @return string
     */
    public function selfClose($matches)
    {
        // Replace the trailing '>' with '/>'
        return substr($matches[0], 0, -1) . '/>';
    }
}
// Make the filter available under its plain name and under the wildcard
// that covers the "htmltoxml.<mode>" variants.
array_map(function ($filterName) {
    stream_filter_register($filterName, "HTMLToXMLFilter");
}, array("htmltoxml", "htmltoxml.*"));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment