kaz29/HTMLScraping.class.php.fileschema.patch

## HTMLScraping.class.php.fileschema.patch
228c228
<         if (!preg_match('/^(https?|file):\/\/+/i', $url)) {
---
>         if (!preg_match('/^https?:\/\/\w[\w\-\.]+/i', $url)) {
231d230
<
233,434c232,239
<         if (preg_match('/^https?:\/\/\w[\w\-\.]+/i', $url)) {
<           $cache_lifetime = (int) $cache_lifetime;
<           $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0;
<           if ($use_cache) {
<               $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
<               $params = array();
<               foreach ($headers as $key => $value) {
<                   if (!empty($value)) {
<                       $params[] = urlencode($key).'='.urlencode($value);
<                   }
<               }
<               foreach ($post as $key => $value) {
<                   $params[] = urlencode($key).'='.urlencode($value);
<               }
<               $cache_id = "$url?".implode('&', $params);
<               if (false !== $data = $cache->get($cache_id)) {
<                   $data = unserialize($data);
<               }
<           }
<           /*
<            * Access to the URL if not cached
<            * or if the cache has either Last-Modified or Etag header
<            * and conditional request is specified.
<            */
<           if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) {
<               $conditional_request = false;
<           }
<           if (!$data or $conditional_request) {
<               if (isset($data['headers']['last-modified'])
<                   and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
<                   $headers['last-modified'] = $data['headers']['last-modified'];
<               }
<               if (isset($data['headers']['etag'])
<                   and (!isset($headers['etag']) or empty($headers['etag']))) {
<                   $headers['etag'] = $data['headers']['etag'];
<               }
<               try {
<                   $response = $this->getHttpResponse($url, $headers, $post);
<               } catch (Exception $e) {
<                   if (!$data) {
<                       throw $e;
<                   }
<               }
<               /*
<                * Use cache if the responded HTTP status code is 304.
<                * If 200, format the responded HTML of the given URL to XHTML.
<                */
<               if (!$data or (isset($response['code']) and $response['code'] != 304)) {
<                   $data =& $response;
<                   /*
<                    * If status code was 200 and Content-Type was not (X)HTML,
<                    * the status code was forcibly altered to 204.
<                    * @see HTTP_Request_Listener_Extended->update().
<                    */
<                   if ($data['code'] != 200 and $data['code'] != 204) {
<                       throw new Exception("Responded HTTP Status Code is $data[code].");
<                   } elseif (isset($data['headers']['content-type'])
<                       and !preg_match('/^(?:text|application)\/x?html\b/', $data['headers']['content-type'])) {
<                       throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
<                   } elseif (empty($data['body'])) {
<                       throw new Exception("Responded entity body is empty.");
<                   } elseif (!preg_match('/<\w+[^>]*?>/', $data['body'], $matches)) {
<                       throw new Exception("Responded entity body does not contain a markup symbol.");
<                   } elseif (false !== strpos($matches[0], "\x0")) {
<                       throw new Exception("Responded entity body contains NULL.");
<                   }
<
<                   $this->analyzeBody($data) ;
<
<                   if ($use_cache) {
<                       $cache->save(serialize($data), $cache_id);
<                   }
<               }
<           }
<         } else {
<           $path = substr($url,7);
<           if ( !file_exists($path) || !is_readable($path) ) {
<             throw new Exception("File not exists.");
<           }
<
<           $data = array() ;
<           $data['url'] = $url ;
<           $data['body'] = file_get_contents($path);
<
<           $this->analyzeBody($data) ;
<         }
<
<         return $data;
<     }
<
<     private function analyzeBody(&$data)
<     {
<           /*
<            * Remove BOM and NULLs.
<            */
<           $data['body'] = preg_replace('/^\xef\xbb\xbf/', '' , $data['body']);
<           $data['body'] = str_replace("\x0", '', $data['body']);
<           /*
<            * Initialize the backups.
<            */
<           $this->backup = array();
<           $this->backup_count = 0;
<         /*
<          * Removing SCRIPT and STYLE is recommended.
<          * The following substitute code will capsulate the content of the tags in CDATA.
<          * If use it, be sure that some JavaScript method such as document.write
<          * is not compliant with XHTML/XML.
<          */
<         $tags = array('script', 'style');
<         foreach ($tags as $tag) {
<             $data['body'] = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $data['body']);
<             /*
<             $data['body'] = preg_replace_callback(
<                 "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si",
<                 create_function('$matches', '
<                     $content = trim($matches[2]);
<                     if (empty($content)
<                         or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) {
<                         return $matches[0];
<                     } else {
<                         $content = preg_replace("/^<!-+/", "", $content);
<                         $content = preg_replace("/-+>$/", "", $content);
<                         $content = preg_replace("/\s*\/\/$/s", "", trim($content));
<                         return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]";
<                     }
<                 '),
<                 $data['body']
<             );
<             */
<         }
<         /*
<          * Backup CDATA sections for later process.
<          */
<         $data['body'] = preg_replace_callback(
<             '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $data['body']
<         );
<         /*
<          * Comment section must not contain two or more adjacent hyphens.
<          */
<         $data['body'] = preg_replace_callback(
<             '/<!--(.*?)-->/si',
<             create_function('$matches', '
<                 return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->";
<             '),
<             $data['body']
<         );
<         /*
<          * Backup comment sections for later process.
<          */
<         $data['body'] = preg_replace_callback(
<             '/<!--.*?-->/s', array($this, 'backup'), $data['body']
<         );
<         /*
<          * Process tags that is potentially dangerous for XML parsers.
<          */
<         $data['body'] = preg_replace_callback(
<             '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si',
<             create_function('$matches', '
<                 return $matches[1].str_replace("<", "&lt;", $matches[2]).$matches[3];
<             '),
<             $data['body']
<         );
<         $data['body'] = preg_replace_callback(
<             '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si',
<             create_function('$matches', '
<                 return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
<             '),
<             $data['body']
<         );
<         $data['body'] = preg_replace_callback(
<             '/<plaintext\b[^>]*?>(.*)$/si',
<             create_function('$matches', '
<                 return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
<             '),
<             $data['body']
<         );
<         /*
<          * Remove DTD declarations, wrongly placed comments etc.
<          * This must be done before removing DOCTYPE.
<          */
<         $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
<         /*
<          * XML and DOCTYPE declaration will be replaced.
<          */
<         $data['body'] = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $data['body']);
<         $data['body'] = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $data['body']);
<         if (preg_match('/^\s*$/s', $data['body'])) {
<             throw new Exception('The entity body became empty after preprocessing.');
<         }
<         /*
<          * Detect character encoding and convert to UTF-8.
<          */
<         $encoding = false;
<         if (isset($data['headers']['content-type'])) {
<             $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
<         }
<         if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $data['body'], $matches)) {
<             foreach ($matches[0] as $value) {
<                 if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type'
<                     and false !== $encoding = $this->getAttribute('content', $value)) {
<                     $encoding = $this->getCharsetFromCType($encoding);
<                     break;
---
>         $cache_lifetime = (int) $cache_lifetime;
>         $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0;
>         if ($use_cache) {
>             $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
>             $params = array();
>             foreach ($headers as $key => $value) {
>                 if (!empty($value)) {
>                     $params[] = urlencode($key).'='.urlencode($value);
436a242,248
>             foreach ($post as $key => $value) {
>                 $params[] = urlencode($key).'='.urlencode($value);
>             }
>             $cache_id = "$url?".implode('&', $params);
>             if (false !== $data = $cache->get($cache_id)) {
>                 $data = unserialize($data);
>             }
439,441c251,253
<          * Use mbstring to convert character encoding if available.
<          * Otherwise use iconv (iconv may try to detect character encoding automatically).
<          * Do not trust the declared encoding and do conversion even if UTF-8.
---
>          * Access to the URL if not cached
>          * or if the cache has either Last-Modified or Etag header
>          * and conditional request is specified.
443,448c255,261
<         if (extension_loaded('mbstring')) {
<             if (!$encoding) {
<                 @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
<                 if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body']))) {
<                     throw new Exception('Failed detecting character encoding.');
<                 }
---
>         if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) {
>             $conditional_request = false;
>         }
>         if (!$data or $conditional_request) {
>             if (isset($data['headers']['last-modified'])
>                 and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
>                 $headers['last-modified'] = $data['headers']['last-modified'];
450,453c263,265
<             @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
<         } else {
<             if (false === $data['body'] = @iconv($encoding, 'UTF-8', $data['body'])) {
<                 throw new Exception('Failed converting character encoding.');
---
>             if (isset($data['headers']['etag'])
>                 and (!isset($headers['etag']) or empty($headers['etag']))) {
>                 $headers['etag'] = $data['headers']['etag'];
455,457c267,467
<             foreach ($this->backup as $key => $value) {
<                 if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) {
<                     throw new Exception('Failed converting character encoding.');
---
>             try {
>                 $response = $this->getHttpResponse($url, $headers, $post);
>             } catch (Exception $e) {
>                 if (!$data) {
>                     throw $e;
>                 }
>             }
>             /*
>              * Use cache if the responded HTTP status code is 304.
>              * If 200, format the responded HTML of the given URL to XHTML.
>              */
>             if (!$data or (isset($response['code']) and $response['code'] != 304)) {
>                 $data =& $response;
>                 /*
>                  * If status code was 200 and Content-Type was not (X)HTML,
>                  * the status code was forcibly altered to 204.
>                  * @see HTTP_Request_Listener_Extended->update().
>                  */
>                 if ($data['code'] != 200 and $data['code'] != 204) {
>                     throw new Exception("Responded HTTP Status Code is $data[code].");
>                 } elseif (isset($data['headers']['content-type'])
>                     and !preg_match('/^(?:text|application)\/x?html\b/', $data['headers']['content-type'])) {
>                     throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
>                 } elseif (empty($data['body'])) {
>                     throw new Exception("Responded entity body is empty.");
>                 } elseif (!preg_match('/<\w+[^>]*?>/', $data['body'], $matches)) {
>                     throw new Exception("Responded entity body does not contain a markup symbol.");
>                 } elseif (false !== strpos($matches[0], "\x0")) {
>                     throw new Exception("Responded entity body contains NULL.");
>                 }
>                 /*
>                  * Remove BOM and NULLs.
>                  */
>                 $data['body'] = preg_replace('/^\xef\xbb\xbf/', '' , $data['body']);
>                 $data['body'] = str_replace("\x0", '', $data['body']);
>                 /*
>                  * Initialize the backups.
>                  */
>                 $this->backup = array();
>                 $this->backup_count = 0;
>                 /*
>                  * Removing SCRIPT and STYLE is recommended.
>                  * The following substitute code will capsulate the content of the tags in CDATA.
>                  * If use it, be sure that some JavaScript method such as document.write
>                  * is not compliant with XHTML/XML.
>                  */
>                 $tags = array('script', 'style');
>                 foreach ($tags as $tag) {
>                     $data['body'] = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $data['body']);
>                     /*
>                     $data['body'] = preg_replace_callback(
>                         "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si",
>                         create_function('$matches', '
>                             $content = trim($matches[2]);
>                             if (empty($content)
>                                 or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) {
>                                 return $matches[0];
>                             } else {
>                                 $content = preg_replace("/^<!-+/", "", $content);
>                                 $content = preg_replace("/-+>$/", "", $content);
>                                 $content = preg_replace("/\s*\/\/$/s", "", trim($content));
>                                 return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]";
>                             }
>                         '),
>                         $data['body']
>                     );
>                     */
>                 }
>                 /*
>                  * Backup CDATA sections for later process.
>                  */
>                 $data['body'] = preg_replace_callback(
>                     '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $data['body']
>                 );
>                 /*
>                  * Comment section must not contain two or more adjacent hyphens.
>                  */
>                 $data['body'] = preg_replace_callback(
>                     '/<!--(.*?)-->/si',
>                     create_function('$matches', '
>                         return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->";
>                     '),
>                     $data['body']
>                 );
>                 /*
>                  * Backup comment sections for later process.
>                  */
>                 $data['body'] = preg_replace_callback(
>                     '/<!--.*?-->/s', array($this, 'backup'), $data['body']
>                 );
>                 /*
>                  * Process tags that is potentially dangerous for XML parsers.
>                  */
>                 $data['body'] = preg_replace_callback(
>                     '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si',
>                     create_function('$matches', '
>                         return $matches[1].str_replace("<", "&lt;", $matches[2]).$matches[3];
>                     '),
>                     $data['body']
>                 );
>                 $data['body'] = preg_replace_callback(
>                     '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si',
>                     create_function('$matches', '
>                         return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
>                     '),
>                     $data['body']
>                 );
>                 $data['body'] = preg_replace_callback(
>                     '/<plaintext\b[^>]*?>(.*)$/si',
>                     create_function('$matches', '
>                         return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
>                     '),
>                     $data['body']
>                 );
>                 /*
>                  * Remove DTD declarations, wrongly placed comments etc.
>                  * This must be done before removing DOCTYPE.
>                  */
>                 $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
>                 /*
>                  * XML and DOCTYPE declaration will be replaced.
>                  */
>                 $data['body'] = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $data['body']);
>                 $data['body'] = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $data['body']);
>                 if (preg_match('/^\s*$/s', $data['body'])) {
>                     throw new Exception('The entity body became empty after preprocessing.');
>                 }
>                 /*
>                  * Detect character encoding and convert to UTF-8.
>                  */
>                 $encoding = false;
>                 if (isset($data['headers']['content-type'])) {
>                     $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
>                 }
>                 if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $data['body'], $matches)) {
>                     foreach ($matches[0] as $value) {
>                         if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type'
>                             and false !== $encoding = $this->getAttribute('content', $value)) {
>                             $encoding = $this->getCharsetFromCType($encoding);
>                             break;
>                         }
>                     }
>                 }
>                 /*
>                  * Use mbstring to convert character encoding if available.
>                  * Otherwise use iconv (iconv may try to detect character encoding automatically).
>                  * Do not trust the declared encoding and do conversion even if UTF-8.
>                  */
>                 if (extension_loaded('mbstring')) {
>                     if (!$encoding) {
>                         @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
>                         if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body']))) {
>                             throw new Exception('Failed detecting character encoding.');
>                         }
>                     }
>                     @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
>                 } else {
>                     if (false === $data['body'] = @iconv($encoding, 'UTF-8', $data['body'])) {
>                         throw new Exception('Failed converting character encoding.');
>                     }
>                     foreach ($this->backup as $key => $value) {
>                         if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) {
>                             throw new Exception('Failed converting character encoding.');
>                         }
>                     }
>                 }
>                 /*
>                  * Restore CDATAs and comments.
>                  */
>                 for ($i = 0; $i < $this->backup_count; $i++) {
>                     $data['body'] = str_replace("<restore count=\"$i\" />", $this->backup[$i], $data['body']);
>                 }
>                 /*
>                  * Use Tidy to format HTML if available.
>                  * Otherwise, use HTMLParser class (is slower and consumes much memory).
>                  */
>                 if (extension_loaded('tidy')) {
>                     $tidy = new tidy;
>                     $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
>                     $tidy->cleanRepair();
>                     $data['body'] = $tidy->html();
>                 } else {
>                     require_once 'HTMLParser.class.php';
>                     $parser = new HTMLParser;
>                     $format_rule = require 'xhtml1-transitional_dtd.inc.php';
>                     $parser->setRule($format_rule);
>                     $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
>                     $parser->setGenericParent('body');
>                     $parser->parse($data['body']);
>                     $data['body'] = $parser->dump();
>                 }
>                 /*
>                  * Valid XHTML DOCTYPE declaration (with DTD URI) is required
>                  * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
>                  */
>                 $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
>                 $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
>                 $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
>                 $data['body'] = "$declarations$data[body]";
>                 if ($use_cache) {
>                     $cache->save(serialize($data), $cache_id);
461,495c471
<         /*
<          * Restore CDATAs and comments.
<          */
<         for ($i = 0; $i < $this->backup_count; $i++) {
<             $data['body'] = str_replace("<restore count=\"$i\" />", $this->backup[$i], $data['body']);
<         }
<         /*
<          * Use Tidy to format HTML if available.
<          * Otherwise, use HTMLParser class (is slower and consumes much memory).
<          */
<         if (extension_loaded('tidy')) {
<             $tidy = new tidy;
<             $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
<             $tidy->cleanRepair();
<             $data['body'] = $tidy->html();
<         } else {
<             require_once 'HTMLParser.class.php';
<             $parser = new HTMLParser;
<             $format_rule = require 'xhtml1-transitional_dtd.inc.php';
<             $parser->setRule($format_rule);
<             $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
<             $parser->setGenericParent('body');
<             $parser->parse($data['body']);
<             $data['body'] = $parser->dump();
<         }
<         /*
<          * Valid XHTML DOCTYPE declaration (with DTD URI) is required
<          * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
<          */
<         $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
<         $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
<         $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
<         $data['body'] = "$declarations$data[body]";
<
<         return $data ;
---
>         return $data;
496a473
>
	228c228
	< if (!preg_match('/^(https?|file):\/\/+/i', $url)) {
	---
	> if (!preg_match('/^https?:\/\/\w[\w\-\.]+/i', $url)) {
	231d230
	<
	233,434c232,239
	< if (preg_match('/^https?:\/\/\w[\w\-\.]+/i', $url)) {
	< $cache_lifetime = (int) $cache_lifetime;
	< $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0;
	< if ($use_cache) {
	< $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
	< $params = array();
	< foreach ($headers as $key => $value) {
	< if (!empty($value)) {
	< $params[] = urlencode($key).'='.urlencode($value);
	< }
	< }
	< foreach ($post as $key => $value) {
	< $params[] = urlencode($key).'='.urlencode($value);
	< }
	< $cache_id = "$url?".implode('&', $params);
	< if (false !== $data = $cache->get($cache_id)) {
	< $data = unserialize($data);
	< }
	< }
	< /*
	< * Access to the URL if not cached
	< * or if the cache has either Last-Modified or Etag header
	< * and conditional request is specified.
	< */
	< if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) {
	< $conditional_request = false;
	< }
	< if (!$data or $conditional_request) {
	< if (isset($data['headers']['last-modified'])
	< and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
	< $headers['last-modified'] = $data['headers']['last-modified'];
	< }
	< if (isset($data['headers']['etag'])
	< and (!isset($headers['etag']) or empty($headers['etag']))) {
	< $headers['etag'] = $data['headers']['etag'];
	< }
	< try {
	< $response = $this->getHttpResponse($url, $headers, $post);
	< } catch (Exception $e) {
	< if (!$data) {
	< throw $e;
	< }
	< }
	< /*
	< * Use cache if the responded HTTP status code is 304.
	< * If 200, format the responded HTML of the given URL to XHTML.
	< */
	< if (!$data or (isset($response['code']) and $response['code'] != 304)) {
	< $data =& $response;
	< /*
	< * If status code was 200 and Content-Type was not (X)HTML,
	< * the status code was forcibly altered to 204.
	< * @see HTTP_Request_Listener_Extended->update().
	< */
	< if ($data['code'] != 200 and $data['code'] != 204) {
	< throw new Exception("Responded HTTP Status Code is $data[code].");
	< } elseif (isset($data['headers']['content-type'])
	< and !preg_match('/^(?:text|application)\/x?html\b/', $data['headers']['content-type'])) {
	< throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
	< } elseif (empty($data['body'])) {
	< throw new Exception("Responded entity body is empty.");
	< } elseif (!preg_match('/<\w+[^>]*?>/', $data['body'], $matches)) {
	< throw new Exception("Responded entity body does not contain a markup symbol.");
	< } elseif (false !== strpos($matches[0], "\x0")) {
	< throw new Exception("Responded entity body contains NULL.");
	< }
	<
	< $this->analyzeBody($data) ;
	<
	< if ($use_cache) {
	< $cache->save(serialize($data), $cache_id);
	< }
	< }
	< }
	< } else {
	< $path = substr($url,7);
	< if ( !file_exists($path) || !is_readable($path) ) {
	< throw new Exception("File not exists.");
	< }
	<
	< $data = array() ;
	< $data['url'] = $url ;
	< $data['body'] = file_get_contents($path);
	<
	< $this->analyzeBody($data) ;
	< }
	<
	< return $data;
	< }
	<
	< private function analyzeBody(&$data)
	< {
	< /*
	< * Remove BOM and NULLs.
	< */
	< $data['body'] = preg_replace('/^\xef\xbb\xbf/', '' , $data['body']);
	< $data['body'] = str_replace("\x0", '', $data['body']);
	< /*
	< * Initialize the backups.
	< */
	< $this->backup = array();
	< $this->backup_count = 0;
	< /*
	< * Removing SCRIPT and STYLE is recommended.
	< * The following substitute code will capsulate the content of the tags in CDATA.
	< * If use it, be sure that some JavaScript method such as document.write
	< * is not compliant with XHTML/XML.
	< */
	< $tags = array('script', 'style');
	< foreach ($tags as $tag) {
	< $data['body'] = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $data['body']);
	< /*
	< $data['body'] = preg_replace_callback(
	< "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si",
	< create_function('$matches', '
	< $content = trim($matches[2]);
	< if (empty($content)
	< or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) {
	< return $matches[0];
	< } else {
	< $content = preg_replace("/^<!-+/", "", $content);
	< $content = preg_replace("/-+>$/", "", $content);
	< $content = preg_replace("/\s*\/\/$/s", "", trim($content));
	< return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]";
	< }
	< '),
	< $data['body']
	< );
	< */
	< }
	< /*
	< * Backup CDATA sections for later process.
	< */
	< $data['body'] = preg_replace_callback(
	< '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $data['body']
	< );
	< /*
	< * Comment section must not contain two or more adjacent hyphens.
	< */
	< $data['body'] = preg_replace_callback(
	< '/<!--(.*?)-->/si',
	< create_function('$matches', '
	< return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->";
	< '),
	< $data['body']
	< );
	< /*
	< * Backup comment sections for later process.
	< */
	< $data['body'] = preg_replace_callback(
	< '/<!--.*?-->/s', array($this, 'backup'), $data['body']
	< );
	< /*
	< * Process tags that is potentially dangerous for XML parsers.
	< */
	< $data['body'] = preg_replace_callback(
	< '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si',
	< create_function('$matches', '
	< return $matches[1].str_replace("<", "&lt;", $matches[2]).$matches[3];
	< '),
	< $data['body']
	< );
	< $data['body'] = preg_replace_callback(
	< '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si',
	< create_function('$matches', '
	< return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
	< '),
	< $data['body']
	< );
	< $data['body'] = preg_replace_callback(
	< '/<plaintext\b[^>]*?>(.*)$/si',
	< create_function('$matches', '
	< return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
	< '),
	< $data['body']
	< );
	< /*
	< * Remove DTD declarations, wrongly placed comments etc.
	< * This must be done before removing DOCTYPE.
	< */
	< $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
	< /*
	< * XML and DOCTYPE declaration will be replaced.
	< */
	< $data['body'] = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $data['body']);
	< $data['body'] = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $data['body']);
	< if (preg_match('/^\s*$/s', $data['body'])) {
	< throw new Exception('The entity body became empty after preprocessing.');
	< }
	< /*
	< * Detect character encoding and convert to UTF-8.
	< */
	< $encoding = false;
	< if (isset($data['headers']['content-type'])) {
	< $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
	< }
	< if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $data['body'], $matches)) {
	< foreach ($matches[0] as $value) {
	< if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type'
	< and false !== $encoding = $this->getAttribute('content', $value)) {
	< $encoding = $this->getCharsetFromCType($encoding);
	< break;
	---
	> $cache_lifetime = (int) $cache_lifetime;
	> $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0;
	> if ($use_cache) {
	> $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
	> $params = array();
	> foreach ($headers as $key => $value) {
	> if (!empty($value)) {
	> $params[] = urlencode($key).'='.urlencode($value);
	436a242,248
	> foreach ($post as $key => $value) {
	> $params[] = urlencode($key).'='.urlencode($value);
	> }
	> $cache_id = "$url?".implode('&', $params);
	> if (false !== $data = $cache->get($cache_id)) {
	> $data = unserialize($data);
	> }
	439,441c251,253
	< * Use mbstring to convert character encoding if available.
	< * Otherwise use iconv (iconv may try to detect character encoding automatically).
	< * Do not trust the declared encoding and do conversion even if UTF-8.
	---
	> * Access to the URL if not cached
	> * or if the cache has either Last-Modified or Etag header
	> * and conditional request is specified.
	443,448c255,261
	< if (extension_loaded('mbstring')) {
	< if (!$encoding) {
	< @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
	< if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body']))) {
	< throw new Exception('Failed detecting character encoding.');
	< }
	---
	> if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) {
	> $conditional_request = false;
	> }
	> if (!$data or $conditional_request) {
	> if (isset($data['headers']['last-modified'])
	> and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
	> $headers['last-modified'] = $data['headers']['last-modified'];
	450,453c263,265
	< @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
	< } else {
	< if (false === $data['body'] = @iconv($encoding, 'UTF-8', $data['body'])) {
	< throw new Exception('Failed converting character encoding.');
	---
	> if (isset($data['headers']['etag'])
	> and (!isset($headers['etag']) or empty($headers['etag']))) {
	> $headers['etag'] = $data['headers']['etag'];
	455,457c267,467
	< foreach ($this->backup as $key => $value) {
	< if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) {
	< throw new Exception('Failed converting character encoding.');
	---
	> try {
	> $response = $this->getHttpResponse($url, $headers, $post);
	> } catch (Exception $e) {
	> if (!$data) {
	> throw $e;
	> }
	> }
	> /*
	> * Use cache if the responded HTTP status code is 304.
	> * If 200, format the responded HTML of the given URL to XHTML.
	> */
	> if (!$data or (isset($response['code']) and $response['code'] != 304)) {
	> $data =& $response;
	> /*
	> * If status code was 200 and Content-Type was not (X)HTML,
	> * the status code was forcibly altered to 204.
	> * @see HTTP_Request_Listener_Extended->update().
	> */
	> if ($data['code'] != 200 and $data['code'] != 204) {
	> throw new Exception("Responded HTTP Status Code is $data[code].");
	> } elseif (isset($data['headers']['content-type'])
	> and !preg_match('/^(?:text|application)\/x?html\b/', $data['headers']['content-type'])) {
	> throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
	> } elseif (empty($data['body'])) {
	> throw new Exception("Responded entity body is empty.");
	> } elseif (!preg_match('/<\w+[^>]*?>/', $data['body'], $matches)) {
	> throw new Exception("Responded entity body does not contain a markup symbol.");
	> } elseif (false !== strpos($matches[0], "\x0")) {
	> throw new Exception("Responded entity body contains NULL.");
	> }
	> /*
	> * Remove BOM and NULLs.
	> */
	> $data['body'] = preg_replace('/^\xef\xbb\xbf/', '' , $data['body']);
	> $data['body'] = str_replace("\x0", '', $data['body']);
	> /*
	> * Initialize the backups.
	> */
	> $this->backup = array();
	> $this->backup_count = 0;
	> /*
	> * Removing SCRIPT and STYLE is recommended.
	> * The following substitute code will capsulate the content of the tags in CDATA.
	> * If use it, be sure that some JavaScript method such as document.write
	> * is not compliant with XHTML/XML.
	> */
	> $tags = array('script', 'style');
	> foreach ($tags as $tag) {
	> $data['body'] = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $data['body']);
	> /*
	> $data['body'] = preg_replace_callback(
	> "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si",
	> create_function('$matches', '
	> $content = trim($matches[2]);
	> if (empty($content)
	> or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) {
	> return $matches[0];
	> } else {
	> $content = preg_replace("/^<!-+/", "", $content);
	> $content = preg_replace("/-+>$/", "", $content);
	> $content = preg_replace("/\s*\/\/$/s", "", trim($content));
	> return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]";
	> }
	> '),
	> $data['body']
	> );
	> */
	> }
	> /*
	> * Backup CDATA sections for later process.
	> */
	> $data['body'] = preg_replace_callback(
	> '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $data['body']
	> );
	> /*
	> * Comment section must not contain two or more adjacent hyphens.
	> */
	> $data['body'] = preg_replace_callback(
	> '/<!--(.*?)-->/si',
	> create_function('$matches', '
	> return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->";
	> '),
	> $data['body']
	> );
	> /*
	> * Backup comment sections for later process.
	> */
	> $data['body'] = preg_replace_callback(
	> '/<!--.*?-->/s', array($this, 'backup'), $data['body']
	> );
	> /*
	> * Process tags that is potentially dangerous for XML parsers.
	> */
	> $data['body'] = preg_replace_callback(
	> '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si',
	> create_function('$matches', '
	> return $matches[1].str_replace("<", "&lt;", $matches[2]).$matches[3];
	> '),
	> $data['body']
	> );
	> $data['body'] = preg_replace_callback(
	> '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si',
	> create_function('$matches', '
	> return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
	> '),
	> $data['body']
	> );
	> $data['body'] = preg_replace_callback(
	> '/<plaintext\b[^>]*?>(.*)$/si',
	> create_function('$matches', '
	> return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
	> '),
	> $data['body']
	> );
	> /*
	> * Remove DTD declarations, wrongly placed comments etc.
	> * This must be done before removing DOCTYPE.
	> */
	> $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
	> /*
	> * XML and DOCTYPE declaration will be replaced.
	> */
	> $data['body'] = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $data['body']);
	> $data['body'] = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $data['body']);
	> if (preg_match('/^\s*$/s', $data['body'])) {
	> throw new Exception('The entity body became empty after preprocessing.');
	> }
	> /*
	> * Detect character encoding and convert to UTF-8.
	> */
	> $encoding = false;
	> if (isset($data['headers']['content-type'])) {
	> $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
	> }
	> if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $data['body'], $matches)) {
	> foreach ($matches[0] as $value) {
	> if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type'
	> and false !== $encoding = $this->getAttribute('content', $value)) {
	> $encoding = $this->getCharsetFromCType($encoding);
	> break;
	> }
	> }
	> }
	> /*
	> * Use mbstring to convert character encoding if available.
	> * Otherwise use iconv (iconv may try to detect character encoding automatically).
	> * Do not trust the declared encoding and do conversion even if UTF-8.
	> */
	> if (extension_loaded('mbstring')) {
	> if (!$encoding) {
	> @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
	> if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body']))) {
	> throw new Exception('Failed detecting character encoding.');
	> }
	> }
	> @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
	> } else {
	> if (false === $data['body'] = @iconv($encoding, 'UTF-8', $data['body'])) {
	> throw new Exception('Failed converting character encoding.');
	> }
	> foreach ($this->backup as $key => $value) {
	> if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) {
	> throw new Exception('Failed converting character encoding.');
	> }
	> }
	> }
	> /*
	> * Restore CDATAs and comments.
	> */
	> for ($i = 0; $i < $this->backup_count; $i++) {
	> $data['body'] = str_replace("<restore count=\"$i\" />", $this->backup[$i], $data['body']);
	> }
	> /*
	> * Use Tidy to format HTML if available.
	> * Otherwise, use HTMLParser class (is slower and consumes much memory).
	> */
	> if (extension_loaded('tidy')) {
	> $tidy = new tidy;
	> $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
	> $tidy->cleanRepair();
	> $data['body'] = $tidy->html();
	> } else {
	> require_once 'HTMLParser.class.php';
	> $parser = new HTMLParser;
	> $format_rule = require 'xhtml1-transitional_dtd.inc.php';
	> $parser->setRule($format_rule);
	> $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
	> $parser->setGenericParent('body');
	> $parser->parse($data['body']);
	> $data['body'] = $parser->dump();
	> }
	> /*
	> * Valid XHTML DOCTYPE declaration (with DTD URI) is required
	> * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
	> */
	> $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
	> $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
	> $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
	> $data['body'] = "$declarations$data[body]";
	> if ($use_cache) {
	> $cache->save(serialize($data), $cache_id);
	461,495c471
	< /*
	< * Restore CDATAs and comments.
	< */
	< for ($i = 0; $i < $this->backup_count; $i++) {
	< $data['body'] = str_replace("<restore count=\"$i\" />", $this->backup[$i], $data['body']);
	< }
	< /*
	< * Use Tidy to format HTML if available.
	< * Otherwise, use HTMLParser class (is slower and consumes much memory).
	< */
	< if (extension_loaded('tidy')) {
	< $tidy = new tidy;
	< $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
	< $tidy->cleanRepair();
	< $data['body'] = $tidy->html();
	< } else {
	< require_once 'HTMLParser.class.php';
	< $parser = new HTMLParser;
	< $format_rule = require 'xhtml1-transitional_dtd.inc.php';
	< $parser->setRule($format_rule);
	< $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
	< $parser->setGenericParent('body');
	< $parser->parse($data['body']);
	< $data['body'] = $parser->dump();
	< }
	< /*
	< * Valid XHTML DOCTYPE declaration (with DTD URI) is required
	< * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
	< */
	< $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
	< $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
	< $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
	< $data['body'] = "$declarations$data[body]";
	<
	< return $data ;
	---
	> return $data;
	496a473
	>