Skip to content

Instantly share code, notes, and snippets.

@kaz29
Created November 30, 2010 01:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kaz29/720986 to your computer and use it in GitHub Desktop.
Save kaz29/720986 to your computer and use it in GitHub Desktop.
HTMLScraping file:// URL scheme support patch (http://www.rcdtokyo.com/etc/htmlscraping/)
228c228
< if (!preg_match('/^(https?|file):\/\/+/i', $url)) {
---
> if (!preg_match('/^https?:\/\/\w[\w\-\.]+/i', $url)) {
231d230
<
233,434c232,239
< if (preg_match('/^https?:\/\/\w[\w\-\.]+/i', $url)) {
< $cache_lifetime = (int) $cache_lifetime;
< $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0;
< if ($use_cache) {
< $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
< $params = array();
< foreach ($headers as $key => $value) {
< if (!empty($value)) {
< $params[] = urlencode($key).'='.urlencode($value);
< }
< }
< foreach ($post as $key => $value) {
< $params[] = urlencode($key).'='.urlencode($value);
< }
< $cache_id = "$url?".implode('&', $params);
< if (false !== $data = $cache->get($cache_id)) {
< $data = unserialize($data);
< }
< }
< /*
< * Access to the URL if not cached
< * or if the cache has either Last-Modified or Etag header
< * and conditional request is specified.
< */
< if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) {
< $conditional_request = false;
< }
< if (!$data or $conditional_request) {
< if (isset($data['headers']['last-modified'])
< and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
< $headers['last-modified'] = $data['headers']['last-modified'];
< }
< if (isset($data['headers']['etag'])
< and (!isset($headers['etag']) or empty($headers['etag']))) {
< $headers['etag'] = $data['headers']['etag'];
< }
< try {
< $response = $this->getHttpResponse($url, $headers, $post);
< } catch (Exception $e) {
< if (!$data) {
< throw $e;
< }
< }
< /*
< * Use cache if the responded HTTP status code is 304.
< * If 200, format the responded HTML of the given URL to XHTML.
< */
< if (!$data or (isset($response['code']) and $response['code'] != 304)) {
< $data =& $response;
< /*
< * If status code was 200 and Content-Type was not (X)HTML,
< * the status code was forcibly altered to 204.
< * @see HTTP_Request_Listener_Extended->update().
< */
< if ($data['code'] != 200 and $data['code'] != 204) {
< throw new Exception("Responded HTTP Status Code is $data[code].");
< } elseif (isset($data['headers']['content-type'])
< and !preg_match('/^(?:text|application)\/x?html\b/', $data['headers']['content-type'])) {
< throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
< } elseif (empty($data['body'])) {
< throw new Exception("Responded entity body is empty.");
< } elseif (!preg_match('/<\w+[^>]*?>/', $data['body'], $matches)) {
< throw new Exception("Responded entity body does not contain a markup symbol.");
< } elseif (false !== strpos($matches[0], "\x0")) {
< throw new Exception("Responded entity body contains NULL.");
< }
<
< $this->analyzeBody($data) ;
<
< if ($use_cache) {
< $cache->save(serialize($data), $cache_id);
< }
< }
< }
< } else {
< $path = substr($url,7);
< if ( !file_exists($path) || !is_readable($path) ) {
< throw new Exception("File not exists.");
< }
<
< $data = array() ;
< $data['url'] = $url ;
< $data['body'] = file_get_contents($path);
<
< $this->analyzeBody($data) ;
< }
<
< return $data;
< }
<
< private function analyzeBody(&$data)
< {
< /*
< * Remove BOM and NULLs.
< */
< $data['body'] = preg_replace('/^\xef\xbb\xbf/', '' , $data['body']);
< $data['body'] = str_replace("\x0", '', $data['body']);
< /*
< * Initialize the backups.
< */
< $this->backup = array();
< $this->backup_count = 0;
< /*
< * Removing SCRIPT and STYLE is recommended.
< * The following substitute code will capsulate the content of the tags in CDATA.
< * If use it, be sure that some JavaScript method such as document.write
< * is not compliant with XHTML/XML.
< */
< $tags = array('script', 'style');
< foreach ($tags as $tag) {
< $data['body'] = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $data['body']);
< /*
< $data['body'] = preg_replace_callback(
< "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si",
< create_function('$matches', '
< $content = trim($matches[2]);
< if (empty($content)
< or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) {
< return $matches[0];
< } else {
< $content = preg_replace("/^<!-+/", "", $content);
< $content = preg_replace("/-+>$/", "", $content);
< $content = preg_replace("/\s*\/\/$/s", "", trim($content));
< return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]";
< }
< '),
< $data['body']
< );
< */
< }
< /*
< * Backup CDATA sections for later process.
< */
< $data['body'] = preg_replace_callback(
< '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $data['body']
< );
< /*
< * Comment section must not contain two or more adjacent hyphens.
< */
< $data['body'] = preg_replace_callback(
< '/<!--(.*?)-->/si',
< create_function('$matches', '
< return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->";
< '),
< $data['body']
< );
< /*
< * Backup comment sections for later process.
< */
< $data['body'] = preg_replace_callback(
< '/<!--.*?-->/s', array($this, 'backup'), $data['body']
< );
< /*
< * Process tags that is potentially dangerous for XML parsers.
< */
< $data['body'] = preg_replace_callback(
< '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si',
< create_function('$matches', '
< return $matches[1].str_replace("<", "&lt;", $matches[2]).$matches[3];
< '),
< $data['body']
< );
< $data['body'] = preg_replace_callback(
< '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si',
< create_function('$matches', '
< return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
< '),
< $data['body']
< );
< $data['body'] = preg_replace_callback(
< '/<plaintext\b[^>]*?>(.*)$/si',
< create_function('$matches', '
< return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
< '),
< $data['body']
< );
< /*
< * Remove DTD declarations, wrongly placed comments etc.
< * This must be done before removing DOCTYPE.
< */
< $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
< /*
< * XML and DOCTYPE declaration will be replaced.
< */
< $data['body'] = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $data['body']);
< $data['body'] = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $data['body']);
< if (preg_match('/^\s*$/s', $data['body'])) {
< throw new Exception('The entity body became empty after preprocessing.');
< }
< /*
< * Detect character encoding and convert to UTF-8.
< */
< $encoding = false;
< if (isset($data['headers']['content-type'])) {
< $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
< }
< if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $data['body'], $matches)) {
< foreach ($matches[0] as $value) {
< if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type'
< and false !== $encoding = $this->getAttribute('content', $value)) {
< $encoding = $this->getCharsetFromCType($encoding);
< break;
---
> $cache_lifetime = (int) $cache_lifetime;
> $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0;
> if ($use_cache) {
> $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime));
> $params = array();
> foreach ($headers as $key => $value) {
> if (!empty($value)) {
> $params[] = urlencode($key).'='.urlencode($value);
436a242,248
> foreach ($post as $key => $value) {
> $params[] = urlencode($key).'='.urlencode($value);
> }
> $cache_id = "$url?".implode('&', $params);
> if (false !== $data = $cache->get($cache_id)) {
> $data = unserialize($data);
> }
439,441c251,253
< * Use mbstring to convert character encoding if available.
< * Otherwise use iconv (iconv may try to detect character encoding automatically).
< * Do not trust the declared encoding and do conversion even if UTF-8.
---
> * Access to the URL if not cached
> * or if the cache has either Last-Modified or Etag header
> * and conditional request is specified.
443,448c255,261
< if (extension_loaded('mbstring')) {
< if (!$encoding) {
< @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
< if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body']))) {
< throw new Exception('Failed detecting character encoding.');
< }
---
> if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) {
> $conditional_request = false;
> }
> if (!$data or $conditional_request) {
> if (isset($data['headers']['last-modified'])
> and (!isset($headers['last-modified']) or empty($headers['last-modified']))) {
> $headers['last-modified'] = $data['headers']['last-modified'];
450,453c263,265
< @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
< } else {
< if (false === $data['body'] = @iconv($encoding, 'UTF-8', $data['body'])) {
< throw new Exception('Failed converting character encoding.');
---
> if (isset($data['headers']['etag'])
> and (!isset($headers['etag']) or empty($headers['etag']))) {
> $headers['etag'] = $data['headers']['etag'];
455,457c267,467
< foreach ($this->backup as $key => $value) {
< if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) {
< throw new Exception('Failed converting character encoding.');
---
> try {
> $response = $this->getHttpResponse($url, $headers, $post);
> } catch (Exception $e) {
> if (!$data) {
> throw $e;
> }
> }
> /*
> * Use cache if the responded HTTP status code is 304.
> * If 200, format the responded HTML of the given URL to XHTML.
> */
> if (!$data or (isset($response['code']) and $response['code'] != 304)) {
> $data =& $response;
> /*
> * If status code was 200 and Content-Type was not (X)HTML,
> * the status code was forcibly altered to 204.
> * @see HTTP_Request_Listener_Extended->update().
> */
> if ($data['code'] != 200 and $data['code'] != 204) {
> throw new Exception("Responded HTTP Status Code is $data[code].");
> } elseif (isset($data['headers']['content-type'])
> and !preg_match('/^(?:text|application)\/x?html\b/', $data['headers']['content-type'])) {
> throw new Exception("Responded Content-Type is {$data['headers']['content-type']}");
> } elseif (empty($data['body'])) {
> throw new Exception("Responded entity body is empty.");
> } elseif (!preg_match('/<\w+[^>]*?>/', $data['body'], $matches)) {
> throw new Exception("Responded entity body does not contain a markup symbol.");
> } elseif (false !== strpos($matches[0], "\x0")) {
> throw new Exception("Responded entity body contains NULL.");
> }
> /*
> * Remove BOM and NULLs.
> */
> $data['body'] = preg_replace('/^\xef\xbb\xbf/', '' , $data['body']);
> $data['body'] = str_replace("\x0", '', $data['body']);
> /*
> * Initialize the backups.
> */
> $this->backup = array();
> $this->backup_count = 0;
> /*
> * Removing SCRIPT and STYLE is recommended.
> * The following substitute code will capsulate the content of the tags in CDATA.
> * If use it, be sure that some JavaScript method such as document.write
> * is not compliant with XHTML/XML.
> */
> $tags = array('script', 'style');
> foreach ($tags as $tag) {
> $data['body'] = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $data['body']);
> /*
> $data['body'] = preg_replace_callback(
> "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si",
> create_function('$matches', '
> $content = trim($matches[2]);
> if (empty($content)
> or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) {
> return $matches[0];
> } else {
> $content = preg_replace("/^<!-+/", "", $content);
> $content = preg_replace("/-+>$/", "", $content);
> $content = preg_replace("/\s*\/\/$/s", "", trim($content));
> return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]";
> }
> '),
> $data['body']
> );
> */
> }
> /*
> * Backup CDATA sections for later process.
> */
> $data['body'] = preg_replace_callback(
> '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $data['body']
> );
> /*
> * Comment section must not contain two or more adjacent hyphens.
> */
> $data['body'] = preg_replace_callback(
> '/<!--(.*?)-->/si',
> create_function('$matches', '
> return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->";
> '),
> $data['body']
> );
> /*
> * Backup comment sections for later process.
> */
> $data['body'] = preg_replace_callback(
> '/<!--.*?-->/s', array($this, 'backup'), $data['body']
> );
> /*
> * Process tags that is potentially dangerous for XML parsers.
> */
> $data['body'] = preg_replace_callback(
> '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si',
> create_function('$matches', '
> return $matches[1].str_replace("<", "&lt;", $matches[2]).$matches[3];
> '),
> $data['body']
> );
> $data['body'] = preg_replace_callback(
> '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si',
> create_function('$matches', '
> return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
> '),
> $data['body']
> );
> $data['body'] = preg_replace_callback(
> '/<plaintext\b[^>]*?>(.*)$/si',
> create_function('$matches', '
> return "<pre>".str_replace("<", "&lt;", $matches[1])."</pre>";
> '),
> $data['body']
> );
> /*
> * Remove DTD declarations, wrongly placed comments etc.
> * This must be done before removing DOCTYPE.
> */
> $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']);
> /*
> * XML and DOCTYPE declaration will be replaced.
> */
> $data['body'] = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $data['body']);
> $data['body'] = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $data['body']);
> if (preg_match('/^\s*$/s', $data['body'])) {
> throw new Exception('The entity body became empty after preprocessing.');
> }
> /*
> * Detect character encoding and convert to UTF-8.
> */
> $encoding = false;
> if (isset($data['headers']['content-type'])) {
> $encoding = $this->getCharsetFromCType($data['headers']['content-type']);
> }
> if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $data['body'], $matches)) {
> foreach ($matches[0] as $value) {
> if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type'
> and false !== $encoding = $this->getAttribute('content', $value)) {
> $encoding = $this->getCharsetFromCType($encoding);
> break;
> }
> }
> }
> /*
> * Use mbstring to convert character encoding if available.
> * Otherwise use iconv (iconv may try to detect character encoding automatically).
> * Do not trust the declared encoding and do conversion even if UTF-8.
> */
> if (extension_loaded('mbstring')) {
> if (!$encoding) {
> @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
> if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body']))) {
> throw new Exception('Failed detecting character encoding.');
> }
> }
> @mb_convert_variables('UTF-8', $encoding, $data, $this->backup);
> } else {
> if (false === $data['body'] = @iconv($encoding, 'UTF-8', $data['body'])) {
> throw new Exception('Failed converting character encoding.');
> }
> foreach ($this->backup as $key => $value) {
> if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) {
> throw new Exception('Failed converting character encoding.');
> }
> }
> }
> /*
> * Restore CDATAs and comments.
> */
> for ($i = 0; $i < $this->backup_count; $i++) {
> $data['body'] = str_replace("<restore count=\"$i\" />", $this->backup[$i], $data['body']);
> }
> /*
> * Use Tidy to format HTML if available.
> * Otherwise, use HTMLParser class (is slower and consumes much memory).
> */
> if (extension_loaded('tidy')) {
> $tidy = new tidy;
> $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
> $tidy->cleanRepair();
> $data['body'] = $tidy->html();
> } else {
> require_once 'HTMLParser.class.php';
> $parser = new HTMLParser;
> $format_rule = require 'xhtml1-transitional_dtd.inc.php';
> $parser->setRule($format_rule);
> $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
> $parser->setGenericParent('body');
> $parser->parse($data['body']);
> $data['body'] = $parser->dump();
> }
> /*
> * Valid XHTML DOCTYPE declaration (with DTD URI) is required
> * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
> */
> $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
> $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
> $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
> $data['body'] = "$declarations$data[body]";
> if ($use_cache) {
> $cache->save(serialize($data), $cache_id);
461,495c471
< /*
< * Restore CDATAs and comments.
< */
< for ($i = 0; $i < $this->backup_count; $i++) {
< $data['body'] = str_replace("<restore count=\"$i\" />", $this->backup[$i], $data['body']);
< }
< /*
< * Use Tidy to format HTML if available.
< * Otherwise, use HTMLParser class (is slower and consumes much memory).
< */
< if (extension_loaded('tidy')) {
< $tidy = new tidy;
< $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8');
< $tidy->cleanRepair();
< $data['body'] = $tidy->html();
< } else {
< require_once 'HTMLParser.class.php';
< $parser = new HTMLParser;
< $format_rule = require 'xhtml1-transitional_dtd.inc.php';
< $parser->setRule($format_rule);
< $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
< $parser->setGenericParent('body');
< $parser->parse($data['body']);
< $data['body'] = $parser->dump();
< }
< /*
< * Valid XHTML DOCTYPE declaration (with DTD URI) is required
< * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
< */
< $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
< $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
< $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
< $data['body'] = "$declarations$data[body]";
<
< return $data ;
---
> return $data;
496a473
>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment