Created
November 30, 2010 01:45
-
-
Save kaz29/720986 to your computer and use it in GitHub Desktop.
HTMLScraping file:// URL scheme support patch (http://www.rcdtokyo.com/etc/htmlscraping/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
228c228 | |
< if (!preg_match('/^(https?|file):\/\/+/i', $url)) { | |
--- | |
> if (!preg_match('/^https?:\/\/\w[\w\-\.]+/i', $url)) { | |
231d230 | |
< | |
233,434c232,239 | |
< if (preg_match('/^https?:\/\/\w[\w\-\.]+/i', $url)) { | |
< $cache_lifetime = (int) $cache_lifetime; | |
< $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0; | |
< if ($use_cache) { | |
< $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime)); | |
< $params = array(); | |
< foreach ($headers as $key => $value) { | |
< if (!empty($value)) { | |
< $params[] = urlencode($key).'='.urlencode($value); | |
< } | |
< } | |
< foreach ($post as $key => $value) { | |
< $params[] = urlencode($key).'='.urlencode($value); | |
< } | |
< $cache_id = "$url?".implode('&', $params); | |
< if (false !== $data = $cache->get($cache_id)) { | |
< $data = unserialize($data); | |
< } | |
< } | |
< /* | |
< * Access to the URL if not cached | |
< * or if the cache has either Last-Modified or Etag header | |
< * and conditional request is specified. | |
< */ | |
< if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) { | |
< $conditional_request = false; | |
< } | |
< if (!$data or $conditional_request) { | |
< if (isset($data['headers']['last-modified']) | |
< and (!isset($headers['last-modified']) or empty($headers['last-modified']))) { | |
< $headers['last-modified'] = $data['headers']['last-modified']; | |
< } | |
< if (isset($data['headers']['etag']) | |
< and (!isset($headers['etag']) or empty($headers['etag']))) { | |
< $headers['etag'] = $data['headers']['etag']; | |
< } | |
< try { | |
< $response = $this->getHttpResponse($url, $headers, $post); | |
< } catch (Exception $e) { | |
< if (!$data) { | |
< throw $e; | |
< } | |
< } | |
< /* | |
< * Use cache if the responded HTTP status code is 304. | |
< * If 200, format the responded HTML of the given URL to XHTML. | |
< */ | |
< if (!$data or (isset($response['code']) and $response['code'] != 304)) { | |
< $data =& $response; | |
< /* | |
< * If status code was 200 and Content-Type was not (X)HTML, | |
< * the status code was forcibly altered to 204. | |
< * @see HTTP_Request_Listener_Extended->update(). | |
< */ | |
< if ($data['code'] != 200 and $data['code'] != 204) { | |
< throw new Exception("Responded HTTP Status Code is $data[code]."); | |
< } elseif (isset($data['headers']['content-type']) | |
< and !preg_match('/^(?:text|application)\/x?html\b/', $data['headers']['content-type'])) { | |
< throw new Exception("Responded Content-Type is {$data['headers']['content-type']}"); | |
< } elseif (empty($data['body'])) { | |
< throw new Exception("Responded entity body is empty."); | |
< } elseif (!preg_match('/<\w+[^>]*?>/', $data['body'], $matches)) { | |
< throw new Exception("Responded entity body does not contain a markup symbol."); | |
< } elseif (false !== strpos($matches[0], "\x0")) { | |
< throw new Exception("Responded entity body contains NULL."); | |
< } | |
< | |
< $this->analyzeBody($data) ; | |
< | |
< if ($use_cache) { | |
< $cache->save(serialize($data), $cache_id); | |
< } | |
< } | |
< } | |
< } else { | |
< $path = substr($url,7); | |
< if ( !file_exists($path) || !is_readable($path) ) { | |
< throw new Exception("File not exists."); | |
< } | |
< | |
< $data = array() ; | |
< $data['url'] = $url ; | |
< $data['body'] = file_get_contents($path); | |
< | |
< $this->analyzeBody($data) ; | |
< } | |
< | |
< return $data; | |
< } | |
< | |
< private function analyzeBody(&$data) | |
< { | |
< /* | |
< * Remove BOM and NULLs. | |
< */ | |
< $data['body'] = preg_replace('/^\xef\xbb\xbf/', '' , $data['body']); | |
< $data['body'] = str_replace("\x0", '', $data['body']); | |
< /* | |
< * Initialize the backups. | |
< */ | |
< $this->backup = array(); | |
< $this->backup_count = 0; | |
< /* | |
< * Removing SCRIPT and STYLE is recommended. | |
< * The following substitute code will capsulate the content of the tags in CDATA. | |
< * If use it, be sure that some JavaScript method such as document.write | |
< * is not compliant with XHTML/XML. | |
< */ | |
< $tags = array('script', 'style'); | |
< foreach ($tags as $tag) { | |
< $data['body'] = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $data['body']); | |
< /* | |
< $data['body'] = preg_replace_callback( | |
< "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si", | |
< create_function('$matches', ' | |
< $content = trim($matches[2]); | |
< if (empty($content) | |
< or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) { | |
< return $matches[0]; | |
< } else { | |
< $content = preg_replace("/^<!-+/", "", $content); | |
< $content = preg_replace("/-+>$/", "", $content); | |
< $content = preg_replace("/\s*\/\/$/s", "", trim($content)); | |
< return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]"; | |
< } | |
< '), | |
< $data['body'] | |
< ); | |
< */ | |
< } | |
< /* | |
< * Backup CDATA sections for later process. | |
< */ | |
< $data['body'] = preg_replace_callback( | |
< '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $data['body'] | |
< ); | |
< /* | |
< * Comment section must not contain two or more adjacent hyphens. | |
< */ | |
< $data['body'] = preg_replace_callback( | |
< '/<!--(.*?)-->/si', | |
< create_function('$matches', ' | |
< return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->"; | |
< '), | |
< $data['body'] | |
< ); | |
< /* | |
< * Backup comment sections for later process. | |
< */ | |
< $data['body'] = preg_replace_callback( | |
< '/<!--.*?-->/s', array($this, 'backup'), $data['body'] | |
< ); | |
< /* | |
< * Process tags that is potentially dangerous for XML parsers. | |
< */ | |
< $data['body'] = preg_replace_callback( | |
< '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si', | |
< create_function('$matches', ' | |
< return $matches[1].str_replace("<", "<", $matches[2]).$matches[3]; | |
< '), | |
< $data['body'] | |
< ); | |
< $data['body'] = preg_replace_callback( | |
< '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si', | |
< create_function('$matches', ' | |
< return "<pre>".str_replace("<", "<", $matches[1])."</pre>"; | |
< '), | |
< $data['body'] | |
< ); | |
< $data['body'] = preg_replace_callback( | |
< '/<plaintext\b[^>]*?>(.*)$/si', | |
< create_function('$matches', ' | |
< return "<pre>".str_replace("<", "<", $matches[1])."</pre>"; | |
< '), | |
< $data['body'] | |
< ); | |
< /* | |
< * Remove DTD declarations, wrongly placed comments etc. | |
< * This must be done before removing DOCTYPE. | |
< */ | |
< $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']); | |
< /* | |
< * XML and DOCTYPE declaration will be replaced. | |
< */ | |
< $data['body'] = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $data['body']); | |
< $data['body'] = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $data['body']); | |
< if (preg_match('/^\s*$/s', $data['body'])) { | |
< throw new Exception('The entity body became empty after preprocessing.'); | |
< } | |
< /* | |
< * Detect character encoding and convert to UTF-8. | |
< */ | |
< $encoding = false; | |
< if (isset($data['headers']['content-type'])) { | |
< $encoding = $this->getCharsetFromCType($data['headers']['content-type']); | |
< } | |
< if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $data['body'], $matches)) { | |
< foreach ($matches[0] as $value) { | |
< if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type' | |
< and false !== $encoding = $this->getAttribute('content', $value)) { | |
< $encoding = $this->getCharsetFromCType($encoding); | |
< break; | |
--- | |
> $cache_lifetime = (int) $cache_lifetime; | |
> $use_cache = !empty($this->cacheDir) and $cache_lifetime > 0; | |
> if ($use_cache) { | |
> $cache = new Cache_Lite(array('cacheDir' => $this->cacheDir, 'lifeTime' => $cache_lifetime)); | |
> $params = array(); | |
> foreach ($headers as $key => $value) { | |
> if (!empty($value)) { | |
> $params[] = urlencode($key).'='.urlencode($value); | |
436a242,248 | |
> foreach ($post as $key => $value) { | |
> $params[] = urlencode($key).'='.urlencode($value); | |
> } | |
> $cache_id = "$url?".implode('&', $params); | |
> if (false !== $data = $cache->get($cache_id)) { | |
> $data = unserialize($data); | |
> } | |
439,441c251,253 | |
< * Use mbstring to convert character encoding if available. | |
< * Otherwise use iconv (iconv may try to detect character encoding automatically). | |
< * Do not trust the declared encoding and do conversion even if UTF-8. | |
--- | |
> * Access to the URL if not cached | |
> * or if the cache has either Last-Modified or Etag header | |
> * and conditional request is specified. | |
443,448c255,261 | |
< if (extension_loaded('mbstring')) { | |
< if (!$encoding) { | |
< @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS'); | |
< if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body']))) { | |
< throw new Exception('Failed detecting character encoding.'); | |
< } | |
--- | |
> if ($conditional_request and (!isset($data['headers']['last-modified']) or !isset($data['headers']['etag']))) { | |
> $conditional_request = false; | |
> } | |
> if (!$data or $conditional_request) { | |
> if (isset($data['headers']['last-modified']) | |
> and (!isset($headers['last-modified']) or empty($headers['last-modified']))) { | |
> $headers['last-modified'] = $data['headers']['last-modified']; | |
450,453c263,265 | |
< @mb_convert_variables('UTF-8', $encoding, $data, $this->backup); | |
< } else { | |
< if (false === $data['body'] = @iconv($encoding, 'UTF-8', $data['body'])) { | |
< throw new Exception('Failed converting character encoding.'); | |
--- | |
> if (isset($data['headers']['etag']) | |
> and (!isset($headers['etag']) or empty($headers['etag']))) { | |
> $headers['etag'] = $data['headers']['etag']; | |
455,457c267,467 | |
< foreach ($this->backup as $key => $value) { | |
< if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) { | |
< throw new Exception('Failed converting character encoding.'); | |
--- | |
> try { | |
> $response = $this->getHttpResponse($url, $headers, $post); | |
> } catch (Exception $e) { | |
> if (!$data) { | |
> throw $e; | |
> } | |
> } | |
> /* | |
> * Use cache if the responded HTTP status code is 304. | |
> * If 200, format the responded HTML of the given URL to XHTML. | |
> */ | |
> if (!$data or (isset($response['code']) and $response['code'] != 304)) { | |
> $data =& $response; | |
> /* | |
> * If status code was 200 and Content-Type was not (X)HTML, | |
> * the status code was forcibly altered to 204. | |
> * @see HTTP_Request_Listener_Extended->update(). | |
> */ | |
> if ($data['code'] != 200 and $data['code'] != 204) { | |
> throw new Exception("Responded HTTP Status Code is $data[code]."); | |
> } elseif (isset($data['headers']['content-type']) | |
> and !preg_match('/^(?:text|application)\/x?html\b/', $data['headers']['content-type'])) { | |
> throw new Exception("Responded Content-Type is {$data['headers']['content-type']}"); | |
> } elseif (empty($data['body'])) { | |
> throw new Exception("Responded entity body is empty."); | |
> } elseif (!preg_match('/<\w+[^>]*?>/', $data['body'], $matches)) { | |
> throw new Exception("Responded entity body does not contain a markup symbol."); | |
> } elseif (false !== strpos($matches[0], "\x0")) { | |
> throw new Exception("Responded entity body contains NULL."); | |
> } | |
> /* | |
> * Remove BOM and NULLs. | |
> */ | |
> $data['body'] = preg_replace('/^\xef\xbb\xbf/', '' , $data['body']); | |
> $data['body'] = str_replace("\x0", '', $data['body']); | |
> /* | |
> * Initialize the backups. | |
> */ | |
> $this->backup = array(); | |
> $this->backup_count = 0; | |
> /* | |
> * Removing SCRIPT and STYLE is recommended. | |
> * The following substitute code will capsulate the content of the tags in CDATA. | |
> * If use it, be sure that some JavaScript method such as document.write | |
> * is not compliant with XHTML/XML. | |
> */ | |
> $tags = array('script', 'style'); | |
> foreach ($tags as $tag) { | |
> $data['body'] = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $data['body']); | |
> /* | |
> $data['body'] = preg_replace_callback( | |
> "/(<$tag\b[^>]*?>)(.*?)(<\/$tag\b[^>]*?>)/si", | |
> create_function('$matches', ' | |
> $content = trim($matches[2]); | |
> if (empty($content) | |
> or preg_match("/^<!\[CDATA\[.*?\]\]>$/s", $content)) { | |
> return $matches[0]; | |
> } else { | |
> $content = preg_replace("/^<!-+/", "", $content); | |
> $content = preg_replace("/-+>$/", "", $content); | |
> $content = preg_replace("/\s*\/\/$/s", "", trim($content)); | |
> return "$matches[1]<![CDATA[\n$content\n]]>$matches[3]"; | |
> } | |
> '), | |
> $data['body'] | |
> ); | |
> */ | |
> } | |
> /* | |
> * Backup CDATA sections for later process. | |
> */ | |
> $data['body'] = preg_replace_callback( | |
> '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $data['body'] | |
> ); | |
> /* | |
> * Comment section must not contain two or more adjacent hyphens. | |
> */ | |
> $data['body'] = preg_replace_callback( | |
> '/<!--(.*?)-->/si', | |
> create_function('$matches', ' | |
> return "<!-- ".preg_replace("/-{2,}/", "-", $matches[1])." -->"; | |
> '), | |
> $data['body'] | |
> ); | |
> /* | |
> * Backup comment sections for later process. | |
> */ | |
> $data['body'] = preg_replace_callback( | |
> '/<!--.*?-->/s', array($this, 'backup'), $data['body'] | |
> ); | |
> /* | |
> * Process tags that is potentially dangerous for XML parsers. | |
> */ | |
> $data['body'] = preg_replace_callback( | |
> '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si', | |
> create_function('$matches', ' | |
> return $matches[1].str_replace("<", "<", $matches[2]).$matches[3]; | |
> '), | |
> $data['body'] | |
> ); | |
> $data['body'] = preg_replace_callback( | |
> '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si', | |
> create_function('$matches', ' | |
> return "<pre>".str_replace("<", "<", $matches[1])."</pre>"; | |
> '), | |
> $data['body'] | |
> ); | |
> $data['body'] = preg_replace_callback( | |
> '/<plaintext\b[^>]*?>(.*)$/si', | |
> create_function('$matches', ' | |
> return "<pre>".str_replace("<", "<", $matches[1])."</pre>"; | |
> '), | |
> $data['body'] | |
> ); | |
> /* | |
> * Remove DTD declarations, wrongly placed comments etc. | |
> * This must be done before removing DOCTYPE. | |
> */ | |
> $data['body'] = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $data['body']); | |
> /* | |
> * XML and DOCTYPE declaration will be replaced. | |
> */ | |
> $data['body'] = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $data['body']); | |
> $data['body'] = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $data['body']); | |
> if (preg_match('/^\s*$/s', $data['body'])) { | |
> throw new Exception('The entity body became empty after preprocessing.'); | |
> } | |
> /* | |
> * Detect character encoding and convert to UTF-8. | |
> */ | |
> $encoding = false; | |
> if (isset($data['headers']['content-type'])) { | |
> $encoding = $this->getCharsetFromCType($data['headers']['content-type']); | |
> } | |
> if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $data['body'], $matches)) { | |
> foreach ($matches[0] as $value) { | |
> if (strtolower($this->getAttribute('http-equiv', $value)) == 'content-type' | |
> and false !== $encoding = $this->getAttribute('content', $value)) { | |
> $encoding = $this->getCharsetFromCType($encoding); | |
> break; | |
> } | |
> } | |
> } | |
> /* | |
> * Use mbstring to convert character encoding if available. | |
> * Otherwise use iconv (iconv may try to detect character encoding automatically). | |
> * Do not trust the declared encoding and do conversion even if UTF-8. | |
> */ | |
> if (extension_loaded('mbstring')) { | |
> if (!$encoding) { | |
> @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS'); | |
> if (false === $encoding = @mb_preferred_mime_name(@mb_detect_encoding($data['body']))) { | |
> throw new Exception('Failed detecting character encoding.'); | |
> } | |
> } | |
> @mb_convert_variables('UTF-8', $encoding, $data, $this->backup); | |
> } else { | |
> if (false === $data['body'] = @iconv($encoding, 'UTF-8', $data['body'])) { | |
> throw new Exception('Failed converting character encoding.'); | |
> } | |
> foreach ($this->backup as $key => $value) { | |
> if (false === $this->backup[$key] = @iconv($encoding, 'UTF-8', $value)) { | |
> throw new Exception('Failed converting character encoding.'); | |
> } | |
> } | |
> } | |
> /* | |
> * Restore CDATAs and comments. | |
> */ | |
> for ($i = 0; $i < $this->backup_count; $i++) { | |
> $data['body'] = str_replace("<restore count=\"$i\" />", $this->backup[$i], $data['body']); | |
> } | |
> /* | |
> * Use Tidy to format HTML if available. | |
> * Otherwise, use HTMLParser class (is slower and consumes much memory). | |
> */ | |
> if (extension_loaded('tidy')) { | |
> $tidy = new tidy; | |
> $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8'); | |
> $tidy->cleanRepair(); | |
> $data['body'] = $tidy->html(); | |
> } else { | |
> require_once 'HTMLParser.class.php'; | |
> $parser = new HTMLParser; | |
> $format_rule = require 'xhtml1-transitional_dtd.inc.php'; | |
> $parser->setRule($format_rule); | |
> $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml')); | |
> $parser->setGenericParent('body'); | |
> $parser->parse($data['body']); | |
> $data['body'] = $parser->dump(); | |
> } | |
> /* | |
> * Valid XHTML DOCTYPE declaration (with DTD URI) is required | |
> * for SimpleXMLElement->asXML() method to produce proper XHTML tags. | |
> */ | |
> $declarations = '<?xml version="1.0" encoding="UTF-8"?>'; | |
> $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '; | |
> $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'; | |
> $data['body'] = "$declarations$data[body]"; | |
> if ($use_cache) { | |
> $cache->save(serialize($data), $cache_id); | |
461,495c471 | |
< /* | |
< * Restore CDATAs and comments. | |
< */ | |
< for ($i = 0; $i < $this->backup_count; $i++) { | |
< $data['body'] = str_replace("<restore count=\"$i\" />", $this->backup[$i], $data['body']); | |
< } | |
< /* | |
< * Use Tidy to format HTML if available. | |
< * Otherwise, use HTMLParser class (is slower and consumes much memory). | |
< */ | |
< if (extension_loaded('tidy')) { | |
< $tidy = new tidy; | |
< $tidy->parseString($data['body'], array('output-xhtml' => true), 'UTF8'); | |
< $tidy->cleanRepair(); | |
< $data['body'] = $tidy->html(); | |
< } else { | |
< require_once 'HTMLParser.class.php'; | |
< $parser = new HTMLParser; | |
< $format_rule = require 'xhtml1-transitional_dtd.inc.php'; | |
< $parser->setRule($format_rule); | |
< $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml')); | |
< $parser->setGenericParent('body'); | |
< $parser->parse($data['body']); | |
< $data['body'] = $parser->dump(); | |
< } | |
< /* | |
< * Valid XHTML DOCTYPE declaration (with DTD URI) is required | |
< * for SimpleXMLElement->asXML() method to produce proper XHTML tags. | |
< */ | |
< $declarations = '<?xml version="1.0" encoding="UTF-8"?>'; | |
< $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '; | |
< $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'; | |
< $data['body'] = "$declarations$data[body]"; | |
< | |
< return $data ; | |
--- | |
> return $data; | |
496a473 | |
> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment