Last active
November 18, 2025 03:23
-
-
Save hoytzhang/6522a721d7797b3fe9f2dcc2dc205cf8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| /** | |
| * Sitemap/Feed 最新URL提取工具 | |
| * 用于从sitemap或RSS/Atom feed中提取最新的10个URL地址 | |
| */ | |
/**
 * Downloads an XML sitemap (or a sitemap index) over HTTP and extracts its
 * URL entries, newest first.
 */
class SitemapParser {
    /** Number of nested sitemap-index levels that are followed before giving up. */
    const MAX_INDEX_DEPTH = 3;

    /** @var string Address of the top-level sitemap passed to the constructor. */
    private $sitemapUrl;

    /**
     * @param string $sitemapUrl Address of the sitemap or sitemap index to read.
     */
    public function __construct($sitemapUrl) {
        $this->sitemapUrl = $sitemapUrl;
    }

    /**
     * Fetches the sitemap, parses it and returns the most recently modified entries.
     *
     * Entries without a usable `lastmod` sort after every dated entry.
     *
     * @param int $limit Maximum number of entries to return.
     * @return array[] Entries with a 'loc' key and optional 'lastmod',
     *                 'changefreq' and 'priority' keys.
     * @throws Exception When the sitemap cannot be downloaded or is not valid XML.
     */
    public function getLatestUrls($limit = 10) {
        $content = $this->fetchSitemap();
        if (!$content) {
            throw new Exception("无法获取sitemap内容");
        }

        $urls = $this->parseSitemap($content);

        // Newest first; the spaceship operator always yields a proper -1/0/1.
        usort($urls, function ($a, $b) {
            return self::lastmodTimestamp($b) <=> self::lastmodTimestamp($a);
        });

        return array_slice($urls, 0, $limit);
    }

    /**
     * Converts an entry's `lastmod` value into a Unix timestamp.
     *
     * @param array $entry One parsed sitemap entry.
     * @return int Timestamp, or 0 when `lastmod` is missing or unparsable.
     */
    private static function lastmodTimestamp(array $entry) {
        if (!isset($entry['lastmod'])) {
            return 0;
        }
        $timestamp = strtotime($entry['lastmod']);

        return $timestamp === false ? 0 : $timestamp;
    }

    /**
     * Downloads a sitemap document.
     *
     * @param string|null $url Address to download; defaults to the constructor URL.
     *                         Child sitemaps of an index pass their own address here.
     * @return string|false Response body, or false on a transport error or a
     *                      non-200 status.
     */
    private function fetchSitemap($url = null) {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL            => $url === null ? $this->sitemapUrl : $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_TIMEOUT        => 30,
            CURLOPT_USERAGENT      => 'Sitemap Parser 1.0',
        ]);
        $content = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($content === false || $httpCode !== 200) {
            return false;
        }

        return $content;
    }

    /**
     * Parses sitemap XML into a flat list of URL entries.
     *
     * A sitemap index is expanded by downloading each child sitemap; children
     * that fail to download or parse are skipped so one broken file does not
     * discard the rest.
     *
     * @param string $content Raw XML of a sitemap or sitemap index.
     * @param int    $depth   Current index nesting level (0 for the top-level document).
     * @return array[] Parsed entries (see getLatestUrls()).
     * @throws Exception When $content is not well-formed XML.
     */
    private function parseSitemap($content, $depth = 0) {
        // Collect libxml errors internally instead of emitting PHP warnings.
        $previousSetting = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        if ($xml === false) {
            throw new Exception("无效的XML格式");
        }

        $urls = [];

        if (isset($xml->sitemap)) {
            // Sitemap index: stop descending once the nesting limit is reached so a
            // self-referencing or cyclic index cannot recurse forever.
            if ($depth >= self::MAX_INDEX_DEPTH) {
                return $urls;
            }
            foreach ($xml->sitemap as $sitemap) {
                $childUrl = trim((string)$sitemap->loc);
                if ($childUrl === '') {
                    continue;
                }
                try {
                    $childContent = $this->fetchSitemap($childUrl);
                    if ($childContent) {
                        foreach ($this->parseSitemap($childContent, $depth + 1) as $entry) {
                            $urls[] = $entry;
                        }
                    }
                } catch (Exception $e) {
                    // Best effort: ignore a child sitemap that is not valid XML.
                    continue;
                }
            }
        } elseif (isset($xml->url)) {
            // Plain sitemap: one <url> element per page.
            foreach ($xml->url as $url) {
                $loc = trim((string)$url->loc);
                if ($loc === '') {
                    continue;
                }
                $urlData = ['loc' => $loc];
                foreach (['lastmod', 'changefreq', 'priority'] as $field) {
                    if (isset($url->{$field})) {
                        $urlData[$field] = (string)$url->{$field};
                    }
                }
                $urls[] = $urlData;
            }
        }

        return $urls;
    }
}
/**
 * Downloads an RSS 2.0 or Atom feed over HTTP and extracts its items, newest first.
 */
class FeedParser {
    /** @var string Address of the feed passed to the constructor. */
    private $feedUrl;

    /**
     * @param string $feedUrl Address of the RSS/Atom feed to read.
     */
    public function __construct($feedUrl) {
        $this->feedUrl = $feedUrl;
    }

    /**
     * Fetches the feed, parses it and returns the most recent items.
     *
     * @param int $limit Maximum number of items to return.
     * @return array[] Items with 'loc', 'title' and 'timestamp' keys and an
     *                 optional 'description' key.
     * @throws Exception When the feed cannot be downloaded or is not RSS/Atom.
     */
    public function getLatestUrls($limit = 10) {
        $content = $this->fetchFeed();
        if (!$content) {
            throw new Exception("无法获取feed内容");
        }

        $urls = $this->parseFeed($content);

        // Newest first.
        usort($urls, function ($a, $b) {
            return ($b['timestamp'] ?? 0) <=> ($a['timestamp'] ?? 0);
        });

        return array_slice($urls, 0, $limit);
    }

    /**
     * Downloads the feed document.
     *
     * @return string|false Response body, or false on a transport error or a
     *                      non-200 status.
     */
    private function fetchFeed() {
        $ch = curl_init();
        curl_setopt_array($ch, [
            CURLOPT_URL            => $this->feedUrl,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_TIMEOUT        => 30,
            CURLOPT_USERAGENT      => 'Feed Parser 1.0',
        ]);
        $content = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($content === false || $httpCode !== 200) {
            return false;
        }

        return $content;
    }

    /**
     * Parses feed XML, trying the RSS layout first and the Atom layout second.
     *
     * Items whose date is missing or cannot be parsed receive the current time
     * as their timestamp.
     *
     * @param string $content Raw feed XML.
     * @return array[] Parsed items (see getLatestUrls()).
     * @throws Exception When the document is neither RSS nor Atom.
     */
    private function parseFeed($content) {
        // Parse once and collect libxml errors internally instead of silencing
        // them with the @ operator.
        $previousSetting = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        $now = time();
        $urls = [];

        // RSS: <rss><channel><item>...
        if ($xml !== false && isset($xml->channel->item)) {
            foreach ($xml->channel->item as $item) {
                $urlData = [
                    'loc'       => trim((string)$item->link),
                    'title'     => (string)$item->title,
                    'timestamp' => self::toTimestamp($item->pubDate) ?? $now,
                ];
                if (isset($item->description)) {
                    $urlData['description'] = (string)$item->description;
                }
                $urls[] = $urlData;
            }

            return $urls;
        }

        // Atom: <feed><entry>...
        if ($xml !== false && isset($xml->entry)) {
            foreach ($xml->entry as $entry) {
                $urlData = [
                    'loc'       => self::atomEntryLink($entry),
                    'title'     => (string)$entry->title,
                    'timestamp' => self::toTimestamp($entry->updated)
                        ?? self::toTimestamp($entry->published)
                        ?? $now,
                ];
                if (isset($entry->summary)) {
                    $urlData['description'] = (string)$entry->summary;
                }
                $urls[] = $urlData;
            }

            return $urls;
        }

        throw new Exception("无效的Feed格式");
    }

    /**
     * Picks the article address from an Atom entry's <link> elements.
     *
     * A link with rel="alternate" (or no rel, which Atom defines as alternate)
     * wins; otherwise the first link carrying an href is used.
     *
     * @param SimpleXMLElement $entry One Atom <entry> element.
     * @return string The chosen href, or '' when the entry has no usable link.
     */
    private static function atomEntryLink($entry) {
        $fallback = '';
        foreach ($entry->link as $link) {
            $href = trim((string)$link['href']);
            if ($href === '') {
                continue;
            }
            $rel = (string)$link['rel'];
            if ($rel === '' || $rel === 'alternate') {
                return $href;
            }
            if ($fallback === '') {
                $fallback = $href;
            }
        }

        return $fallback;
    }

    /**
     * Converts a feed date value into a Unix timestamp.
     *
     * @param mixed $value Date element or string; cast to string before parsing.
     * @return int|null Timestamp, or null when the value is empty or unparsable.
     */
    private static function toTimestamp($value) {
        $text = trim((string)$value);
        if ($text === '') {
            return null;
        }
        $timestamp = strtotime($text);

        return $timestamp === false ? null : $timestamp;
    }
}
// Web entry point: a POST either pushes URLs (action=push) or runs an
// extraction; a GET shows the input form. Other methods produce no output.
if ($_SERVER['REQUEST_METHOD'] === 'POST') {
    if (($_POST['action'] ?? '') === 'push') {
        handlePush();
    } else {
        handlePost();
    }
} elseif ($_SERVER['REQUEST_METHOD'] === 'GET') {
    showForm();
}
/**
 * Prints the landing page: a form with a source-type radio group
 * (sitemap or feed) and a URL field that posts back to this script.
 */
function showForm() {
    // Static markup only, so a nowdoc (no interpolation) is emitted in one echo.
    echo <<<'HTML'
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Sitemap/Feed 最新URL提取工具</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
h1 { color: #333; text-align: center; }
.form-group { margin-bottom: 20px; }
label { display: block; margin-bottom: 5px; font-weight: bold; }
input[type="url"] { width: 100%; padding: 12px; border: 1px solid #ddd; border-radius: 4px; box-sizing: border-box; }
button { background-color: #007cba; color: white; padding: 12px 24px; border: none; border-radius: 4px; cursor: pointer; font-size: 16px; }
button:hover { background-color: #005a87; }
.result { margin-top: 30px; }
.url-item { padding: 10px; border-bottom: 1px solid #eee; }
.url-link { color: #007cba; text-decoration: none; }
.url-link:hover { text-decoration: underline; }
.lastmod { color: #666; font-size: 14px; margin-top: 5px; }
.error { color: #d63638; background: #fcf0f1; padding: 15px; border-radius: 4px; margin: 20px 0; }
.success { color: #008a20; background: #edfaef; padding: 15px; border-radius: 4px; margin: 20px 0; }
.push-section { margin-top: 30px; padding: 20px; background: #f9f9f9; border-radius: 4px; }
.push-form { margin-top: 15px; }
.push-form input[type="text"] { width: 100%; padding: 8px; margin: 10px 0; }
.push-button { background-color: #333; color: white; padding: 10px 20px; border: none; border-radius: 4px; cursor: pointer; }
.push-button:hover { background-color: #555; }
.push-result { margin-top: 20px; padding: 15px; border-radius: 4px; }
.push-success { background: #edfaef; color: #008a20; }
.push-error { background: #fcf0f1; color: #d63638; }
.source-type { margin-bottom: 15px; }
.source-type label { display: inline-block; margin-right: 20px; font-weight: normal; }
.source-type input { margin-right: 5px; }
.description { color: #666; font-size: 14px; margin-top: 5px; }
</style>
</head>
<body>
<div class="container">
<h1>Sitemap/Feed 最新URL提取工具</h1>
<form method="post">
<div class="source-type">
<label><input type="radio" name="source_type" value="sitemap" checked> Sitemap</label>
<label><input type="radio" name="source_type" value="feed"> RSS/Atom Feed</label>
</div>
<div class="form-group">
<label for="source_url">请输入地址:</label>
<input type="url" id="source_url" name="source_url" placeholder="https://example.com/sitemap.xml 或 https://example.com/feed" required>
</div>
<button type="submit">获取最新URL</button>
</form>
</div>
</body>
</html>

HTML;
}
/**
 * Handles the extraction form: reads `source_type` and `source_url` from
 * $_POST, runs the matching parser and prints either the result page
 * (with a "push to Baidu" form) or an error page.
 *
 * All request-derived and remote values are HTML-escaped before output, and
 * only http/https addresses are fetched or rendered as links.
 */
function handlePost() {
    // HTML-escape any scalar for element and attribute context.
    $esc = function ($value) {
        return htmlspecialchars((string)$value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };
    // True only for absolute http/https URLs without whitespace or control bytes.
    $isHttpUrl = function ($candidate) {
        if (!is_string($candidate) || $candidate === '' || preg_match('/[\s\x00-\x1F\x7F]/', $candidate)) {
            return false;
        }
        $parts = parse_url($candidate);

        return is_array($parts)
            && isset($parts['scheme'], $parts['host'])
            && in_array(strtolower($parts['scheme']), ['http', 'https'], true);
    };

    // Anything other than the literal 'sitemap' (including array input) means feed.
    $sourceType = ($_POST['source_type'] ?? 'sitemap') === 'sitemap' ? 'sitemap' : 'feed';
    $rawUrl = $_POST['source_url'] ?? '';
    $sourceUrl = is_string($rawUrl) ? trim($rawUrl) : '';
    // PHP_SELF contains attacker-controlled path info, so it must be escaped.
    $selfUrl = $esc($_SERVER['PHP_SELF'] ?? '');

    if ($sourceUrl === '') {
        echo '<div class="error">请输入有效的地址</div>';
        showForm();
        return;
    }

    try {
        // Refuse non-web schemes before handing the address to cURL.
        if (!$isHttpUrl($sourceUrl)) {
            throw new InvalidArgumentException('仅支持 http/https 地址');
        }

        if ($sourceType === 'sitemap') {
            $parser = new SitemapParser($sourceUrl);
        } else {
            $parser = new FeedParser($sourceUrl);
        }
        $latestUrls = $parser->getLatestUrls(10);

        // Only proper web URLs are offered for pushing.
        $pushableUrls = [];
        foreach ($latestUrls as $entry) {
            if ($isHttpUrl($entry['loc'] ?? null)) {
                $pushableUrls[] = $entry['loc'];
            }
        }
        ?>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>结果 - 最新URL提取工具</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
h1 { color: #333; text-align: center; }
.back-link { margin-bottom: 20px; }
a { color: #007cba; text-decoration: none; }
a:hover { text-decoration: underline; }
.result { margin-top: 20px; }
.url-item { padding: 15px; border-bottom: 1px solid #eee; }
.url-item:last-child { border-bottom: none; }
.url-link { color: #007cba; text-decoration: none; font-weight: bold; }
.url-link:hover { text-decoration: underline; }
.lastmod { color: #666; font-size: 14px; margin-top: 5px; }
.no-results { color: #666; text-align: center; padding: 20px; }
.push-section { margin-top: 30px; padding: 20px; background: #f9f9f9; border-radius: 4px; }
.push-form { margin-top: 15px; }
.push-form input[type="text"] { width: 100%; padding: 8px; margin: 10px 0; }
.push-button { background-color: #333; color: white; padding: 10px 20px; border: none; border-radius: 4px; cursor: pointer; }
.push-button:hover { background-color: #555; }
.push-result { margin-top: 20px; padding: 15px; border-radius: 4px; }
.push-success { background: #edfaef; color: #008a20; }
.push-error { background: #fcf0f1; color: #d63638; }
.source-info { background: #eef7fa; padding: 10px; border-radius: 4px; margin-bottom: 20px; }
.description { color: #666; font-size: 14px; margin-top: 5px; }
</style>
</head>
<body>
<div class="container">
<div class="back-link">
<a href="<?php echo $selfUrl; ?>">← 返回</a>
</div>
<h1>最新更新的URL</h1>
<div class="source-info">
来源类型: <?php echo $sourceType === 'sitemap' ? 'Sitemap' : 'RSS/Atom Feed'; ?><br>
来源地址: <?php echo $esc($sourceUrl); ?>
</div>
<div class="result">
<?php if (empty($latestUrls)): ?>
<div class="no-results">未找到任何URL</div>
<?php else: ?>
<?php foreach ($latestUrls as $url): ?>
<?php
    $loc = (string)($url['loc'] ?? '');
    $label = (isset($url['title']) && $url['title'] !== '') ? $url['title'] : $loc;
?>
<div class="url-item">
<div>
<?php if ($isHttpUrl($loc)): ?>
<a href="<?php echo $esc($loc); ?>" target="_blank" rel="noopener noreferrer" class="url-link">
<?php echo $esc($label); ?>
</a>
<?php else: ?>
<span class="url-link"><?php echo $esc($label); ?></span>
<?php endif; ?>
</div>
<?php if (isset($url['lastmod']) || isset($url['timestamp'])): ?>
<div class="lastmod">
<?php
    if (isset($url['lastmod'])) {
        // Show the raw value when it is not a parsable date instead of 1970-01-01.
        $modified = strtotime($url['lastmod']);
        echo '最后修改时间: ' . ($modified === false ? $esc($url['lastmod']) : date('Y-m-d H:i:s', $modified));
    } elseif (isset($url['timestamp'])) {
        echo '发布时间: ' . date('Y-m-d H:i:s', (int)$url['timestamp']);
    }
?>
</div>
<?php endif; ?>
<?php if (isset($url['description'])): ?>
<?php $plainDescription = strip_tags($url['description']); ?>
<div class="description">
<?php echo $esc(mb_substr($plainDescription, 0, 150)) . (mb_strlen($plainDescription) > 150 ? '...' : ''); ?>
</div>
<?php endif; ?>
</div>
<?php endforeach; ?>
<?php endif; ?>
</div>
<?php if (!empty($pushableUrls)): ?>
<div class="push-section">
<h3>推送到百度</h3>
<p>是否要将以上URL推送到百度站长平台?</p>
<form method="post" class="push-form">
<input type="hidden" name="action" value="push">
<input type="hidden" name="source_type" value="<?php echo $esc($sourceType); ?>">
<?php foreach ($pushableUrls as $pushUrl): ?>
<input type="hidden" name="urls[]" value="<?php echo $esc($pushUrl); ?>">
<?php endforeach; ?>
<div class="form-group">
<label for="baidu_token">百度推送密钥:</label>
<input type="text" id="baidu_token" name="baidu_token" placeholder="请输入百度站长平台的推送密钥" required>
</div>
<button type="submit" class="push-button">推送到百度</button>
</form>
</div>
<?php endif; ?>
</div>
</body>
</html>
        <?php
    } catch (Exception $e) {
        ?>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>错误 - 最新URL提取工具</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
.back-link { margin-bottom: 20px; }
a { color: #007cba; text-decoration: none; }
a:hover { text-decoration: underline; }
.error { color: #d63638; background: #fcf0f1; padding: 15px; border-radius: 4px; }
</style>
</head>
<body>
<div class="container">
<div class="back-link">
<a href="<?php echo $selfUrl; ?>">← 返回重试</a>
</div>
<div class="error">
<strong>处理过程中发生错误:</strong><br>
<?php echo $esc($e->getMessage()); ?>
</div>
</div>
</body>
</html>
        <?php
    }
}
/**
 * Handles the "push to Baidu" form: reads `urls[]`, `baidu_token` and
 * `source_type` from $_POST, submits the URLs to the Baidu link-submission
 * API as a newline-separated text body and prints a result page.
 *
 * Submitted URLs are filtered to absolute http/https addresses without
 * whitespace, so one value cannot smuggle extra lines into the request body.
 * The site parameter is derived from the first accepted URL.
 */
function handlePush() {
    // HTML-escape any scalar for element and attribute context.
    $esc = function ($value) {
        return htmlspecialchars((string)$value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };

    $rawToken = $_POST['baidu_token'] ?? '';
    $token = is_string($rawToken) ? trim($rawToken) : '';
    $sourceType = ($_POST['source_type'] ?? 'sitemap') === 'sitemap' ? 'sitemap' : 'feed';
    $submitted = $_POST['urls'] ?? [];
    if (!is_array($submitted)) {
        $submitted = [];
    }

    if ($token === '') {
        echo '<div class="error">请输入百度推送密钥</div>';
        return;
    }

    // Keep only well-formed web URLs; drop nested arrays and anything else.
    $urls = [];
    foreach ($submitted as $candidate) {
        if (!is_string($candidate)) {
            continue;
        }
        $candidate = trim($candidate);
        if ($candidate === '' || preg_match('/[\s\x00-\x1F\x7F]/', $candidate)) {
            continue;
        }
        $parts = parse_url($candidate);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            continue;
        }
        if (!in_array(strtolower($parts['scheme']), ['http', 'https'], true)) {
            continue;
        }
        $urls[] = $candidate;
    }

    if (empty($urls)) {
        echo '<div class="error">没有可推送的URL</div>';
        return;
    }

    // Site origin (scheme://host) of the first URL; validated above.
    $firstParts = parse_url($urls[0]);
    $site = $firstParts['scheme'] . '://' . $firstParts['host'];

    // Encode the query so neither the token nor the host can inject parameters.
    $api = 'http://data.zz.baidu.com/urls?' . http_build_query(
        ['site' => $site, 'token' => $token],
        '',
        '&',
        PHP_QUERY_RFC3986
    );

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL            => $api,
        CURLOPT_POST           => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POSTFIELDS     => implode("\n", $urls),
        CURLOPT_HTTPHEADER     => ['Content-Type: text/plain'],
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
    ]);
    $result = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $transportError = $result === false ? curl_error($ch) : '';
    curl_close($ch);

    // The body is expected to be a JSON object; anything else is treated as empty.
    $response = is_string($result) ? json_decode($result, true) : null;
    if (!is_array($response)) {
        $response = [];
    }
    $selfUrl = $esc($_SERVER['PHP_SELF'] ?? '');
    ?>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>推送结果 - 最新URL提取工具</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
h1 { color: #333; text-align: center; }
.back-link { margin-bottom: 20px; }
a { color: #007cba; text-decoration: none; }
a:hover { text-decoration: underline; }
.push-result { margin-top: 20px; padding: 15px; border-radius: 4px; }
.push-success { background: #edfaef; color: #008a20; }
.push-error { background: #fcf0f1; color: #d63638; }
.url-list { margin-top: 20px; }
.url-item { padding: 5px 0; border-bottom: 1px solid #eee; }
.source-info { background: #eef7fa; padding: 10px; border-radius: 4px; margin-bottom: 20px; }
</style>
</head>
<body>
<div class="container">
<div class="back-link">
<a href="<?php echo $selfUrl; ?>">← 返回首页</a>
</div>
<h1>百度推送结果</h1>
<div class="source-info">
来源类型: <?php echo $sourceType === 'sitemap' ? 'Sitemap' : 'RSS/Atom Feed'; ?>
</div>
<?php if ($httpCode !== 200): ?>
<div class="push-result push-error">
<strong>推送失败:</strong><br>
HTTP状态码: <?php echo $esc($httpCode); ?>
<?php if (isset($response['message'])): ?>
<br>错误信息: <?php echo $esc($response['message']); ?>
<?php elseif ($transportError !== ''): ?>
<br>错误信息: <?php echo $esc($transportError); ?>
<?php endif; ?>
</div>
<?php else: ?>
<div class="push-result <?php echo isset($response['error']) ? 'push-error' : 'push-success'; ?>">
<?php if (isset($response['error'])): ?>
<strong>推送失败:</strong><br>
错误代码: <?php echo $esc($response['error']); ?><br>
错误信息: <?php echo $esc($response['message'] ?? ''); ?>
<?php else: ?>
<strong>推送成功!</strong><br>
成功推送: <?php echo $esc($response['success'] ?? 'N/A'); ?> 条URL<br>
剩余配额: <?php echo $esc($response['remain'] ?? 'N/A'); ?>
<?php endif; ?>
</div>
<?php endif; ?>
<div class="url-list">
<h3>推送的URL列表:</h3>
<?php foreach ($urls as $url): ?>
<div class="url-item"><?php echo $esc($url); ?></div>
<?php endforeach; ?>
</div>
</div>
</body>
</html>
    <?php
}
| ?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment