Skip to content

Instantly share code, notes, and snippets.

@hoytzhang
Last active November 18, 2025 03:23
Show Gist options
  • Select an option

  • Save hoytzhang/6522a721d7797b3fe9f2dcc2dc205cf8 to your computer and use it in GitHub Desktop.

Select an option

Save hoytzhang/6522a721d7797b3fe9f2dcc2dc205cf8 to your computer and use it in GitHub Desktop.
<?php
/**
* Sitemap/Feed 最新URL提取工具
* 用于从sitemap或RSS/Atom feed中提取最新的10个URL地址
*/
/**
 * Downloads an XML sitemap (or a sitemap index) and exposes its most recently
 * modified entries.
 *
 * Each entry is an associative array with a 'loc' key and, when present in the
 * XML, 'lastmod', 'changefreq' and 'priority' keys (all strings).
 */
class SitemapParser {
    /** How many levels of nested sitemap indexes are followed before giving up. */
    const MAX_INDEX_DEPTH = 3;

    /** @var string URL of the root sitemap handed to the constructor. */
    private $sitemapUrl;

    /**
     * @param string $sitemapUrl URL of the sitemap or sitemap index to read.
     */
    public function __construct($sitemapUrl) {
        $this->sitemapUrl = $sitemapUrl;
    }

    /**
     * Fetch the sitemap and return its entries, newest 'lastmod' first.
     *
     * Entries without a (parsable) 'lastmod' are ordered as timestamp 0, i.e. last.
     *
     * @param int $limit Maximum number of entries to return.
     * @return array List of entry arrays (see class description).
     * @throws Exception When the root sitemap cannot be downloaded or is not valid XML.
     */
    public function getLatestUrls($limit = 10) {
        $content = $this->fetchSitemap();
        if (!$content) {
            throw new Exception("无法获取sitemap内容");
        }

        $urls = $this->parseSitemap($content);

        usort($urls, function ($a, $b) {
            return self::lastmodTimestamp($b) <=> self::lastmodTimestamp($a);
        });

        return array_slice($urls, 0, $limit);
    }

    /**
     * Convert an entry's 'lastmod' into a Unix timestamp for sorting.
     *
     * @param array $entry Entry array produced by parseSitemap().
     * @return int Timestamp, or 0 when 'lastmod' is absent or unparsable.
     */
    private static function lastmodTimestamp(array $entry) {
        if (!isset($entry['lastmod'])) {
            return 0;
        }
        $parsed = strtotime($entry['lastmod']);
        return $parsed === false ? 0 : $parsed;
    }

    /**
     * Download a sitemap document over HTTP(S).
     *
     * @param string|null $url URL to fetch; defaults to the root sitemap URL.
     *                         The explicit URL is what lets child sitemaps of an
     *                         index be fetched instead of the root again.
     * @return string|false Response body, or false on transport failure or a
     *                      non-200 final status.
     */
    private function fetchSitemap($url = null) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url === null ? $this->sitemapUrl : $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
        // The URL ultimately comes from user input: never let it (or a redirect)
        // reach file://, gopher:// and similar schemes.
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Sitemap Parser 1.0');
        $content = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($content === false || $httpCode !== 200) {
            return false;
        }
        return $content;
    }

    /**
     * Parse sitemap XML into a flat list of entries.
     *
     * A sitemap index is expanded by downloading and parsing each child sitemap;
     * children that fail to download or parse are skipped (best effort).
     *
     * @param string $content Raw XML.
     * @param int    $depth   Current sitemap-index nesting level (internal).
     * @return array List of entry arrays.
     * @throws Exception When $content is not well-formed XML.
     */
    private function parseSitemap($content, $depth = 0) {
        // Collect libxml errors internally so malformed input does not print
        // PHP warnings into the page; failure is reported via the exception.
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($xml === false) {
            throw new Exception("无效的XML格式");
        }

        $urls = [];

        if (isset($xml->sitemap)) {
            // Sitemap index: stop descending once the depth cap is reached so a
            // self-referencing index cannot recurse forever.
            if ($depth >= self::MAX_INDEX_DEPTH) {
                return $urls;
            }
            foreach ($xml->sitemap as $sitemap) {
                $childLoc = trim((string)$sitemap->loc);
                if ($childLoc === '') {
                    continue;
                }
                try {
                    $subContent = $this->fetchSitemap($childLoc);
                    if ($subContent) {
                        foreach ($this->parseSitemap($subContent, $depth + 1) as $entry) {
                            $urls[] = $entry;
                        }
                    }
                } catch (Exception $e) {
                    // Deliberately ignore a broken child sitemap and keep going.
                    continue;
                }
            }
        } elseif (isset($xml->url)) {
            // Plain sitemap: one entry per <url> element.
            foreach ($xml->url as $url) {
                $urlData = ['loc' => (string)$url->loc];
                foreach (['lastmod', 'changefreq', 'priority'] as $field) {
                    if (isset($url->{$field})) {
                        $urlData[$field] = (string)$url->{$field};
                    }
                }
                $urls[] = $urlData;
            }
        }

        return $urls;
    }
}
/**
 * Downloads an RSS 2.0 or Atom feed and exposes its newest items.
 *
 * Each item is an associative array with 'loc', 'title' and 'timestamp' (int)
 * keys, plus 'description' when the feed provides one.
 */
class FeedParser {
    /** @var string URL of the feed handed to the constructor. */
    private $feedUrl;

    /**
     * @param string $feedUrl URL of the RSS/Atom feed to read.
     */
    public function __construct($feedUrl) {
        $this->feedUrl = $feedUrl;
    }

    /**
     * Fetch the feed and return its items, newest first.
     *
     * @param int $limit Maximum number of items to return.
     * @return array List of item arrays (see class description).
     * @throws Exception When the feed cannot be downloaded or is not RSS/Atom.
     */
    public function getLatestUrls($limit = 10) {
        $content = $this->fetchFeed();
        if (!$content) {
            throw new Exception("无法获取feed内容");
        }

        $urls = $this->parseFeed($content);

        usort($urls, function ($a, $b) {
            $stampA = isset($a['timestamp']) ? (int)$a['timestamp'] : 0;
            $stampB = isset($b['timestamp']) ? (int)$b['timestamp'] : 0;
            return $stampB <=> $stampA;
        });

        return array_slice($urls, 0, $limit);
    }

    /**
     * Download the feed document over HTTP(S).
     *
     * @return string|false Response body, or false on transport failure or a
     *                      non-200 final status.
     */
    private function fetchFeed() {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $this->feedUrl);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
        // The URL comes from user input: keep it (and any redirect) on HTTP(S).
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Feed Parser 1.0');
        $content = curl_exec($ch);
        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        curl_close($ch);

        if ($content === false || $httpCode !== 200) {
            return false;
        }
        return $content;
    }

    /**
     * Parse feed XML as RSS (root has a <channel>) or Atom (root is <feed> or
     * has <entry> children).
     *
     * @param string $content Raw XML.
     * @return array List of item arrays; empty for a valid feed with no items.
     * @throws Exception When the content is not XML or is neither RSS nor Atom.
     */
    private function parseFeed($content) {
        // Parse once, collecting libxml errors internally instead of using "@".
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        if ($xml === false) {
            throw new Exception("无效的Feed格式");
        }

        if (isset($xml->channel)) {
            return $this->parseRssItems($xml->channel);
        }
        if ($xml->getName() === 'feed' || isset($xml->entry)) {
            return $this->parseAtomEntries($xml);
        }

        throw new Exception("无效的Feed格式");
    }

    /**
     * Convert the <item> children of an RSS <channel> into item arrays.
     *
     * @param SimpleXMLElement $channel The <channel> element.
     * @return array
     */
    private function parseRssItems($channel) {
        $items = [];
        foreach ($channel->item as $item) {
            $urlData = [
                'loc' => (string)$item->link,
                'title' => (string)$item->title,
                'timestamp' => $this->firstTimestamp([(string)$item->pubDate]),
            ];
            if (isset($item->description)) {
                $urlData['description'] = (string)$item->description;
            }
            $items[] = $urlData;
        }
        return $items;
    }

    /**
     * Convert the <entry> children of an Atom <feed> into item arrays.
     *
     * @param SimpleXMLElement $feed The root <feed> element.
     * @return array
     */
    private function parseAtomEntries($feed) {
        $items = [];
        foreach ($feed->entry as $entry) {
            $urlData = [
                'loc' => $this->pickAtomLink($entry),
                'title' => (string)$entry->title,
                // Prefer <updated>, fall back to <published>.
                'timestamp' => $this->firstTimestamp([
                    (string)$entry->updated,
                    (string)$entry->published,
                ]),
            ];
            if (isset($entry->summary)) {
                $urlData['description'] = (string)$entry->summary;
            }
            $items[] = $urlData;
        }
        return $items;
    }

    /**
     * Choose the article URL among an Atom entry's <link> elements.
     *
     * A link with rel="alternate" (or no rel attribute) wins; otherwise the
     * first link carrying an href is used. This avoids returning rel="self" or
     * rel="edit" links when they happen to come first.
     *
     * @param SimpleXMLElement $entry The <entry> element.
     * @return string The chosen href, or '' when the entry has no usable link.
     */
    private function pickAtomLink($entry) {
        $fallback = '';
        foreach ($entry->link as $link) {
            $href = (string)$link['href'];
            if ($href === '') {
                continue;
            }
            $rel = (string)$link['rel'];
            if ($rel === '' || $rel === 'alternate') {
                return $href;
            }
            if ($fallback === '') {
                $fallback = $href;
            }
        }
        return $fallback;
    }

    /**
     * Return the first candidate date string that strtotime() can parse.
     *
     * @param array $candidates Date strings in order of preference ('' = absent).
     * @return int Unix timestamp; the current time when nothing is parsable, so
     *             the result is always an int and never false.
     */
    private function firstTimestamp(array $candidates) {
        foreach ($candidates as $raw) {
            $raw = trim((string)$raw);
            if ($raw === '') {
                continue;
            }
            $parsed = strtotime($raw);
            if ($parsed !== false) {
                return $parsed;
            }
        }
        return time();
    }
}
// Web entry point: GET renders the input form; POST either pushes previously
// extracted URLs (action=push) or runs a new extraction. Other methods do nothing.
$requestMethod = $_SERVER['REQUEST_METHOD'];
if ($requestMethod === 'POST') {
    $wantsPush = ($_POST['action'] ?? null) === 'push';
    if ($wantsPush) {
        handlePush();
    } else {
        handlePost();
    }
} elseif ($requestMethod === 'GET') {
    showForm();
}
/**
 * Print the landing page: a form with a source-type radio group
 * (sitemap / feed) and a URL field, submitted via POST to the same script.
 *
 * The markup is static, so it is kept in a nowdoc and echoed verbatim.
 */
function showForm() {
    $page = <<<'FORM_PAGE'
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Sitemap/Feed 最新URL提取工具</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
h1 { color: #333; text-align: center; }
.form-group { margin-bottom: 20px; }
label { display: block; margin-bottom: 5px; font-weight: bold; }
input[type="url"] { width: 100%; padding: 12px; border: 1px solid #ddd; border-radius: 4px; box-sizing: border-box; }
button { background-color: #007cba; color: white; padding: 12px 24px; border: none; border-radius: 4px; cursor: pointer; font-size: 16px; }
button:hover { background-color: #005a87; }
.result { margin-top: 30px; }
.url-item { padding: 10px; border-bottom: 1px solid #eee; }
.url-link { color: #007cba; text-decoration: none; }
.url-link:hover { text-decoration: underline; }
.lastmod { color: #666; font-size: 14px; margin-top: 5px; }
.error { color: #d63638; background: #fcf0f1; padding: 15px; border-radius: 4px; margin: 20px 0; }
.success { color: #008a20; background: #edfaef; padding: 15px; border-radius: 4px; margin: 20px 0; }
.push-section { margin-top: 30px; padding: 20px; background: #f9f9f9; border-radius: 4px; }
.push-form { margin-top: 15px; }
.push-form input[type="text"] { width: 100%; padding: 8px; margin: 10px 0; }
.push-button { background-color: #333; color: white; padding: 10px 20px; border: none; border-radius: 4px; cursor: pointer; }
.push-button:hover { background-color: #555; }
.push-result { margin-top: 20px; padding: 15px; border-radius: 4px; }
.push-success { background: #edfaef; color: #008a20; }
.push-error { background: #fcf0f1; color: #d63638; }
.source-type { margin-bottom: 15px; }
.source-type label { display: inline-block; margin-right: 20px; font-weight: normal; }
.source-type input { margin-right: 5px; }
.description { color: #666; font-size: 14px; margin-top: 5px; }
</style>
</head>
<body>
<div class="container">
<h1>Sitemap/Feed 最新URL提取工具</h1>
<form method="post">
<div class="source-type">
<label><input type="radio" name="source_type" value="sitemap" checked> Sitemap</label>
<label><input type="radio" name="source_type" value="feed"> RSS/Atom Feed</label>
</div>
<div class="form-group">
<label for="source_url">请输入地址:</label>
<input type="url" id="source_url" name="source_url" placeholder="https://example.com/sitemap.xml 或 https://example.com/feed" required>
</div>
<button type="submit">获取最新URL</button>
</form>
</div>
</body>
</html>
FORM_PAGE;

    // A nowdoc drops the line break before its closing marker; emit it
    // explicitly so the page still ends with a newline.
    echo $page, "\n";
}
/**
 * Handle the extraction form: read `source_type` and `source_url` from $_POST,
 * fetch the 10 newest URLs with SitemapParser or FeedParser, and print either
 * a result page (with a "push to Baidu" form) or an error page.
 *
 * An invalid/missing address prints an error banner followed by the input form.
 * All request-, feed- and server-derived values are HTML-escaped before output.
 */
function handlePost() {
    $esc = function ($value) {
        return htmlspecialchars((string)$value, ENT_QUOTES, 'UTF-8');
    };
    // True only for absolute http:// or https:// URLs with a host.
    $isHttpUrl = function ($candidate) {
        if (!is_string($candidate)) {
            return false;
        }
        $parts = parse_url($candidate);
        return is_array($parts)
            && isset($parts['scheme'], $parts['host'])
            && in_array(strtolower($parts['scheme']), ['http', 'https'], true);
    };

    // Collapse the posted type to one of two known strings; as before, anything
    // that is not exactly "sitemap" selects the feed parser.
    $sourceType = (($_POST['source_type'] ?? 'sitemap') === 'sitemap') ? 'sitemap' : 'feed';
    $sourceUrl = $_POST['source_url'] ?? '';
    $sourceUrl = is_string($sourceUrl) ? trim($sourceUrl) : '';

    // The server will fetch this URL, so only plain HTTP(S) addresses are accepted.
    if (!$isHttpUrl($sourceUrl)) {
        echo '<div class="error">请输入有效的地址</div>';
        showForm();
        return;
    }

    // PHP_SELF can contain attacker-supplied path info; escape it before it
    // is placed in an href attribute.
    $selfUrl = $esc($_SERVER['PHP_SELF'] ?? '');

    try {
        $parser = $sourceType === 'sitemap'
            ? new SitemapParser($sourceUrl)
            : new FeedParser($sourceUrl);
        $latestUrls = $parser->getLatestUrls(10);
    } catch (Exception $e) {
?>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>错误 - 最新URL提取工具</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
.back-link { margin-bottom: 20px; }
a { color: #007cba; text-decoration: none; }
a:hover { text-decoration: underline; }
.error { color: #d63638; background: #fcf0f1; padding: 15px; border-radius: 4px; }
</style>
</head>
<body>
<div class="container">
<div class="back-link">
<a href="<?php echo $selfUrl; ?>">&larr; 返回重试</a>
</div>
<div class="error">
<strong>处理过程中发生错误:</strong><br>
<?php echo $esc($e->getMessage()); ?>
</div>
</div>
</body>
</html>
<?php
        return;
    }

    // Only HTTP(S) locations are offered for pushing (and rendered as links).
    $pushable = [];
    foreach ($latestUrls as $entry) {
        $candidate = (string)($entry['loc'] ?? '');
        if ($isHttpUrl($candidate)) {
            $pushable[] = $candidate;
        }
    }
?>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>结果 - 最新URL提取工具</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
h1 { color: #333; text-align: center; }
.back-link { margin-bottom: 20px; }
a { color: #007cba; text-decoration: none; }
a:hover { text-decoration: underline; }
.result { margin-top: 20px; }
.url-item { padding: 15px; border-bottom: 1px solid #eee; }
.url-item:last-child { border-bottom: none; }
.url-link { color: #007cba; text-decoration: none; font-weight: bold; }
.url-link:hover { text-decoration: underline; }
.lastmod { color: #666; font-size: 14px; margin-top: 5px; }
.no-results { color: #666; text-align: center; padding: 20px; }
.push-section { margin-top: 30px; padding: 20px; background: #f9f9f9; border-radius: 4px; }
.push-form { margin-top: 15px; }
.push-form input[type="text"] { width: 100%; padding: 8px; margin: 10px 0; }
.push-button { background-color: #333; color: white; padding: 10px 20px; border: none; border-radius: 4px; cursor: pointer; }
.push-button:hover { background-color: #555; }
.push-result { margin-top: 20px; padding: 15px; border-radius: 4px; }
.push-success { background: #edfaef; color: #008a20; }
.push-error { background: #fcf0f1; color: #d63638; }
.source-info { background: #eef7fa; padding: 10px; border-radius: 4px; margin-bottom: 20px; }
.description { color: #666; font-size: 14px; margin-top: 5px; }
</style>
</head>
<body>
<div class="container">
<div class="back-link">
<a href="<?php echo $selfUrl; ?>">&larr; 返回</a>
</div>
<h1>最新更新的URL</h1>
<div class="source-info">
来源类型: <?php echo $sourceType === 'sitemap' ? 'Sitemap' : 'RSS/Atom Feed'; ?><br>
来源地址: <?php echo $esc($sourceUrl); ?>
</div>
<div class="result">
<?php if (empty($latestUrls)): ?>
<div class="no-results">未找到任何URL</div>
<?php else: ?>
<?php foreach ($latestUrls as $url):
    $loc = (string)($url['loc'] ?? '');
    // Feeds may carry an empty <title>; fall back to the URL so the row is never blank.
    $title = isset($url['title']) ? trim((string)$url['title']) : '';
    $label = $title !== '' ? $title : $loc;

    $dateLine = '';
    if (isset($url['lastmod'])) {
        $modified = strtotime($url['lastmod']);
        // Show the raw value when it cannot be parsed rather than a bogus 1970 date.
        $dateLine = '最后修改时间: ' . ($modified === false ? $url['lastmod'] : date('Y-m-d H:i:s', $modified));
    } elseif (isset($url['timestamp']) && is_int($url['timestamp'])) {
        $dateLine = '发布时间: ' . date('Y-m-d H:i:s', $url['timestamp']);
    }

    $summary = '';
    if (isset($url['description'])) {
        $plain = trim(strip_tags((string)$url['description']));
        // Only add the ellipsis when text was actually cut off.
        $summary = mb_strlen($plain) > 150 ? mb_substr($plain, 0, 150) . '...' : $plain;
    }
?>
<div class="url-item">
<div>
<?php if ($isHttpUrl($loc)): ?>
<a href="<?php echo $esc($loc); ?>" target="_blank" rel="noopener noreferrer" class="url-link">
<?php echo $esc($label); ?>
</a>
<?php else: ?>
<span class="url-link"><?php echo $esc($label); ?></span>
<?php endif; ?>
</div>
<?php if ($dateLine !== ''): ?>
<div class="lastmod">
<?php echo $esc($dateLine); ?>
</div>
<?php endif; ?>
<?php if ($summary !== ''): ?>
<div class="description">
<?php echo $esc($summary); ?>
</div>
<?php endif; ?>
</div>
<?php endforeach; ?>
<?php endif; ?>
</div>
<?php if (!empty($pushable)): ?>
<div class="push-section">
<h3>推送到百度</h3>
<p>是否要将以上URL推送到百度站长平台?</p>
<form method="post" class="push-form">
<input type="hidden" name="action" value="push">
<input type="hidden" name="source_type" value="<?php echo $esc($sourceType); ?>">
<?php foreach ($pushable as $pushUrl): ?>
<input type="hidden" name="urls[]" value="<?php echo $esc($pushUrl); ?>">
<?php endforeach; ?>
<div class="form-group">
<label for="baidu_token">百度推送密钥:</label>
<input type="text" id="baidu_token" name="baidu_token" placeholder="请输入百度站长平台的推送密钥" required>
</div>
<button type="submit" class="push-button">推送到百度</button>
</form>
</div>
<?php endif; ?>
</div>
</body>
</html>
<?php
}
/**
 * Handle the "push to Baidu" form: read `urls[]`, `baidu_token` and
 * `source_type` from $_POST, submit the URLs (newline-separated, text/plain)
 * to http://data.zz.baidu.com/urls and print a result page.
 *
 * The `site` query parameter is derived from the scheme and host of the first
 * accepted URL. A missing token or an empty URL list prints a bare error
 * banner and returns without contacting the API.
 */
function handlePush() {
    $esc = function ($value) {
        return htmlspecialchars((string)$value, ENT_QUOTES, 'UTF-8');
    };
    // Render a decoded JSON value safely; non-scalars become "N/A".
    $show = function ($value) use ($esc) {
        return is_scalar($value) ? $esc($value) : 'N/A';
    };

    $token = $_POST['baidu_token'] ?? '';
    $token = is_string($token) ? trim($token) : '';
    $sourceType = (($_POST['source_type'] ?? 'sitemap') === 'sitemap') ? 'sitemap' : 'feed';

    // Keep only well-formed http(s) URLs. The request body is newline-delimited,
    // so values with embedded line breaks are dropped rather than letting them
    // smuggle extra URLs into the push.
    $rawUrls = $_POST['urls'] ?? [];
    $urls = [];
    $site = '';
    if (is_array($rawUrls)) {
        foreach ($rawUrls as $candidate) {
            if (!is_string($candidate)) {
                continue;
            }
            $candidate = trim($candidate);
            if ($candidate === '' || preg_match('/[\r\n]/', $candidate)) {
                continue;
            }
            $parts = parse_url($candidate);
            if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
                continue;
            }
            $scheme = strtolower($parts['scheme']);
            if ($scheme !== 'http' && $scheme !== 'https') {
                continue;
            }
            if ($site === '') {
                // Site identifier = scheme://host of the first accepted URL.
                $site = $scheme . '://' . $parts['host'];
            }
            $urls[] = $candidate;
        }
    }

    if ($token === '') {
        echo '<div class="error">请输入百度推送密钥</div>';
        return;
    }
    if (empty($urls)) {
        echo '<div class="error">没有可推送的URL</div>';
        return;
    }

    // Build the query string with proper encoding so neither value can inject
    // additional parameters.
    $api = 'http://data.zz.baidu.com/urls?' . http_build_query(
        ['site' => $site, 'token' => $token],
        '',
        '&'
    );

    $ch = curl_init();
    curl_setopt_array($ch, [
        CURLOPT_URL => $api,
        CURLOPT_POST => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POSTFIELDS => implode("\n", $urls),
        CURLOPT_HTTPHEADER => ['Content-Type: text/plain'],
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT => 30,
    ]);
    $result = curl_exec($ch);
    $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    $response = is_string($result) ? json_decode($result, true) : null;
    // An error object in the body is reported even on non-200 responses so its
    // code and message are not lost.
    $apiError = is_array($response) && isset($response['error']);
    $transportFailed = !$apiError && ($httpCode !== 200 || !is_array($response));
    $selfUrl = $esc($_SERVER['PHP_SELF'] ?? '');
?>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>推送结果 - 最新URL提取工具</title>
<style>
body { font-family: Arial, sans-serif; margin: 40px; background-color: #f5f5f5; }
.container { max-width: 800px; margin: 0 auto; background: white; padding: 30px; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
h1 { color: #333; text-align: center; }
.back-link { margin-bottom: 20px; }
a { color: #007cba; text-decoration: none; }
a:hover { text-decoration: underline; }
.push-result { margin-top: 20px; padding: 15px; border-radius: 4px; }
.push-success { background: #edfaef; color: #008a20; }
.push-error { background: #fcf0f1; color: #d63638; }
.url-list { margin-top: 20px; }
.url-item { padding: 5px 0; border-bottom: 1px solid #eee; }
.source-info { background: #eef7fa; padding: 10px; border-radius: 4px; margin-bottom: 20px; }
</style>
</head>
<body>
<div class="container">
<div class="back-link">
<a href="<?php echo $selfUrl; ?>">&larr; 返回首页</a>
</div>
<h1>百度推送结果</h1>
<div class="source-info">
来源类型: <?php echo $sourceType === 'sitemap' ? 'Sitemap' : 'RSS/Atom Feed'; ?>
</div>
<?php if ($apiError): ?>
<div class="push-result push-error">
<strong>推送失败:</strong><br>
错误代码: <?php echo $show($response['error']); ?><br>
错误信息: <?php echo $show($response['message'] ?? ''); ?>
</div>
<?php elseif ($transportFailed): ?>
<div class="push-result push-error">
<strong>推送失败:</strong><br>
HTTP状态码: <?php echo $esc($httpCode); ?>
</div>
<?php else: ?>
<div class="push-result push-success">
<strong>推送成功!</strong><br>
成功推送: <?php echo $show($response['success'] ?? 'N/A'); ?> 条URL<br>
剩余配额: <?php echo $show($response['remain'] ?? 'N/A'); ?>
</div>
<?php endif; ?>
<div class="url-list">
<h3>推送的URL列表:</h3>
<?php foreach ($urls as $url): ?>
<div class="url-item"><?php echo $esc($url); ?></div>
<?php endforeach; ?>
</div>
</div>
</body>
</html>
<?php
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment