Skip to content

Instantly share code, notes, and snippets.

@madeingnecca
Last active February 5, 2020 22:06
Show Gist options
  • Save madeingnecca/cdae7df746897f73d02325fb44a85be9 to your computer and use it in GitHub Desktop.
Save madeingnecca/cdae7df746897f73d02325fb44a85be9 to your computer and use it in GitHub Desktop.
Crawl a sitemap.xml file using PHP
<?php
// Guard: the script reads its input from $argv, so refuse to run when PHP
// has not defined it (normally the case outside the command line).
if (!isset($argv)) {
    die("This script only works in cli mode.\n");
}

// Guard: the site URL is a required, non-empty first argument. A usage error
// is reported on STDERR with a non-zero exit status so that calling shells
// and scripts can detect the failure (die() with a string exits with 0).
if (!isset($argv[1]) || $argv[1] === '') {
    fwrite(STDERR, "Usage: " . $argv[0] . " <SITE_URL>\n");
    exit(1);
}
/**
 * Issue a single HTTP request through cURL.
 *
 * Redirects are followed, the response is captured instead of printed, a
 * desktop Chrome user agent is sent, and SSL peer verification is turned off.
 *
 * @param string   $url     Address to request.
 * @param string   $method  'HEAD' sends a body-less HEAD request; any other
 *                          value leaves cURL's default request method in place.
 * @param int|null $timeout Maximum transfer time in seconds; NULL sets no limit.
 *
 * @return array 'code' holds the response code reported by cURL, 'body' holds
 *               the raw return value of curl_exec().
 */
function http_request($url, $method = 'GET', $timeout = NULL) {
    // Collect the options first; they are applied below in insertion order.
    $options = [CURLOPT_URL => $url];

    if ($timeout !== NULL) {
        $options[CURLOPT_TIMEOUT] = $timeout;
    }

    if ($method === 'HEAD') {
        $options[CURLOPT_CUSTOMREQUEST] = 'HEAD';
        $options[CURLOPT_NOBODY] = TRUE;
    }

    $options[CURLOPT_RETURNTRANSFER] = TRUE;
    $options[CURLOPT_FOLLOWLOCATION] = TRUE;
    $options[CURLOPT_SSL_VERIFYPEER] = FALSE;
    $options[CURLOPT_USERAGENT] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36';

    $handle = curl_init();
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $payload = curl_exec($handle);
    $status = curl_getinfo($handle, CURLINFO_RESPONSE_CODE);
    curl_close($handle);

    return [
        'code' => $status,
        'body' => $payload,
    ];
}
// Entry point: build the sitemap location from the site URL given on the
// command line. Trailing slashes are trimmed first so that an argument such as
// "https://example.com/" does not produce "https://example.com//sitemap.xml".
$sitemap_url = rtrim($argv[1], '/') . '/sitemap.xml';
fetch_sitemap($sitemap_url);
/**
 * Crawl one sitemap: recurse into any sub-sitemaps it lists, then send a HEAD
 * request to every URL it lists and print the result.
 *
 * Prints one progress line for the sitemap itself and one line per URL with
 * the request method, response code and elapsed whole seconds. The script is
 * terminated with die() when the sitemap is not answered with a 200 or its
 * body cannot be parsed as XML.
 *
 * @param string $url            Sitemap (or sitemap index) URL to fetch.
 * @param int    $sitemap_index  Zero-based position of this sitemap among its
 *                               siblings; used only for the progress line.
 * @param int    $total_sitemaps Number of sibling sitemaps; used only for the
 *                               progress line.
 */
function fetch_sitemap($url, $sitemap_index = 0, $total_sitemaps = 1) {
    echo "Visiting sitemap " . ($sitemap_index + 1) . "/" . $total_sitemaps . " at $url\n";

    $sitemap_response = http_request($url, 'GET');
    if ($sitemap_response['code'] != 200) {
        die('GET ' . $sitemap_response['code'] . ' ' . $url . "\n");
    }

    // Report an unparsable body the same way as an HTTP failure instead of
    // letting SimpleXMLElement's exception surface as an uncaught fatal error.
    $body = (string) $sitemap_response['body'];
    if ($body === '') {
        die('Empty sitemap response at ' . $url . "\n");
    }
    try {
        $xml = new SimpleXMLElement($body);
    } catch (Exception $e) {
        die('Invalid sitemap XML at ' . $url . ': ' . $e->getMessage() . "\n");
    }

    // A sitemap index lists <sitemap> children; visit each of them, passing
    // the child's own position so the progress line reads 1/N, 2/N, ...
    $sitemaps = $xml->sitemap;
    $subsitemap_count = count($sitemaps);
    $subsitemap_index = 0;
    foreach ($sitemaps as $sitemap) {
        fetch_sitemap((string) $sitemap->loc, $subsitemap_index, $subsitemap_count);
        $subsitemap_index++;
    }

    // Gather the page URLs listed directly in this sitemap, in sorted order.
    $urls = array();
    foreach ($xml->url as $url_node) {
        $urls[] = (string) $url_node->loc;
    }
    sort($urls);

    $method = 'HEAD';
    $url_total = count($urls);
    foreach ($urls as $url_index => $page_url) {
        $time_start = time();
        // Each page check is capped at 10 seconds.
        $response = http_request($page_url, $method, 10);
        $time_end = time();
        echo ($url_index + 1) . '/' . $url_total . ' ' . $method . ' ' . $response['code'] . ' (' . ($time_end - $time_start) . 's) ' . $page_url . "\n";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment