Skip to content

Instantly share code, notes, and snippets.

@aaronpk
Created December 20, 2016 01:47
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aaronpk/786c84682056bcbb5ebdd3d4932d9199 to your computer and use it in GitHub Desktop.
Save aaronpk/786c84682056bcbb5ebdd3d4932d9199 to your computer and use it in GitHub Desktop.
Given an input URL, find the canonical URL after following redirects and looking at rel=canonical
<?php
// Given an input URL, find the canonical URL, after following redirects and
// looking for rel=canonical in the source HTML.
// Without a usable ?url= parameter (missing, not a string, or blank), render a
// minimal input form and stop.
if(!isset($_GET['url']) || !is_string($_GET['url']) || trim($_GET['url']) === '') {
?>
<form action="" method="get">
<input type="url" name="url">
<input type="submit" value="Go">
</form>
<?php
    // Full "<?php" open tag above: the short "<?" form breaks when short_open_tag is disabled.
    die();
}
header('Content-type: text/plain');

$url = trim($_GET['url']);
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
// The URL is untrusted input: only allow HTTP(S), both for the first request and
// for every redirect, so it cannot be used to read file:// or reach other schemes.
// NOTE(review): internal/private hosts are still reachable over HTTP; add host
// filtering if this script is exposed publicly.
curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
// Do not let a slow or unresponsive site hang the request indefinitely
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
// Some sites don't like crawlers, so pretend to be a browser
curl_setopt($ch, CURLOPT_HTTPHEADER, [
    'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
]);
$body = curl_exec($ch);

// Prefer the URL we ended up at after redirects
$final_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
if($final_url) {
    $url = $final_url;
}

// Check for rel=canonical
if($body) {
    $dom = load_html($body);
    if($dom) {
        // Relative canonical hrefs are resolved against the fetched page's URL,
        // not against a canonical found earlier in the same document.
        $base = $url;
        foreach($dom->getElementsByTagName('link') as $link) {
            if(!$link->hasAttribute('rel')) {
                continue;
            }
            // rel is a whitespace-separated token list; compare tokens case-insensitively
            $rels = preg_split('/\s+/', strtolower(trim($link->getAttribute('rel')))) ?: [];
            $href = trim($link->getAttribute('href'));
            // Ignore canonical links without a usable href instead of blanking the result
            if($href !== '' && in_array('canonical', $rels, true)) {
                $url = resolve_url($base, $href);
            }
        }
    }
}
echo $url."\n";

/**
 * Resolve a possibly-relative URL reference against an absolute base URL
 * (a simplified form of the RFC 3986 section 5.2 algorithm).
 *
 * @param string $base Absolute URL of the document the reference was found in.
 * @param string $ref  The reference (absolute, scheme-relative, path, query or fragment).
 * @return string The absolute URL, or $ref unchanged when $base cannot be parsed.
 */
function resolve_url($base, $ref) {
    if($ref === '') {
        return $base;
    }
    // A reference that carries its own scheme is already absolute
    if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $ref)) {
        return $ref;
    }
    $b = parse_url($base);
    if(!$b || !isset($b['scheme']) || !isset($b['host'])) {
        return $ref;
    }
    // Scheme-relative reference: //host/path
    if(substr($ref, 0, 2) === '//') {
        return $b['scheme'] . ':' . $ref;
    }
    $origin = $b['scheme'] . '://' . $b['host'] . (isset($b['port']) ? ':' . $b['port'] : '');
    $basePath = isset($b['path']) ? $b['path'] : '/';
    if($ref[0] === '#') {
        return $origin . $basePath . (isset($b['query']) ? '?' . $b['query'] : '') . $ref;
    }
    if($ref[0] === '?') {
        return $origin . $basePath . $ref;
    }
    // Split off any query/fragment so dot-segment removal only touches the path
    $cut = strcspn($ref, '?#');
    $path = substr($ref, 0, $cut);
    $suffix = (string) substr($ref, $cut);
    if($ref[0] !== '/') {
        // Relative path: it replaces the last segment of the base path
        $path = substr($basePath, 0, strrpos($basePath, '/') + 1) . $path;
    }
    // Remove "." and ".." segments; index 0 is the empty root segment and is never popped
    $segments = explode('/', $path);
    $out = [];
    foreach($segments as $seg) {
        if($seg === '..') {
            if(count($out) > 1) {
                array_pop($out);
            }
        } elseif($seg !== '.') {
            $out[] = $seg;
        }
    }
    // A trailing "." or ".." refers to a directory, so keep the trailing slash
    $last = end($segments);
    if($last === '.' || $last === '..') {
        $out[] = '';
    }
    return $origin . implode('/', $out) . $suffix;
}
/**
 * Parse an HTML string into a DOMDocument, forcing it to be read as UTF-8.
 *
 * Parse errors from malformed markup are collected internally and discarded
 * rather than emitted as PHP warnings. The caller's libxml error-handling mode
 * is restored before returning.
 *
 * @param string $html Raw HTML markup.
 * @return DOMDocument|false The parsed document, or false if the markup could not be loaded.
 */
function load_html($html) {
    $dom = new DOMDocument;
    // Route libxml errors to its internal buffer, remembering the previous setting
    $previous = libxml_use_internal_errors(true);
    // Force interpreting this as UTF-8
    $loaded = $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_NOWARNING|LIBXML_NOERROR);
    libxml_clear_errors();
    // Do not leave the process-wide libxml setting changed for other code
    libxml_use_internal_errors($previous);
    return $loaded ? $dom : false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment