Skip to content

Instantly share code, notes, and snippets.

@mducharme
Created November 24, 2017 15:05
Show Gist options
  • Save mducharme/1fb301b955c2eba16471c30f7b43a5d9 to your computer and use it in GitHub Desktop.
Save mducharme/1fb301b955c2eba16471c30f7b43a5d9 to your computer and use it in GitHub Desktop.
<?php
namespace Charcoal\Admin\Script\Tools;
use Exception;
use Psr\Http\Message\RequestInterface;
use Psr\Http\Message\ResponseInterface;
use Pimple\Container;
use Goutte\Client as GoutteClient;
use GuzzleHttp\Client as GuzzleClient;
use GuzzleHttp\TransferStats;
use Charcoal\Admin\AdminScript;
/**
 * Admin script that crawls a website and reports the HTTP status of every link it finds.
 *
 * Starting from the `url` argument, every `<a href>` found on a page is requested once and
 * its status code is printed. Links that stay on the start URL's host are crawled
 * recursively, up to `max-level` levels deep.
 */
class CheckLinksScript extends AdminScript
{
    /**
     * The start URL, normalized to end with exactly one slash.
     *
     * @var string
     */
    private $startUrl;

    /**
     * Components of the start URL, as returned by `parse_url()` (empty array if unparsable).
     *
     * @var array
     */
    private $parsedStartUrl = [];

    /**
     * Maximum recursion depth for crawling internal pages.
     *
     * @var integer
     */
    private $maxLevel;

    /**
     * URLs that were already checked, stored as keys for constant-time lookup.
     *
     * @var array
     */
    private $processedUrls = [];

    /**
     * Pages whose links were already extracted, mapped to the shallowest level they were crawled at.
     *
     * @var array
     */
    private $crawledUrls = [];

    /**
     * @var GuzzleClient
     */
    private $guzzleClient;

    /**
     * @var GoutteClient
     */
    private $goutteClient;

    /**
     * Create the HTTP clients; the Goutte crawler is set to use the same Guzzle client.
     *
     * @param array|\ArrayAccess|null $data Forwarded untouched to the parent constructor.
     */
    public function __construct($data = null)
    {
        parent::__construct($data);

        $this->guzzleClient = new GuzzleClient();
        $this->goutteClient = new GoutteClient();
        $this->goutteClient->setClient($this->guzzleClient);
    }

    /**
     * Command-line arguments supported by this script, merged over the parent's arguments.
     *
     * @return array
     */
    public function defaultArguments()
    {
        $arguments = [
            'url' => [
                'longPrefix'   => 'url',
                'description'  => 'URL where the link check starts.',
                'defaultValue' => $this->baseUrl()
            ],
            // Not read by this script; kept so existing invocations passing it keep working.
            'output-dir' => [
                'longPrefix'   => 'output-dir',
                'description'  => 'Output path (relative) where the static files will be stored.',
                'defaultValue' => 'www/static/'
            ],
            'max-level' => [
                'longPrefix'   => 'max-level',
                'description'  => 'Maximum recursive level.',
                'defaultValue' => 2
            ]
        ];

        return array_merge(parent::defaultArguments(), $arguments);
    }

    /**
     * Parse the arguments, check the start URL, then crawl it.
     *
     * @param  RequestInterface  $request  Unused.
     * @param  ResponseInterface $response Returned unchanged.
     * @return ResponseInterface
     */
    public function run(RequestInterface $request, ResponseInterface $response)
    {
        unset($request);

        $climate = $this->climate();
        $climate->arguments->parse();

        $this->startUrl = rtrim((string)$climate->arguments->get('url'), '/').'/';
        $this->maxLevel = (int)$climate->arguments->get('max-level');

        // Needed by isInternalLink() and absoluteLink(); parse_url() returns false on malformed input.
        $parsed = parse_url($this->startUrl);
        $this->parsedStartUrl = is_array($parsed) ? $parsed : [];

        $climate->underline()->out(
            sprintf('Check Broken Links ("%s")', $this->startUrl)
        );

        if (!$this->isHttpUrl($this->startUrl)) {
            // Without a scheme and host, relative links cannot be resolved nor classified as internal.
            $climate->error(sprintf('The start URL must be an absolute http(s) URL ("%s" given).', $this->startUrl));
            return $response;
        }

        $this->checkUrl($this->startUrl);
        $this->retrieveLinks($this->startUrl, 0);

        return $response;
    }

    /**
     * Request a URL once and print its status code and transfer time.
     *
     * Links that are not http(s) (mailto:, tel:, javascript:, ...) and URLs that were
     * already checked are silently skipped.
     *
     * @param  string $url The absolute or relative URL to check.
     * @return void
     */
    private function checkUrl($url)
    {
        $url = $this->absoluteLink($url);
        if (!$this->isHttpUrl($url) || isset($this->processedUrls[$url])) {
            return;
        }
        $this->processedUrls[$url] = true;

        try {
            $this->guzzleClient->request('GET', $url, [
                // 4xx/5xx responses are already reported by the stats callback;
                // letting Guzzle throw on them would print every broken link twice.
                'http_errors' => false,
                'on_stats'    => function (TransferStats $stats) {
                    $this->reportStats($stats);
                }
            ]);
        } catch (Exception $e) {
            // Transport-level failures (DNS, connection refused, timeout, ...).
            $this->climate()->error($e->getMessage().' - '.$url);
        }
    }

    /**
     * Print one line for a completed transfer, styled according to the status code.
     *
     * @param  TransferStats $stats The transfer statistics handed over by Guzzle.
     * @return void
     */
    private function reportStats(TransferStats $stats)
    {
        if (!$stats->hasResponse()) {
            return;
        }

        $code = $stats->getResponse()->getStatusCode();
        $line = sprintf(
            '[%s] %s - %sms',
            $code,
            $stats->getEffectiveUri(),
            number_format(1000 * $stats->getTransferTime(), 0)
        );

        $climate = $this->climate();
        if ($code >= 400) {
            $climate->error($line);
        } elseif ($code > 200) {
            $climate->orange($line);
        } else {
            $climate->out($line);
        }
    }

    /**
     * Check every link found on a page and recurse into the internal ones.
     *
     * @param  string  $url   The URL to retrieve links from.
     * @param  integer $level The current level.
     * @return void
     */
    private function retrieveLinks($url, $level)
    {
        $url = $this->absoluteLink($url);

        // Skip pages already crawled at this depth or shallower. A page first reached deep in the
        // tree is crawled again when found closer to the start, so its children still get visited.
        if (isset($this->crawledUrls[$url]) && $this->crawledUrls[$url] <= $level) {
            return;
        }
        $this->crawledUrls[$url] = $level;

        try {
            $crawler = $this->goutteClient->request('GET', $url);
        } catch (Exception $e) {
            // One unreachable page must not abort the whole crawl.
            $this->climate()->error($e->getMessage().' - '.$url);
            return;
        }

        $canRecurse = ($level < $this->maxLevel);
        $nextLevel  = ($level + 1);

        $crawler->filter('a')->each(function ($item) use ($canRecurse, $nextLevel) {
            $href = $item->attr('href');
            if ($href === null) {
                // Anchor without href (e.g. a named anchor): nothing to check.
                return;
            }

            $link = $this->absoluteLink($href);
            $this->checkUrl($link);

            if ($canRecurse && $this->isInternalLink($link)) {
                $this->retrieveLinks($link, $nextLevel);
            }
        });
    }

    /**
     * Whether a link points to the same host as the start URL.
     *
     * @param  string $url The absolute or relative URL to test.
     * @return boolean
     */
    private function isInternalLink($url)
    {
        $url = $this->absoluteLink($url);
        if (!$this->isHttpUrl($url) || !isset($this->parsedStartUrl['host'])) {
            return false;
        }

        $host = parse_url($url, PHP_URL_HOST);
        if (!is_string($host)) {
            return false;
        }

        // Host names are case-insensitive.
        return (strcasecmp($host, $this->parsedStartUrl['host']) === 0);
    }

    /**
     * Whether a URL is absolute and uses the http or https scheme.
     *
     * @param  string $url The URL to test.
     * @return boolean
     */
    private function isHttpUrl($url)
    {
        return (preg_match('#^https?://#i', (string)$url) === 1);
    }

    /**
     * Turn a link into an absolute URL, without its fragment.
     *
     * URLs that already have a scheme are returned as-is (minus the fragment), protocol-relative
     * URLs ("//host/path") inherit the start URL's scheme, and anything else is appended to the
     * start URL.
     *
     * NOTE(review): relative links are resolved against the start URL, not against the page they
     * were found on.
     *
     * @param  string $url The absolute or relative URL.
     * @return string
     */
    private function absoluteLink($url)
    {
        $url = trim((string)$url);

        // The fragment is never sent to the server; dropping it avoids re-checking the same page.
        $fragmentPos = strpos($url, '#');
        if ($fragmentPos !== false) {
            $url = (string)substr($url, 0, $fragmentPos);
        }

        // Anything starting with "scheme:" is already absolute (http:, https:, mailto:, tel:, ...).
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url) === 1) {
            return $url;
        }

        if (strpos($url, '//') === 0) {
            $scheme = isset($this->parsedStartUrl['scheme']) ? $this->parsedStartUrl['scheme'] : 'http';
            return $scheme.':'.$url;
        }

        return $this->startUrl.ltrim($url, '/');
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment