Skip to content

Instantly share code, notes, and snippets.

@gtrias
Forked from joesexton00/Crawler
Created July 26, 2014 22:28
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save gtrias/c2d5af1da4b2672998c1 to your computer and use it in GitHub Desktop.
<?php
namespace Acme\Bundle\Command;
use Symfony\Bundle\FrameworkBundle\Command\ContainerAwareCommand;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\HttpKernel\Client;
use Symfony\Component\BrowserKit\Cookie;
use Symfony\Component\Security\Core\Authentication\Token\UsernamePasswordToken;
/**
* This class crawls the Acme site
*
* @author Joe Sexton <joe@webtipblog.com>
*/
class SiteCrawlerCommand extends ContainerAwareCommand
{
/**
* Console output handle, stored by execute() so helper methods can log progress.
*
* @var OutputInterface
*/
protected $output;
/**
* The "router" service fetched from the container in execute(); used to
* match link paths to route names.
*
* @var Router
*/
protected $router;
/**
* The "doctrine.orm.entity_manager" service fetched from the container in
* execute(); used to load the user to authenticate as.
*
* @var EntityManager
*/
protected $entityManager;
/**
* Host component of the starting link (parse_url() with PHP_URL_HOST);
* links whose host differs are treated as outbound and discarded.
*
* @var string
*/
protected $domain = null;
/**
* Value of the "username" argument; the account the crawler logs in as.
*
* @var string
*/
protected $username = null;
/**
* Value of the "security-firewall" option; used as the token's provider key
* and as the suffix of the "_security_" session key.
*
* @var string
*/
protected $securityFirewall = null;
/**
* Value of the "limit" option; bounds the crawl loop in execute().
*
* @var integer
*/
protected $searchLimit;
/**
* index routes containing these keywords only once
*
* Values of the "ignore-duplicate-keyword" option. Each value is wrapped in
* "/" delimiters and matched against route names with preg_match().
*
* @var array
*/
protected $ignoredRouteKeywords;
/**
* Every in-domain link discovered so far, keyed by URL; each value is an
* array holding the matched route name under the "route" key.
* Declared with a null default.
*
* @var array
*/
protected $domainLinks = null;
/**
* URLs that have been discovered but not yet requested; execute() takes
* them off the end with array_pop(). Declared with a null default.
*
* @var array
*/
protected $linksToProcess = null;
/**
* Register the command's name, description, input definition and help text.
*
* Arguments: "startingLink" and "username" (both required).
* Options: "limit" (default 20), "security-firewall" (default
* "default_firewall") and the repeatable "ignore-duplicate-keyword".
*
* @author Joe Sexton <joe@webtipblog.com>
*/
protected function configure()
{
    $arguments = array(
        new InputArgument( 'startingLink', InputArgument::REQUIRED, 'Link to start crawling' ),
        new InputArgument( 'username', InputArgument::REQUIRED, 'Username' ),
    );

    $options = array(
        new InputOption( 'limit', null, InputOption::VALUE_REQUIRED, 'Limit the number of links to process, prevents infinite crawling', 20 ),
        new InputOption( 'security-firewall', null, InputOption::VALUE_REQUIRED, 'Firewall name', 'default_firewall' ),
        new InputOption( 'ignore-duplicate-keyword', null, InputOption::VALUE_IS_ARRAY|InputOption::VALUE_REQUIRED, 'Index routes containing this keyword only one time (prevents infinite crawling of routes containng query parameters)', array() ),
    );

    // the heredoc body and its closing marker stay at column 0 so the help text is unchanged
    $helpText = <<<EOT
The <info>crawler:crawl</info> command crawls the Acme website:
<info>php app/console crawler:crawl <startingLink> <username></info>
EOT;

    $this->setName( 'crawler:crawl' );
    $this->setDescription( 'Crawls the Acme website.' );
    // arguments first, then options, in the same order as before
    $this->setDefinition( array_merge( $arguments, $options ) );
    $this->setHelp( $helpText );
}
/**
* Crawl the site and print every in-domain link that was found.
*
* Boots a "test" environment kernel, authenticates its test client as the
* requested user, requests the starting link and then keeps requesting
* queued links until the queue is empty or "limit" pages have been crawled
* (the starting page counts as the first page and is always requested).
*
* @author Joe Sexton <joe@webtipblog.com>
* @param InputInterface $input
* @param OutputInterface $output
* @todo use product sitemap to crawl product pages
*/
protected function execute( InputInterface $input, OutputInterface $output )
{
    // user input
    $startingLink = $input->getArgument( 'startingLink' );
    $this->domain = parse_url( $startingLink, PHP_URL_HOST );
    $this->username = $input->getArgument( 'username' );
    // options arrive as strings; compare page counts as integers
    $this->searchLimit = (int) $input->getOption( 'limit' );
    $this->securityFirewall = $input->getOption( 'security-firewall' );
    $this->ignoredRouteKeywords = $input->getOption( 'ignore-duplicate-keyword' );
    $this->output = $output;
    $this->router = $this->getContainer()->get( 'router' );
    $this->entityManager = $this->getContainer()->get( 'doctrine.orm.entity_manager' );

    // start from empty collections so the report loop below always iterates
    // an array, even when the starting page contains no in-domain links
    $this->domainLinks = array();
    $this->linksToProcess = array();

    // start
    $output->writeln('
<info>A super-duper web crawler written by:
___ _____ _
|_ | / ___| | |
| | ___ ___ \ `--. _____ _| |_ ___ _ __
| |/ _ \ / _ \ `--. \/ _ \ \/ / __/ _ \| |_ \
/\__/ / (_) | __/ /\__/ / __/> <| || (_) | | | |
\____/ \___/ \___| \____/ \___/_/\_\\__\___/|_| |_|
</info>');

    // config
    $kernel = $this->_createKernel();
    $client = $kernel->getContainer()->get( 'test.client' );
    $this->_authenticate( $kernel, $client );

    // start crawling
    $output->writeln( sprintf( 'Dominating <comment>%s</comment>, starting at <comment>%s</comment>. At most, <comment>%s</comment> pages will be crawled.', $this->domain, $startingLink, $this->searchLimit ) );

    // crawl starting link
    $crawler = $this->_requestFollowingRedirects( $client, $startingLink );
    $this->_processLinksOnPage( $crawler, $startingLink );
    $pagesCrawled = 1;

    // crawl links found; the counter is only advanced after a page has
    // actually been requested, so the final report is exact
    while ( ! empty( $this->linksToProcess ) && $pagesCrawled < $this->searchLimit ) {
        $client->getHistory()->clear(); // prevent out of memory errors...
        $url = array_pop( $this->linksToProcess );
        $output->writeln( 'Processing: '.$url );
        $crawler = $this->_requestFollowingRedirects( $client, $url );
        $this->_processLinksOnPage( $crawler, $url );
        $pagesCrawled++;
    }

    // boom, done
    $output->writeln( 'All Links Found:' );
    foreach ( $this->domainLinks as $link => $linkDetails ) {
        $output->writeln( ' '.$link.' : '.$linkDetails['route'] );
    }
    $output->writeln( $pagesCrawled.' links dominated, too easy...' );
}
/**
* Request a URL with the test client and follow redirect responses.
*
* Stops following after $maxRedirects hops so that a redirect loop cannot
* hang the command; the crawler for the last response received is returned.
*
* @param Client $client
* @param string $url
* @param integer $maxRedirects maximum number of redirects to follow
* @return Crawler
*/
protected function _requestFollowingRedirects( $client, $url, $maxRedirects = 10 )
{
    $crawler = $client->request( 'GET', $url );
    $hops = 0;
    while ( $client->getResponse() instanceof RedirectResponse ) {
        if ( ++$hops > $maxRedirects ) {
            $this->output->writeln( sprintf( '<error>Gave up on %s after %d redirects</error>', $url, $maxRedirects ) );
            break;
        }
        $crawler = $client->followRedirect();
    }
    return $crawler;
}
/**
* Prompt for each required argument that was not given on the command line.
*
* The "startingLink" argument is asked for first, then "username". An empty
* answer makes the validator throw, which the dialog helper reports.
*
* @author Joe Sexton <joe@webtipblog.com>
* @param InputInterface $input
* @param OutputInterface $output
*/
protected function interact( InputInterface $input, OutputInterface $output )
{
    // argument name => array( question shown, error raised on an empty answer )
    $prompts = array(
        'startingLink' => array( 'Please enter the link to start at(including the locale):', 'starting link can not be empty' ),
        'username'     => array( 'Please choose a username:', 'Username can not be empty' ),
    );

    foreach ( $prompts as $argumentName => $prompt ) {
        // already supplied on the command line, nothing to ask
        if ( $input->getArgument( $argumentName ) ) {
            continue;
        }

        $question = $prompt[0];
        $emptyMessage = $prompt[1];
        $rejectEmpty = function( $answer ) use ( $emptyMessage ) {
            if ( empty( $answer ) ) {
                throw new \Exception( $emptyMessage );
            }
            return $answer;
        };

        $value = $this->getHelper( 'dialog' )->askAndValidate( $output, $question, $rejectEmpty );
        $input->setArgument( $argumentName, $value );
    }
}
/**
* Build and boot a separate application kernel in the "test" environment.
*
* AppKernel.php is loaded from the running kernel's root directory; the new
* kernel is created with debug enabled.
*
* @author Joe Sexton <joe@webtipblog.com>
* @return \AppKernel
*/
protected function _createKernel() {
    $runningKernel = $this->getContainer()->get( 'kernel' );
    $kernelFile = $runningKernel->getRootDir() . '/AppKernel.php';
    require_once $kernelFile;

    $testKernel = new \AppKernel( 'test', true );
    $testKernel->boot();

    return $testKernel;
}
/**
* authenticate with a user account to access secured urls
*
* Loads the user named by $this->username, stores a serialized
* UsernamePasswordToken for the configured firewall in the client's session
* and hands the session cookie to the client's cookie jar.
*
* @author Joe Sexton <joe@webtipblog.com>
* @param AppKernel $kernel not used by this method; kept so existing callers keep working
* @param Client $client
* @throws \InvalidArgumentException when no user with the given username exists
*/
protected function _authenticate( $kernel, $client ) {
    // however you retrieve a user in your application
    $user = $this->entityManager->getRepository( 'Entity:User' )->findOneByUsername( $this->username );

    // fail with a readable message instead of a fatal "call to a member
    // function on null" when the username does not exist
    if ( null === $user ) {
        throw new \InvalidArgumentException( sprintf( 'User "%s" could not be found.', $this->username ) );
    }

    $token = new UsernamePasswordToken( $user, null, $this->securityFirewall, $user->getRoles() );

    // set session
    $session = $client->getContainer()->get( 'session' );
    $session->set( '_security_'.$this->securityFirewall, serialize( $token ) );
    $session->save();

    // set cookie
    $cookie = new Cookie( $session->getName(), $session->getId() );
    $client->getCookieJar()->set( $cookie );
}
/**
* get all links on the page as an array of urls
*
* Every anchor's resolved URI is logged. Only links whose host equals
* $this->domain and whose scheme is http or https are returned; anything
* else (outbound links, mailto:, javascript:, unparsable URLs) is dropped.
*
* @author Joe Sexton <joe@webtipblog.com>
* @param Crawler $crawler
* @return array list of absolute urls, re-indexed from 0
*/
protected function _getLinksOnCurrentPage( Crawler $crawler ) {
    $links = $crawler->filter( 'a' )->each( function ( Crawler $node, $i ) {
        return $node->link()->getUri();
    });

    // both web schemes are crawlable; accepting only "http" made every link
    // of an https site look like an outbound link
    $crawlableSchemes = array( 'http', 'https' );

    // remove outboundlinks
    $domainLinks = array();
    foreach ( $links as $link ) {
        $this->output->writeln( 'Link: '.$link );
        $linkParts = parse_url( $link );

        // parse_url() returns false for seriously malformed urls and omits
        // components that are absent, so check before reading them
        if ( ! is_array( $linkParts ) || empty( $linkParts['host'] ) || empty( $linkParts['scheme'] ) ) {
            continue;
        }
        if ( $linkParts['host'] !== $this->domain ) {
            continue;
        }
        if ( ! in_array( strtolower( $linkParts['scheme'] ), $crawlableSchemes, true ) ) {
            continue;
        }

        $domainLinks[] = $link;
    }

    return $domainLinks;
}
/**
* Collect the in-domain links of a page and record each one.
*
* @author Joe Sexton <joe@webtipblog.com>
* @param Crawler $crawler crawler for the page that was just requested
* @param string $currentUrl url of that page, passed through to _processSingleLink()
*/
protected function _processLinksOnPage( Crawler $crawler, $currentUrl ) {
    $pageLinks = $this->_getLinksOnCurrentPage( $crawler );

    foreach ( $pageLinks as $pageLink ) {
        $this->_processSingleLink( $pageLink, $currentUrl );
    }
}
/**
* process a single link
*
* A link seen for the first time is recorded in $this->domainLinks together
* with its route name and, unless it is a repeat of an "ignored keyword"
* route, queued in $this->linksToProcess. Links already recorded are skipped.
*
* @author Joe Sexton <joe@webtipblog.com>
* @param string $link
* @param string $currentUrl url of the page the link was found on (not used here)
*/
protected function _processSingleLink( $link, $currentUrl ) {
    if ( ! empty( $this->domainLinks[$link] ) ) {
        return;
    }

    // check for routes that should only be indexed once
    // do this before we add the link to the domainLinks array since we check that array for duplicates...
    if ( ! $this->_isDuplicateIgnoredRoute( $link ) ) {
        $this->linksToProcess[] = $link;
    }

    // add details to the domainLinks array
    $this->domainLinks[$link] = array(
        'route' => $this->_matchRouteName( $link ),
        // any other details about a link you would like to know; ie. number of occurances, pages found on, etc...
    );
}
/**
* Resolve the route name for a link, or '' when no route matches.
*
* The router throws instead of returning an empty result when a path has no
* route (assets, files, unknown paths); that must not abort the crawl, so
* routing exceptions are translated into an empty route name.
*
* @param string $link absolute url
* @return string
*/
protected function _matchRouteName( $link ) {
    $path = parse_url( $link, PHP_URL_PATH );
    // a url such as "http://example.com" has no path component; it addresses "/"
    if ( ! is_string( $path ) || '' === $path ) {
        $path = '/';
    }

    try {
        $parameters = $this->router->match( $path );
    } catch ( \Symfony\Component\Routing\Exception\ExceptionInterface $e ) {
        return '';
    }

    return ( ! empty( $parameters['_route'] ) ) ? $parameters['_route'] : '';
}
/**
* routeIsInQueue
*
* Tells whether any link recorded in $this->domainLinks already resolved to
* the given route name. Note that it scans every discovered link, not only
* the ones still waiting in $this->linksToProcess.
*
* @author Joe Sexton <joe@webtipblog.com>
* @param string $routeName
* @return boolean
*/
protected function _routeIsInQueue( $routeName ) {
    foreach ( $this->domainLinks as $existingLink ) {
        if ( $existingLink['route'] === $routeName ) {
            return true;
        }
    }
    return false;
}
/**
* isDuplicateIgnoredRoute
*
* Returns true when the link's route name matches one of the ignored route
* keywords and a link with that same route has already been recorded.
*
* @author Joe Sexton <joe@webtipblog.com>
* @param string $newLink
* @return boolean
*/
protected function _isDuplicateIgnoredRoute( $newLink ) {
    $routeName = $this->_matchRouteName( $newLink );

    // if the route name contains an ignored route keyword, check if it's in the queue of links to process
    foreach ( $this->ignoredRouteKeywords as $keyword ) {
        // the keyword is used as a regular expression body, so add delimiters
        $pattern = '/'.$keyword.'/';
        if ( preg_match( $pattern, $routeName ) === 1 ) {
            return $this->_routeIsInQueue( $routeName );
        }
    }
    return false;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment