Skip to content

Instantly share code, notes, and snippets.

@TonyGao
Created May 16, 2017 07:58
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save TonyGao/bcae471f0cc05975f849bb706b69c1f6 to your computer and use it in GitHub Desktop.
Save TonyGao/bcae471f0cc05975f849bb706b69c1f6 to your computer and use it in GitHub Desktop.
<?php
namespace CrawlerBundle\Command;
use Symfony\Bundle\FrameworkBundle\Command\ContainerAwareCommand;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
use Symfony\Component\Filesystem\Filesystem;
use Symfony\Component\Filesystem\Exception\IOExceptionInterface;
use Goutte\Client;
use GuzzleHttp\Exception\ClientException;
use GuzzleHttp\Exception\ServerException;
use GuzzleHttp\Exception\ConnectException;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\Debug\Exception\FatalErrorException;
/**
 * Console command `symfony:crawler`.
 *
 * With the --baidu option it submits the given keyword to the Baidu mobile
 * image search form, opens the first result and then follows the "next page"
 * links, saving the image found on each page into baidu/<keyword>/.
 */
class SymfonyCrawlerCommand extends ContainerAwareCommand
{
    /** @var string|null Search keyword read from the "argument" console argument. */
    private $argument;

    /** @var string|null URI of the first result page reached after submitting the search form. */
    private $currentURI;

    /** @var string|null Image URL found on the page currently being processed (null when none). */
    private $url;

    /**
     * Declares the command name, its optional keyword argument and the --baidu switch.
     */
    protected function configure()
    {
        $this
            ->setName('symfony:crawler')
            ->setDescription('simple Symfony crawler')
            ->addArgument('argument', InputArgument::OPTIONAL, 'Argument description')
            ->addOption('baidu', null, InputOption::VALUE_NONE, 'Option description')
        ;
    }

    /**
     * Runs the crawl when --baidu is given, then prints the end marker.
     *
     * @param InputInterface  $input  Console input carrying the keyword and options.
     * @param OutputInterface $output Console output used for progress lines.
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        $io = new SymfonyStyle($input, $output);
        $this->argument = $input->getArgument('argument');

        if ($input->getOption('baidu')) {
            $this->crawlBaidu($output, $io);
        }

        $output->writeln('----------End line ---------');
    }

    /**
     * Prepares the target directory, performs the keyword search and starts
     * the page-by-page crawl from the first result.
     *
     * @param OutputInterface $output Console output used for progress lines.
     * @param SymfonyStyle    $io     Styled output used for warnings and errors.
     *
     * @throws \InvalidArgumentException When the search button or the first result link
     *                                   is not present on the fetched pages.
     */
    private function crawlBaidu(OutputInterface $output, SymfonyStyle $io)
    {
        // Without a keyword there is nothing to search for and the images
        // would be written straight into "baidu/".
        if ($this->argument === null || $this->argument === '') {
            $io->error('A search keyword is required when using --baidu.');

            return;
        }

        $fs = new Filesystem();
        try {
            $fs->mkdir($this->getTargetDirectory());
        } catch (IOExceptionInterface $e) {
            $output->writeln("An error occurred while creating your directory at ".$e->getPath());

            // No directory means no image can be saved, so crawling is pointless.
            return;
        }

        $startPage = 'http://image.baidu.com/search/wiseindex?tn=wiseindex&wiseps=1';
        $client = new Client();
        $crawler = $client->request('GET', $startPage);
        $form = $crawler->selectButton('搜图片')->form();
        $crawler = $client->submit($form, array('word' => $this->argument));
        $link = $crawler->filter('.mb.ct.b > a')->first()->link();
        $client->click($link);

        $this->currentURI = $client->getHistory()->current()->getUri();
        $output->writeln($this->currentURI);

        $this->baiduCrawlerLoop($this->currentURI, 0, 1000, $output, $io);
    }

    /**
     * Walks result pages starting at $startPage, saving one image per page.
     *
     * Stops when $count pages have been processed, when a page cannot be
     * fetched or does not answer with HTTP 200, or when a page has no
     * "next page" link.
     *
     * @param string          $startPage URL of the first page to process.
     * @param int             $pageNum   Number of pages already processed (0 to start fresh).
     * @param int             $count     Maximum page number to process.
     * @param OutputInterface $output    Console output used for progress lines.
     * @param SymfonyStyle    $io        Styled output used for status and warnings.
     */
    protected function baiduCrawlerLoop($startPage, $pageNum, $count, OutputInterface $output, SymfonyStyle $io)
    {
        $pageUrl = $startPage;

        // Iterative on purpose: one stack frame per page would nest up to
        // $count levels deep.
        while (++$pageNum <= $count) {
            // A new client per page, so no cookies or history carry over between pages.
            $client = new Client();

            try {
                $crawler = $client->request('GET', $pageUrl);
            } catch (ClientException $e) {
                $io->caution(array("Fail when request this Page URL."));

                return;
            } catch (ConnectException $e) {
                $io->caution(array("Fail when request this Page URL. ConnectionException. "));

                return;
            }

            $response = $client->getResponse();
            if ($response === null) {
                $io->caution(array("Fail when getStatus."));

                return;
            }

            if ((int) $response->getStatus() !== 200) {
                $io->caution(array("Not http status 200"));

                return;
            }

            // Reset the URL for every page so a page without an image link
            // does not reuse the previous page's image.
            $imageLinks = $crawler->filter('.wm.lh > a');
            $this->url = $imageLinks->count() > 0 ? $imageLinks->first()->attr('href') : null;

            // The last page has no "next page" link; link() would throw on an empty selection.
            $nextLinks = $crawler->selectLink('下一页');
            $nextUrl = $nextLinks->count() > 0 ? $nextLinks->link()->getUri() : null;

            if ($nextUrl !== null) {
                $output->writeln("Next Page URL: ".$nextUrl);
            }
            $output->writeln("Image URL: ".$this->url);
            $io->success(array("Finish number: ".$pageNum." "."count: ".$count));

            if ($this->url) {
                $image = $this->downloadImage($client, $this->url, $io);
                if ($image !== null) {
                    $this->saveImage($this->url, $image, $io);
                }
            }

            if ($nextUrl === null) {
                $io->note('No next page link found, stopping.');

                return;
            }

            $pageUrl = $nextUrl;
        }
    }

    /**
     * Returns the directory the images for the current keyword are stored in.
     *
     * @return string
     */
    private function getTargetDirectory()
    {
        return 'baidu/'.$this->argument;
    }

    /**
     * Downloads an image through the client's underlying Guzzle client.
     *
     * @param Client       $client   Goutte client whose Guzzle client performs the request.
     * @param string       $imageUrl URL of the image to fetch.
     * @param SymfonyStyle $io       Styled output used to report request failures.
     *
     * @return string|null Raw response body, or null when the request failed.
     */
    private function downloadImage(Client $client, $imageUrl, SymfonyStyle $io)
    {
        try {
            return (string) $client->getClient()->get($imageUrl, ['timeout' => 4])->getBody();
        } catch (ClientException $e) {
            $io->caution(array("Fail when request this Image URL. ClientException. "));
        } catch (ServerException $e) {
            $io->caution(array("Fail when request this Image URL. ServerException. "));
        } catch (ConnectException $e) {
            $io->caution(array("Fail when request this Image URL. ConnectionException. "));
        } catch (RequestException $e) {
            $io->caution(array("Fail when request this Image URL. RequestException. "));
        }

        return null;
    }

    /**
     * Writes image data to the keyword directory, named after the md5 of its URL.
     *
     * @param string       $imageUrl URL the image was downloaded from.
     * @param string       $image    Raw image bytes.
     * @param SymfonyStyle $io       Styled output used to report write failures.
     */
    private function saveImage($imageUrl, $image, SymfonyStyle $io)
    {
        $path = $this->getTargetDirectory()."/".md5($imageUrl);

        try {
            // dumpFile opens, writes and closes in one call, so no file
            // handle is left open between pages.
            $fs = new Filesystem();
            $fs->dumpFile($path, $image);
        } catch (IOExceptionInterface $e) {
            $io->caution(array("Fail when saving image to ".$path));
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment