Created
May 16, 2017 07:58
-
-
Save TonyGao/bcae471f0cc05975f849bb706b69c1f6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace CrawlerBundle\Command; | |
use Symfony\Bundle\FrameworkBundle\Command\ContainerAwareCommand; | |
use Symfony\Component\Console\Input\InputArgument; | |
use Symfony\Component\Console\Input\InputInterface; | |
use Symfony\Component\Console\Input\InputOption; | |
use Symfony\Component\Console\Output\OutputInterface; | |
use Symfony\Component\Console\Style\SymfonyStyle; | |
use Symfony\Component\Filesystem\Filesystem; | |
use Symfony\Component\Filesystem\Exception\IOExceptionInterface; | |
use Goutte\Client; | |
use GuzzleHttp\Exception\ClientException; | |
use GuzzleHttp\Exception\ServerException; | |
use GuzzleHttp\Exception\ConnectException; | |
use GuzzleHttp\Exception\RequestException; | |
use Symfony\Component\Debug\Exception\FatalErrorException; | |
/**
 * Console command `symfony:crawler`.
 *
 * With the `--baidu` option it submits the given keyword to Baidu's mobile
 * image search, opens the first result and then walks the "next page" links,
 * saving the image found on each page into `baidu/<keyword>/`.
 */
class SymfonyCrawlerCommand extends ContainerAwareCommand
{
    /** @var string|null Search keyword taken from the `argument` console argument. */
    private $argument;

    /** @var string|null URI of the first result page the crawl starts from. */
    private $currentURI;

    /** @var string|null Image URL found on the page currently being processed. */
    private $url;

    /**
     * Declares the command name, its optional keyword argument and the `--baidu` switch.
     */
    protected function configure()
    {
        $this
            ->setName('symfony:crawler')
            ->setDescription('simple Symfony crawler')
            ->addArgument('argument', InputArgument::OPTIONAL, 'Argument description')
            ->addOption('baidu', null, InputOption::VALUE_NONE, 'Option description')
        ;
    }

    /**
     * Runs the crawl when `--baidu` is given and always prints the end marker.
     *
     * @param InputInterface  $input
     * @param OutputInterface $output
     *
     * @return int 0 on success, 1 when the crawl could not be started
     */
    protected function execute(InputInterface $input, OutputInterface $output)
    {
        $io = new SymfonyStyle($input, $output);
        $this->argument = $input->getArgument('argument');

        $exitCode = 0;
        if ($input->getOption('baidu')) {
            $exitCode = $this->crawlBaidu($output, $io);
        }

        $output->writeln('----------End line ---------');

        return $exitCode;
    }

    /**
     * Prepares the target directory, performs the keyword search and starts the page loop.
     *
     * @param OutputInterface $output
     * @param SymfonyStyle    $io
     *
     * @return int 0 when the crawl ran, 1 when it could not be started
     */
    private function crawlBaidu(OutputInterface $output, SymfonyStyle $io)
    {
        // The argument is optional at the console level, but without a keyword there is
        // nothing to search for and images would land directly in "baidu/".
        if ($this->argument === null || trim($this->argument) === '') {
            $io->error('A search keyword is required when --baidu is used.');

            return 1;
        }

        $fs = new Filesystem();
        try {
            $fs->mkdir('baidu/'.$this->argument);
        } catch (IOExceptionInterface $e) {
            $output->writeln("An error occurred while creating your directory at ".$e->getPath());

            // Without the directory every later write would fail, so stop here.
            return 1;
        }

        $startPage = 'http://image.baidu.com/search/wiseindex?tn=wiseindex&wiseps=1';
        $client = new Client();

        try {
            $crawler = $client->request('GET', $startPage);
            $form = $crawler->selectButton('搜图片')->form();
            $crawler = $client->submit($form, array('word' => $this->argument));
            $link = $crawler->filter('.mb.ct.b > a')->first()->link();
            $client->click($link);
        } catch (\InvalidArgumentException $e) {
            // form()/link() throw this when the expected button or result link is missing.
            $io->error('Unexpected search page layout: '.$e->getMessage());

            return 1;
        } catch (ConnectException $e) {
            $io->error('Could not connect to the search page: '.$e->getMessage());

            return 1;
        } catch (RequestException $e) {
            $io->error('Request to the search page failed: '.$e->getMessage());

            return 1;
        }

        $this->currentURI = $client->getHistory()->current()->getUri();
        $output->writeln($this->currentURI);

        $this->baiduCrawlerLoop($this->currentURI, 0, 1000, $output, $io);

        return 0;
    }

    /**
     * Visits result pages one after another, following the "next page" link.
     *
     * Processing stops when `$count` pages have been handled, when a page cannot be
     * fetched, or when a page has no "next page" link.
     *
     * @param string          $startPage URL of the first page to process
     * @param int             $pageNum   Number of pages already processed (0 for a fresh crawl)
     * @param int             $count     Highest page number that will be processed
     * @param OutputInterface $output
     * @param SymfonyStyle    $io
     */
    protected function baiduCrawlerLoop($startPage, $pageNum, $count, OutputInterface $output, SymfonyStyle $io)
    {
        // Iterative on purpose: one stack frame per page would nest up to $count levels deep.
        $pageUrl = $startPage;
        while ($pageUrl !== null && ++$pageNum <= $count) {
            $pageUrl = $this->crawlBaiduPage($pageUrl, $pageNum, $count, $output, $io);
        }
    }

    /**
     * Fetches one result page, stores its image and returns the next page URL.
     *
     * @param string          $pageUrl
     * @param int             $pageNum
     * @param int             $count
     * @param OutputInterface $output
     * @param SymfonyStyle    $io
     *
     * @return string|null URL of the next page, or null when the crawl should stop
     */
    private function crawlBaiduPage($pageUrl, $pageNum, $count, OutputInterface $output, SymfonyStyle $io)
    {
        $client = new Client();

        try {
            $crawler = $client->request('GET', $pageUrl);
        } catch (ClientException $e) {
            $io->caution(array("Fail when request this Page URL."));

            return null;
        } catch (ConnectException $e) {
            $io->caution(array("Fail when request this Page URL. ConnectionException. "));

            return null;
        } catch (RequestException $e) {
            $io->caution(array("Fail when request this Page URL. RequestException. "));

            return null;
        }

        $response = $client->getResponse();
        if ($response === null) {
            $io->caution(array("Fail when getStatus."));

            return null;
        }

        // BrowserKit renamed getStatus() to getStatusCode(); support both.
        $statusCode = method_exists($response, 'getStatusCode')
            ? $response->getStatusCode()
            : $response->getStatus();

        if ((int) $statusCode !== 200) {
            $io->caution(array("Not http status 200"));

            return null;
        }

        // Reset first so a page without an image does not reuse the previous page's URL.
        $this->url = null;
        $imageLinks = $crawler->filter('.wm.lh > a');
        if ($imageLinks->count() > 0) {
            $this->url = $imageLinks->first()->attr('href');
        }

        // link() throws on an empty node list, which is the normal case on the last page.
        $nextLinks = $crawler->selectLink('下一页');
        $nextUrl = $nextLinks->count() > 0 ? $nextLinks->link()->getUri() : null;

        if ($nextUrl !== null) {
            $output->writeln("Next Page URL: ". $nextUrl);
        } else {
            $output->writeln("No next page link found; this is the last page.");
        }
        $output->writeln("Image URL: ". $this->url);
        $io->success(array("Finish number: ".$pageNum." "."count: " .$count));

        $image = $this->downloadImage($client, $io);
        if ($image !== null && $image !== '') {
            $target = 'baidu/'.$this->argument."/".md5($this->url);
            // file_put_contents opens, writes and closes in one call, so no handle leaks.
            if (file_put_contents($target, $image) === false) {
                $io->caution(array("Fail when writing image file ".$target));
            }
        }

        return $nextUrl;
    }

    /**
     * Downloads the image at `$this->url`, if one was found on the current page.
     *
     * @param Client       $client
     * @param SymfonyStyle $io
     *
     * @return string|null Raw image bytes, or null when there is no URL or the download failed
     */
    private function downloadImage(Client $client, SymfonyStyle $io)
    {
        if (!$this->url) {
            return null;
        }

        try {
            return (string) $client->getClient()->get($this->url, ['timeout' => 4])->getBody();
        } catch (ClientException $e) {
            $io->caution(array("Fail when request this Image URL. ClientException. "));
        } catch (ServerException $e) {
            $io->caution(array("Fail when request this Image URL. ServerException. "));
        } catch (ConnectException $e) {
            $io->caution(array("Fail when request this Image URL. ConnectionException. "));
        } catch (RequestException $e) {
            $io->caution(array("Fail when request this Image URL. RequestException. "));
        }

        return null;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment