Skip to content

Instantly share code, notes, and snippets.

@yawo
Last active March 16, 2017 02:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yawo/ccd82e4ea3e07cd5722e9df985adc930 to your computer and use it in GitHub Desktop.
Save yawo/ccd82e4ea3e07cd5722e9df985adc930 to your computer and use it in GitHub Desktop.
<?php
$start = microtime(true);
use Symfony\Component\DomCrawler\Crawler;
use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
use VDB\Spider\Downloader\Downloader;
use VDB\Spider\QueueManager\InMemoryQueueManager;
use VDB\Spider\Spider;
use Guzzle\Http\Client as GuzzleClient;
use VDB\Spider\RequestHandler\GuzzleRequestHandler;
use VDB\Spider\StatsHandler;
require __DIR__ . '/../vendor/autoload.php';
require './FileSerializedResourcePersistenceHandler.php';
//$example="http://www.ysl.com/fr/shop-product/homme/pret-a-porter-t-shirts-et-jersey-t-shirt-a-manches-courtes-a-imprime-moonlight-noir_cod37978245bm.html#section=men";
//$modelfabriccolorCodeSample="420092AQS001003";
//$yooxCodeSample="11000574HF";
//model=style, fabric=material, color=color
// Entry point: read the country links from the site's landing page, crawl each
// country storefront for product pages and export one CSV file per country.
$baseUrl = 'http://www.ysl.com';
$csvdir = __DIR__ . '/../csv';
$xpathDiscoverer = new XPathExpressionDiscoverer("//a[contains(@href,'/shop-product/')]");
// NOTE(review): this handler is never used below (getSpider() builds its own).
// It is kept in case its constructor has side effects - confirm and remove.
$persistenceHandler = new FileSerializedResourcePersistenceHandler(__DIR__ . '/results');
$statsHandler = new StatsHandler();
$client = getClient();
$requestHandler = new GuzzleRequestHandler();
$requestHandler->setClient($client);
$countrylistUrls = getCountryList($client, $baseUrl);

// mkdir() raises a warning when the directory already exists (i.e. on every
// run after the first one), so only create it when it is missing and fail
// early if it still cannot be created.
if (!is_dir($csvdir) && !mkdir($csvdir, 0777, true) && !is_dir($csvdir)) {
    throw new RuntimeException("Unable to create CSV output directory: $csvdir");
}

foreach ($countrylistUrls as $countryUrl) {
    // A link without a usable href cannot be crawled.
    if (!is_string($countryUrl) || $countryUrl === '') {
        continue;
    }
    // A 3-character href (such as "/fr") is treated as relative to the site root.
    if (strlen($countryUrl) == 3) {
        $countryUrl = $baseUrl . $countryUrl;
    }
    // The country code is the first path segment: scheme://host/<country>/...
    $segments = explode("/", $countryUrl);
    $country = isset($segments[3]) ? $segments[3] : '';
    // "H" (24-hour clock) keeps ids unique across AM/PM; "h" repeats every 12 hours.
    $spiderId = "ysl_" . $country . date("Ymd_His");
    echo "\nCrawling $countryUrl";
    $spider = getSpider($client, $countryUrl, $spiderId, $requestHandler, $xpathDiscoverer, $statsHandler);
    $spider->crawl();
    writeCrawledData($spider, $country, $csvdir);
    // The serialized pages are only needed until the CSV has been written.
    deleteStoredFiles($spiderId);
}

$totalTime = round(microtime(true) - $start, 2);
echo "\n\n-----\nDONE $totalTime s \n";
/////////////////////////////// END OF SCRIPT //////////////////////////////////
/**
 * Export the product pages downloaded by a spider to a semicolon-separated CSV file.
 *
 * Iterates the resources held by the spider's persistence handler, keeps those
 * whose URI matches the product-page pattern and writes one row per product to
 * "<csvdir>/extract<timestamp>_<country>.csv". An element that is missing from
 * a page yields an empty column instead of aborting the whole export.
 *
 * @param object $spider  Spider whose downloader's persistence handler yields the downloaded resources.
 * @param string $country Country code, used in the output file name.
 * @param string $csvdir  Existing directory that receives the CSV file.
 *
 * @throws RuntimeException When the CSV file cannot be opened for writing.
 */
function writeCrawledData($spider,$country,$csvdir){
    $downloaded = $spider->getDownloader()->getPersistenceHandler();
    // Dots in the host name are escaped so they only match a literal ".".
    $productPattern = '/https?:\/\/www\.ysl\.com\/..\/shop-product\/.*_cod(.*)\.html.*/';
    // "H" (24-hour clock) avoids two runs twelve hours apart sharing a file name.
    $csvPath = $csvdir.'/extract'.date("Ymd_His").'_'.$country.'.csv';
    $csv = fopen($csvPath,'w');
    if ($csv === false) {
        throw new RuntimeException("Unable to open CSV file for writing: $csvPath");
    }

    // Crawler::text() and Crawler::attr() throw on an empty node list, so
    // check the match count first and fall back to an empty string.
    $textAt = function ($crawler, $xpath) {
        $nodes = $crawler->filterXpath($xpath);
        return count($nodes) > 0 ? (string) $nodes->text() : '';
    };
    $attrAt = function ($crawler, $xpath, $attribute) {
        $nodes = $crawler->filterXpath($xpath);
        return count($nodes) > 0 ? (string) $nodes->attr($attribute) : '';
    };

    try {
        fwrite($csv,"modelFabricColor;model;fabric;color;imgSrc;editorialDescription;itemDescription;modelName;composition\n");
        foreach ($downloaded as $resource) {
            $uri = $resource->getUri();
            if (preg_match($productPattern, $uri) !== 1) {
                continue; // not a product page
            }
            $domCrawler = $resource->getCrawler();

            // Trim before splitting so surrounding whitespace does not shift the
            // 5-character chunks; pad so short codes still give three columns.
            $modelFabricColor = trim($textAt($domCrawler, '//*[@id="descriptionWrapper"]//div[@class="modelFldWrapper"]/div[@class="modelFabricColor"]/span[@class="value"]'));
            list($model,$fabric,$color) = array_pad(str_split($modelFabricColor,5), 3, '');

            $imgSrc = $attrAt($domCrawler, '//*[@id="mainImageWrapper"]/div[1]/img ', 'src');
            $editorialDescription = $textAt($domCrawler, '//*[@id="descriptionWrapper"]/h2');
            $itemDescription = implode('<br>',$domCrawler->filterXpath('//*[@id="descriptionWrapper"]//li[contains(@class,"itemSingleDesc")]/span')->each(function ($node, $i) {
                return $node->text();
            }));
            $modelName = $textAt($domCrawler, '//*[@id="itemInfo"]//span[contains(@class,"modelName")]');
            $composition = $textAt($domCrawler, '//*[@id="descriptionWrapper"]/div[contains(@class,"moreDetails")]/span[@class="composition"]');

            $leadingColumns = [
                $modelFabricColor,
                $model,
                $fabric,
                $color,
                $imgSrc,
                $editorialDescription,
                $itemDescription,
                $modelName,
            ];
            foreach ($leadingColumns as $column) {
                printcsv($csv,$column);
            }
            // The last column is written without a trailing separator.
            fwrite($csv,utf8_decode(trim($composition)));
            fwrite($csv,PHP_EOL);
        }
    } catch (Exception $e) {
        // Do not leak the file handle when a resource cannot be processed.
        fclose($csv);
        throw $e;
    }
    fclose($csv);
    // NOTE: a per-spider stats/metrics report used to be sketched here as
    // commented-out code; it referenced $statsHandler and $start, neither of
    // which is in scope inside this function.
}
/**
 * Append one field, followed by the ";" separator, to an open CSV stream.
 *
 * The text is trimmed and passed through utf8_decode() before being written.
 * A field that contains the separator, a double quote or a line break is
 * enclosed in double quotes (with inner quotes doubled) so it cannot spill
 * into neighbouring columns or rows; all other fields are written unchanged.
 *
 * @param resource $file Writable stream.
 * @param string   $txt  Field text.
 */
function printcsv($file,$txt){
    $field = utf8_decode(trim((string) $txt));
    // All characters tested here are ASCII, so checking after the charset
    // conversion is equivalent to checking before it.
    if (strpbrk($field, ";\"\r\n") !== false) {
        $field = '"' . str_replace('"', '""', $field) . '"';
    }
    fwrite($file, $field . ";");
}
/**
 * Request a page and collect the href of every link found inside its
 * "chooseYourCountryLayer" country list.
 *
 * @param object $httpclient HTTP client used to request $baseUrl.
 * @param string $baseUrl    URL of the page that contains the country chooser.
 *
 * @return array One href attribute value per matched link.
 */
function getCountryList($httpclient,$baseUrl){
    $request = $httpclient->get($baseUrl);
    $response = $request->send();
    $html = $response->getBody(true);

    $countryLinkXpath = '//*[@id="chooseYourCountryLayer"]/div[@class="countryWrapper"]/div[@class="countriesContainer"]//li/a ';
    $page = new Crawler($html);
    $countryLinks = $page->filterXpath($countryLinkXpath);

    return $countryLinks->each(function ($link, $position) {
        return $link->attr('href');
    });
}
/**
 * Build the Guzzle client shared by the country lookup and the spider.
 *
 * The client sends a fixed set of browser-like request headers, follows
 * redirects and has certificate verification switched off.
 *
 * @return GuzzleClient Configured client.
 */
function getClient(){
    $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36';

    $browserHeaders = [
        'User-Agent' => $userAgent,
        'Host' => 'www.ysl.com',
        'Accept-Encoding' => 'gzip, deflate, sdch',
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Upgrade-Insecure-Requests' => 1,
        'Accept-Language' => 'fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
        'Proxy-Connection' => 'keep-alive',
    ];

    $http = new GuzzleClient();
    $http->setDefaultOption('headers', $browserHeaders);
    // NOTE(review): 'verify' => false disables TLS certificate checks - confirm this is intended.
    $http->setDefaultOption('verify', false);
    $http->setDefaultOption('allow_redirects', true);
    // Uncomment to trace the HTTP traffic:
    // $http->setDefaultOption('debug', true);
    $http->setUserAgent($userAgent);

    return $http;
}
/**
 * Assemble a spider for one start URL.
 *
 * The spider gets an in-memory queue, a downloader that uses the given request
 * handler and persists resources under "<this dir>/results", the supplied link
 * discoverer and a maximum crawl depth of 5.
 *
 * @param object $client          Not used by this function.
 * @param string $baseUrl         URL the crawl starts from.
 * @param string $spiderId        Identifier passed to the Spider constructor.
 * @param object $requestHandler  Request handler given to the downloader.
 * @param object $xpathDiscoverer Link discoverer registered on the spider.
 * @param object $statsHandler    Not used while the subscriber registration below stays disabled.
 *
 * @return Spider Configured spider, ready for crawl().
 */
function getSpider($client,$baseUrl,$spiderId,$requestHandler,$xpathDiscoverer,$statsHandler){
    $queue = new InMemoryQueueManager();
    $storage = new FileSerializedResourcePersistenceHandler(__DIR__ . '/results');

    $fetcher = new Downloader();
    $fetcher->setRequestHandler($requestHandler);
    // Optional cap on the number of downloads:
    // $fetcher->setDownloadLimit(30);
    $fetcher->setPersistenceHandler($storage);

    $crawler = new Spider($baseUrl, $spiderId);
    $crawler->setDownloader($fetcher);
    $crawler->getDiscovererSet()->maxDepth = 5;
    // Optional breadth-first traversal:
    // $queue->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $crawler->setQueueManager($queue);
    $crawler->getDiscovererSet()->set($xpathDiscoverer);
    // Optional crawl statistics:
    // $crawler->getDispatcher()->addSubscriber($statsHandler);

    return $crawler;
}
/**
 * Remove the directory "<this dir>/results/<spiderId>" and everything in it.
 *
 * Unlike a "*.*" glob this also removes files without a dot in their name,
 * dot-files and nested directories, so the final rmdir() can succeed. A
 * missing directory or an empty id is a no-op.
 *
 * @param string $spiderId Name of the spider's sub-directory under "results".
 */
function deleteStoredFiles($spiderId){
    // An empty id would resolve to the shared "results" directory itself.
    if ((string) $spiderId === '') {
        return;
    }
    $dir = __DIR__ . '/results/' . $spiderId;
    if (!is_dir($dir)) {
        return;
    }

    // CHILD_FIRST yields a directory's contents before the directory itself,
    // so every directory is already empty when it is removed.
    $entries = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS),
        RecursiveIteratorIterator::CHILD_FIRST
    );
    foreach ($entries as $entry) {
        if ($entry->isDir() && !$entry->isLink()) {
            rmdir($entry->getPathname());
        } else {
            unlink($entry->getPathname());
        }
    }
    rmdir($dir);
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment