Last active
March 16, 2017 02:09
-
-
Save yawo/ccd82e4ea3e07cd5722e9df985adc930 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
$start = microtime(true); | |
use Symfony\Component\DomCrawler\Crawler; | |
use VDB\Spider\Discoverer\XPathExpressionDiscoverer; | |
use VDB\Spider\Downloader\Downloader; | |
use VDB\Spider\QueueManager\InMemoryQueueManager; | |
use VDB\Spider\Spider; | |
use Guzzle\Http\Client as GuzzleClient; | |
use VDB\Spider\RequestHandler\GuzzleRequestHandler; | |
use VDB\Spider\StatsHandler; | |
require __DIR__ . '/../vendor/autoload.php'; | |
require './FileSerializedResourcePersistenceHandler.php'; | |
//$example="http://www.ysl.com/fr/shop-product/homme/pret-a-porter-t-shirts-et-jersey-t-shirt-a-manches-courtes-a-imprime-moonlight-noir_cod37978245bm.html#section=men"; | |
//$modelfabriccolorCodeSample="420092AQS001003"; | |
//$yooxCodeSample="11000574HF"; | |
//model=style, fabric=material, color=color | |
// ---------------------------------------------------------------------------
// Main script: fetch the country list from the site's home page, then crawl
// each country site, export its product pages to a CSV file and remove the
// serialized crawl results for that country.
// ---------------------------------------------------------------------------
$baseUrl = 'http://www.ysl.com';
$csvdir = __DIR__ . '/../csv';
// Only links whose href contains "/shop-product/" are followed by the spider.
$xpathDiscoverer = new XPathExpressionDiscoverer("//a[contains(@href,'/shop-product/')]");
// NOTE(review): this handler is not referenced anywhere below (getSpider()
// builds its own instance). It is kept in case the constructor has side
// effects - confirm and remove.
$persistenceHandler =new FileSerializedResourcePersistenceHandler(__DIR__ . '/results');
$statsHandler = new StatsHandler();
$client = getClient();
$requestHandler = new GuzzleRequestHandler();
$requestHandler->setClient($client);
$countrylistUrls=getCountryList($client,$baseUrl);

// mkdir() emits a warning when the directory already exists, which is the
// case on every run after the first one.
if (!is_dir($csvdir)) {
    mkdir($csvdir, 0777, true);
}

foreach ($countrylistUrls as $countryUrl) {
    // Three-character hrefs (e.g. "/fr") are relative: prefix the site root.
    if (strlen($countryUrl)==3) {
        $countryUrl=$baseUrl.$countryUrl;
    }

    // For "http://host/xx..." the country code is the 4th "/"-separated part.
    $segments = explode("/", $countryUrl);
    if (!isset($segments[3]) || $segments[3] === '') {
        echo "\nSkipping $countryUrl (no country segment)";
        continue;
    }
    $country = $segments[3];

    // "H" (24-hour clock) keeps IDs unique across a day; the former "h"
    // produced the same ID at 01:xx and 13:xx.
    $spiderId = "ysl_".$country.date("Ymd_His");
    echo "\nCrawling $countryUrl";
    $spider = getSpider($client,$countryUrl,$spiderId,$requestHandler,$xpathDiscoverer,$statsHandler);
    $spider->crawl();
    writeCrawledData($spider,$country,$csvdir);
    deleteStoredFiles($spiderId);
}

// $start is set at the very top of the file, before the autoloader is loaded.
$totalTime = round(microtime(true) - $start, 2);
echo "\n\n-----\nDONE $totalTime s \n";
/////////////////////////////// END OF SCRIPT ////////////////////////////////// | |
/**
 * Export the crawled product pages of one country to a ';'-separated file.
 *
 * Iterates over the resources exposed by the spider's persistence handler,
 * keeps those whose URI matches the product-page pattern, scrapes the product
 * fields from the page DOM and writes one line per product to
 * "<csvdir>/extract<timestamp>_<country>.csv". The directory must exist.
 *
 * Elements missing from a page are exported as empty fields, so a single
 * incomplete product page no longer aborts the whole export.
 *
 * NOTE(review): fields are written unescaped, so a ';' or a line break inside
 * a scraped text shifts the columns of that row.
 *
 * @param object $spider  Spider that has finished crawling; its downloader's
 *                        persistence handler is iterated.
 * @param string $country Country segment, used in the output file name.
 * @param string $csvdir  Directory the file is written to.
 *
 * @throws RuntimeException When the output file cannot be opened for writing.
 */
function writeCrawledData($spider,$country,$csvdir){
    $downloaded = $spider->getDownloader()->getPersistenceHandler();
    // Host dots are escaped so that only the literal host name matches.
    $productPattern = '/https?:\/\/www\.ysl\.com\/..\/shop-product\/.*_cod(.*)\.html.*/';

    // "H" = 24-hour clock; with "h" a morning and an afternoon run could
    // produce the same file name and overwrite each other.
    $csvPath = $csvdir.'/extract'.date("Ymd_His").'_'.$country.'.csv';
    $csv = fopen($csvPath,'w');
    if ($csv === false) {
        throw new RuntimeException("Unable to open $csvPath for writing");
    }
    fwrite($csv,"modelFabricColor;model;fabric;color;imgSrc;editorialDescription;itemDescription;modelName;composition\n");

    // text()/attr() throw on an empty node list, so check the match count first.
    $firstText = function ($crawler, $xpath) {
        $nodes = $crawler->filterXpath($xpath);
        return count($nodes) > 0 ? $nodes->text() : '';
    };
    $firstAttr = function ($crawler, $xpath, $name) {
        $nodes = $crawler->filterXpath($xpath);
        return count($nodes) > 0 ? (string) $nodes->attr($name) : '';
    };

    foreach ($downloaded as $resource) {
        $uri = $resource->getUri();
        if (!preg_match($productPattern, $uri)) {
            continue;
        }
        $domCrawler = $resource->getCrawler();

        // Trim BEFORE splitting: surrounding whitespace would otherwise shift
        // every 5-character part of the code.
        $modelFabricColor = trim($firstText($domCrawler, '//*[@id="descriptionWrapper"]//div[@class="modelFldWrapper"]/div[@class="modelFabricColor"]/span[@class="value"]'));
        // The code is three 5-character parts: model, fabric, color.
        // Padding keeps the destructuring safe for short or empty codes.
        list($model,$fabric,$color) = array_pad(str_split($modelFabricColor,5), 3, '');

        $imgSrc = $firstAttr($domCrawler, '//*[@id="mainImageWrapper"]/div[1]/img ', 'src');
        $editorialDescription = $firstText($domCrawler, '//*[@id="descriptionWrapper"]/h2');
        // each() returns an empty array when nothing matches, so no guard is needed here.
        $itemDescription = implode('<br>',$domCrawler->filterXpath('//*[@id="descriptionWrapper"]//li[contains(@class,"itemSingleDesc")]/span')->each(function ($node, $i) {
            return $node->text();
        }));
        $modelName = $firstText($domCrawler, '//*[@id="itemInfo"]//span[contains(@class,"modelName")]');
        $composition = $firstText($domCrawler, '//*[@id="descriptionWrapper"]/div[contains(@class,"moreDetails")]/span[@class="composition"]');

        printcsv($csv,$modelFabricColor);
        printcsv($csv,$model);
        printcsv($csv,$fabric);
        printcsv($csv,$color);
        printcsv($csv,$imgSrc);
        printcsv($csv,$editorialDescription);
        printcsv($csv,$itemDescription);
        printcsv($csv,$modelName);
        // Last column: same trimming/transcoding as printcsv(), but no trailing ';'.
        fwrite($csv,utf8_decode(trim($composition)));
        fwrite($csv,PHP_EOL);
    }
    fclose($csv);

    // Disabled statistics report, kept for reference. It cannot be enabled
    // as-is: $statsHandler and $start are not in scope inside this function.
    // Report
    /*echo "\n\nSPIDER ID: " . $statsHandler->getSpiderId();
    echo "\n ENQUEUED: " . count($statsHandler->getQueued());
    echo "\n SKIPPED: " . count($statsHandler->getFiltered());
    echo "\n FAILED: " . count($statsHandler->getFailed());
    echo "\n PERSISTED: " . count($statsHandler->getPersisted());
    // With the information from some of plugins and listeners, we can determine some metrics
    $peakMem = round(memory_get_peak_usage(true) / 1024 / 1024, 2);
    $totalTime = round(microtime(true) - $start, 2);
    //$totalDelay = round($politenessPolicyEventListener->totalDelay / 1000 / 1000, 2);
    echo "\n\nMETRICS:";
    echo "\n PEAK MEM USAGE: " . $peakMem . 'MB';
    echo "\n TOTAL TIME: " . $totalTime . 's';
    //echo "\n REQUEST TIME: " . $timerMiddleware->getTotal() . 's';
    //echo "\n POLITENESS WAIT TIME: " . $totalDelay . 's';
    //echo "\n PROCESSING TIME: " . ($totalTime - $timerMiddleware->getTotal() - $totalDelay) . 's';
    // Finally we could start some processing on the downloaded resources*/
}
/**
 * Write one field followed by the ';' column separator.
 *
 * The text is trimmed and passed through utf8_decode() before being written.
 *
 * @param resource $file Writable stream handle.
 * @param string   $txt  Field value.
 */
function printcsv($file,$txt){
    $field = utf8_decode(trim($txt));
    fwrite($file, $field . ';');
}
/**
 * Fetch the page at $baseUrl and collect the href of every link found in its
 * "choose your country" layer.
 *
 * @param object $httpclient HTTP client used to download the page.
 * @param string $baseUrl    URL of the page holding the country layer.
 *
 * @return array href attribute values, in document order.
 */
function getCountryList($httpclient,$baseUrl){
    $html = $httpclient->get($baseUrl)->send()->getBody(true);
    $page = new Crawler($html);
    $countryLinks = $page->filterXpath('//*[@id="chooseYourCountryLayer"]/div[@class="countryWrapper"]/div[@class="countriesContainer"]//li/a ');

    return $countryLinks->each(function ($anchor, $index) {
        return $anchor->attr('href');
    });
}
/**
 * Create the Guzzle client shared by the country lookup and the spider.
 *
 * The client sends a fixed set of browser-like default headers, follows
 * redirects and has the 'verify' option set to false.
 *
 * @return GuzzleClient
 */
function getClient(){
    // Same string is used for the header entry and for setUserAgent().
    $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36';

    $client = new GuzzleClient();
    $client->setDefaultOption('headers', [
        'User-Agent' => $userAgent,
        'Host' => 'www.ysl.com',
        'Accept-Encoding' => 'gzip, deflate, sdch',
        'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Upgrade-Insecure-Requests' => 1,
        'Accept-Language' => 'fr-FR,fr;q=0.8,en-US;q=0.6,en;q=0.4,de;q=0.2',
        'Proxy-Connection' => 'keep-alive',
    ]);
    // NOTE(review): 'verify' => false presumably turns off TLS certificate
    // checks - confirm this is still wanted.
    $client->setDefaultOption('verify', false);
    $client->setDefaultOption('allow_redirects', true);
    //$client->setDefaultOption('debug',true);
    $client->setUserAgent($userAgent);

    return $client;
}
/**
 * Assemble a spider for one country site.
 *
 * The spider starts at $baseUrl, stores downloaded resources through a
 * FileSerializedResourcePersistenceHandler rooted at "<this dir>/results",
 * uses an in-memory queue and discovers links with $xpathDiscoverer up to a
 * maximum depth of 5.
 *
 * @param object $client          Not used by this function.
 * @param string $baseUrl         Seed URL of the crawl.
 * @param string $spiderId        Identifier passed to the Spider constructor.
 * @param object $requestHandler  Request handler given to the downloader.
 * @param object $xpathDiscoverer Link discoverer registered on the spider.
 * @param object $statsHandler    Not used: the subscriber registration below
 *                                is commented out.
 *
 * @return Spider
 */
function getSpider($client,$baseUrl,$spiderId,$requestHandler,$xpathDiscoverer,$statsHandler){
    $queue = new InMemoryQueueManager();
    $storage = new FileSerializedResourcePersistenceHandler(__DIR__ . '/results');

    $fetcher = new Downloader();
    $fetcher->setRequestHandler($requestHandler);
    //$downloader->setDownloadLimit(30);
    $fetcher->setPersistenceHandler($storage);

    $crawler = new Spider($baseUrl, $spiderId); //http://www.ysl.com/fr');http://www.dmoz.org/Computers/Internet
    $crawler->setDownloader($fetcher);

    $crawler->getDiscovererSet()->maxDepth = 5;
    //$queueManager->setTraversalAlgorithm(InMemoryQueueManager::ALGORITHM_BREADTH_FIRST);
    $crawler->setQueueManager($queue);
    $crawler->getDiscovererSet()->set($xpathDiscoverer);//"//a[contains(@href,'/shop-product/')]"));
    // $spider->getDispatcher()->addSubscriber($statsHandler);

    return $crawler;
}
/**
 * Remove the directory "<this dir>/results/<spiderId>" and everything in it.
 *
 * The previous glob('*.*') approach only removed top-level entries whose name
 * contains a dot, so files without an extension or any sub-directory made the
 * final rmdir() fail. The tree is now walked children-first so that every
 * directory is empty by the time it is removed.
 *
 * Does nothing when the directory does not exist, or when $spiderId is empty
 * (which would otherwise target the whole results directory).
 *
 * @param string $spiderId Name of the sub-directory of "results" to delete.
 */
function deleteStoredFiles($spiderId){
    $spiderId = (string) $spiderId;
    $dir = __DIR__ . '/results/' . $spiderId;
    if ($spiderId === '' || !is_dir($dir)) {
        return;
    }

    $entries = new RecursiveIteratorIterator(
        new RecursiveDirectoryIterator($dir, FilesystemIterator::SKIP_DOTS),
        RecursiveIteratorIterator::CHILD_FIRST
    );
    foreach ($entries as $entry) {
        // Symlinks are unlinked, never followed or rmdir'ed.
        if ($entry->isDir() && !$entry->isLink()) {
            rmdir($entry->getPathname());
        } else {
            unlink($entry->getPathname());
        }
    }
    rmdir($dir);
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment