Skip to content

Instantly share code, notes, and snippets.

@jimbojsb

jimbojsb/crawl.php

Created Mar 24, 2015
Embed
What would you like to do?
Entertainment.com Crawler
<?php
/**
{
"require": {
"symfony/dom-crawler": "2.*",
"symfony/css-selector": "2.*"
}
}
*/
require_once __DIR__ . '/../vendor/autoload.php';
use \Symfony\Component\DomCrawler\Crawler;
// Crawl script: loads the "view all products" index, walks every .region and
// every .product inside it, fetches each product's detail page, and collects
// name / url / prices / features / areas / free_shipping into $books, which is
// dumped with print_r() at the end.

$baseUrl = "https://www.entertainment.com";
$indexUrl = $baseUrl . "/coupon/view_all_products.cmd";

// Every request identifies itself with the bot user agent.
$context = stream_context_create([
    "http" => [
        "user_agent" => "Offers.com Entertainment Bot (bot@offers.com)"
    ]
]);

/**
 * Downloads a page using the shared stream context.
 *
 * file_get_contents() opens and closes the underlying stream itself, so no
 * handle is left open, and it reports failure by returning false instead of
 * handing an invalid handle to a later call.
 *
 * @param string $url Absolute URL to download.
 * @return string|false The response body, or false when the request failed.
 */
$fetchPage = function ($url) use ($context) {
    return file_get_contents($url, false, $context);
};

/**
 * Returns one attribute (the text content by default) of the first element
 * matching $selector below $scope.
 *
 * @param Crawler $scope     Node to search in.
 * @param string  $selector  CSS selector.
 * @param string  $attribute Attribute name, or "_text" for the text content.
 * @return string|null The extracted value, or null when nothing matched.
 */
$firstValue = function (Crawler $scope, $selector, $attribute = "_text") {
    $values = $scope->filter($selector)->first()->extract([$attribute]);
    return isset($values[0]) ? $values[0] : null;
};

$rootHtml = $fetchPage($indexUrl);
if ($rootHtml === false || $rootHtml === '') {
    // Without the index there is nothing to crawl; fail loudly instead of printing an empty list.
    fwrite(STDERR, "Unable to load product index: $indexUrl\n");
    exit(1);
}

$books = [];
$rootDomCrawler = new Crawler($rootHtml);

$rootDomCrawler->filter(".region")->each(function (Crawler $region) use ($baseUrl, $fetchPage, $firstValue, &$books) {
    $state = $firstValue($region, ".region-toggle");
    echo "Processing $state\n";

    $region->filter(".product")->each(function (Crawler $product) use ($baseUrl, $fetchPage, $firstValue, &$books) {
        $book = [];
        $book["name"] = $firstValue($product, ".locations");
        echo "Loading " . $book["name"] . "\n";
        $book["url"] = $firstValue($product, ".getDetails", "href");
        $book["retail_price"] = trim(str_replace('$', "", strip_tags((string) $firstValue($product, ".retailPrice"))));
        $book["sale_price"] = str_replace('$', "", (string) $firstValue($product, ".yourPrice"));

        // Only fetch details when the product actually links to a detail page;
        // otherwise the base URL itself would be fetched and parsed as a book.
        $bookHtml = false;
        if ($book["url"] !== null) {
            $bookHtml = $fetchPage($baseUrl . $book["url"]);
        }

        // A failed or empty detail page is not fatal: the book is still
        // recorded with the data taken from the index page.
        if ($bookHtml !== false && $bookHtml !== '') {
            $crawler = new Crawler($bookHtml);

            $crawler->filter(".col-sm-6.col-md-4 .thumbnail")->each(function (Crawler $thumbnail, $i) use (&$book, $firstValue) {
                // The first match is skipped: because of invalid HTML on the page this was
                // the only way found to select just the feature thumbnails.
                if ($i > 0) {
                    $feature = [];
                    $feature["type"] = $firstValue($thumbnail, ".feature-type");
                    $feature["name"] = $firstValue($thumbnail, "h3");
                    $feature["offer"] = $firstValue($thumbnail, "h4");
                    $feature["detail"] = $firstValue($thumbnail, ".featured-details");
                    $feature["location"] = $firstValue($thumbnail, ".featured-location");
                    $book["features"][] = $feature;
                }
            });

            $crawler->filter(".panel")->each(function (Crawler $panel, $i) use (&$book, $crawler, $firstValue) {
                $area = [];
                $area["name"] = $firstValue($panel, "a");

                // The n-th panel is paired with the element whose id is "modal<n>" (1-based).
                $modalId = "modal" . ($i + 1);
                $crawler->filter("#$modalId")->each(function (Crawler $modal) use (&$area) {
                    $area["cities"] = $modal->filter("li.cities")->extract(["_text"]);
                });

                // Not every panel lists companies; an empty list is fine here.
                $area["companies"] = $panel->filter("li")->extract(["_text"]);
                $book["areas"][] = $area;
            });

            // stripos() returns 0 for a match at the very start of the string,
            // so compare against false explicitly.
            if (stripos($bookHtml, 'free shipping') !== false) {
                $book["free_shipping"] = true;
            }
        }

        $books[] = $book;
    });
});

print_r($books);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.