Skip to content

Instantly share code, notes, and snippets.

@scrapingace
Last active November 15, 2022 20:14
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save scrapingace/72d35d3f813c23482bd361cacd61be9c to your computer and use it in GitHub Desktop.
Save scrapingace/72d35d3f813c23482bd361cacd61be9c to your computer and use it in GitHub Desktop.
Simple PHP Scraper Example
<?php
require 'vendor/autoload.php';
use GuzzleHttp\Client;
/**
 * Scrapes the book listing served at the root of https://books.toscrape.com/
 * and appends one CSV row per book to ./result.csv.
 *
 * Usage: create an instance and call run(). The three stages (download,
 * parse, extract + export) communicate through the properties below.
 */
class BooksScraper
{
    // The properties are declared explicitly because creating them on the fly
    // (dynamic properties) is deprecated as of PHP 8.2. They stay public so
    // that code which read them when they were dynamic keeps working.

    /** @var string Base URL for the HTTP client; also prefixed to scraped relative links. */
    public $base_uri;

    /** @var Client Guzzle HTTP client configured with $base_uri. */
    public $client;

    /** @var string|null Raw HTML of the fetched page; null until load_html() has run. */
    public $html;

    /** @var DOMDocument|null Parsed document; null until load_dom() has run. */
    public $doc;

    /** @var DOMXPath|null XPath evaluator over $doc; null until load_dom() has run. */
    public $xpath;

    /**
     * Sets the base URL and builds the Guzzle client used for the download.
     */
    public function __construct()
    {
        $this->base_uri = 'https://books.toscrape.com/';

        $this->client = new Client([
            'base_uri' => $this->base_uri,
            'timeout' => 300.0, // seconds
        ]);
    }

    /**
     * Runs the whole pipeline: download the page, parse it, extract the books
     * and append them to ./result.csv.
     *
     * @throws Exception When the page is empty or unparsable, when no book
     *                   nodes are found, or when the CSV file cannot be opened.
     *                   Errors raised by the HTTP client are not caught here.
     */
    public function run()
    {
        $this->load_html();
        $this->load_dom();
        $this->scrape();
    }

    /**
     * Downloads the page at "/" (relative to the client's base URI) into $html.
     */
    private function load_html()
    {
        $response = $this->client->get('/');
        $this->html = $response->getBody()->getContents();
    }

    /**
     * Parses $html into $doc and prepares $xpath for querying.
     *
     * @throws Exception When there is no HTML to parse or parsing fails.
     */
    private function load_dom()
    {
        if ($this->html === null || $this->html === '') {
            throw new Exception('No HTML content.');
        }

        // Real-world markup is rarely valid, so libxml parse warnings are
        // collected internally and discarded. Unlike the "@" operator this
        // hides only libxml diagnostics, and the previous setting is restored
        // even if loadHTML() throws.
        $previous = libxml_use_internal_errors(true);
        try {
            $this->doc = new DOMDocument();
            $loaded = $this->doc->loadHTML($this->html);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if ($loaded === false) {
            throw new Exception('Unable to parse HTML content.');
        }

        $this->xpath = new DOMXPath($this->doc);
    }

    /**
     * Finds every book node, converts each to a row and writes the rows out.
     *
     * @throws Exception When the page contains no book nodes.
     */
    private function scrape()
    {
        $elements = $this->xpath->query("//ol[@class='row']//li//article");
        if ($elements === false || $elements->length === 0) {
            throw new Exception('Elements not present for scraping.');
        }

        $data = [];
        foreach ($elements as $element) {
            $data[] = $this->parse_node($element);
        }

        $this->to_csv($data);
    }

    /**
     * Builds one CSV row from a single book node.
     *
     * The key order matches the header row written by to_csv(). The image and
     * details URLs are formed by prefixing $base_uri to the scraped attribute
     * value.
     *
     * @param DOMNode $element Book node used as the XPath context.
     * @return array Map with keys image_path, title, price, availability, details_link.
     */
    private function parse_node($element)
    {
        $item = [];
        $item['image_path'] = $this->base_uri . $this->extract(".//div[@class='image_container']//a//img/@src", $element);
        $item['title'] = $this->extract(".//h3//a", $element);
        $item['price'] = $this->extract(".//div[@class='product_price']//p[@class='price_color']", $element);
        $item['availability'] = $this->extract(".//div[@class='product_price']//p[@class='instock availability']", $element);
        $item['details_link'] = $this->base_uri . $this->extract(".//h3//a/@href", $element);

        return $item;
    }

    /**
     * Returns the trimmed value of the first node matching an XPath expression.
     *
     * @param string  $node    XPath expression, evaluated relative to $element.
     * @param DOMNode $element Context node for the expression.
     * @return string Trimmed node value, or '' when nothing matches.
     */
    private function extract($node, $element)
    {
        $matches = $this->xpath->query($node, $element);

        // A book without the requested node (e.g. one that is not in stock)
        // yields an empty field instead of dereferencing null.
        if ($matches === false || $matches->length === 0) {
            return '';
        }

        return trim((string) $matches->item(0)->nodeValue);
    }

    /**
     * Appends the rows to ./result.csv, writing the header row first only when
     * the file is still empty so repeated runs do not repeat the header.
     *
     * @param array $data List of rows as produced by parse_node().
     * @throws Exception When the file cannot be opened.
     */
    private function to_csv($data)
    {
        $file = fopen('./result.csv', 'a');
        if ($file === false) {
            throw new Exception('Unable to open ./result.csv for writing.');
        }

        try {
            $stat = fstat($file);
            if ($stat === false || $stat['size'] === 0) {
                $this->put_row($file, ['image_path', 'title', 'price', 'availability', 'details_link']);
            }

            foreach ($data as $item) {
                $this->put_row($file, $item);
            }
        } finally {
            fclose($file);
        }
    }

    /**
     * Writes one CSV row using the separator, enclosure and escape characters
     * that are fputcsv()'s defaults.
     *
     * NOTE(review): they are spelled out because relying on the default escape
     * character is reported as deprecated from PHP 8.4 - confirm against the
     * fputcsv() manual page.
     *
     * @param resource $file Open, writable file handle.
     * @param array    $row  Field values for the row.
     */
    private function put_row($file, $row)
    {
        fputcsv($file, $row, ',', '"', '\\');
    }
}
// Script entry point: build the scraper and run the full pipeline.
$scraper = new BooksScraper;
$scraper->run();
// Nothing here catches exceptions, so this line is reached only when run()
// returned normally.
print 'Success!';
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment