Skip to content

Instantly share code, notes, and snippets.

@boombatower
Created September 27, 2015 05:15
Show Gist options
  • Save boombatower/3feeec99d9b1de3f1734 to your computer and use it in GitHub Desktop.
Save boombatower/3feeec99d9b1de3f1734 to your computer and use it in GitHub Desktop.
Web scraper for vBulletin. Creates a PDF of each page and of the whole thread, and downloads all attached images.
<?php
// URL of the thread's first page; page N (N > 1) is fetched from
// BASE . '/page' . N. It is empty here and has to be filled in before running.
const BASE = '';
// Number of thread pages to process.
const COUNT = 1;

// Names of the per-page PDF files, in page order.
$pages = [];
for ($i = 1; $i <= COUNT; $i++) {
  // The first page is BASE itself; only later pages carry a suffix.
  $suffix = $i > 1 ? '/page' . $i : '';
  $pages[] = $page = 'page' . $i . '.pdf';
  $url = BASE . $suffix;

  // Render this page to its own PDF.
  execute('wkhtmltopdf ' . escapeshellarg($url) . ' ' . escapeshellarg($page));

  print_section('Extracting images...');
  if ($xml = load($url)) {
    if (!is_dir('images')) {
      mkdir('images');
    }
    // Running number of the image within the current page; part of the
    // output file name.
    $key = 1;
    foreach ($xml->xpath('//div[@id="postlist"]//div[@class="postrow"]//img[contains(@src, "attachment")]') as $image) {
      // NOTE(review): the src attribute is handed to wget unchanged, so this
      // presumably relies on the forum emitting absolute URLs - confirm.
      $src = (string) $image['src'];
      // Every download is stored with a .jpg extension, whatever the source is.
      $out = 'images/page' . $i . '.' . $key++ . '.jpg';
      execute('wget -O ' . escapeshellarg($out) . ' ' . escapeshellarg($src));
    }
  }
  else {
    echo "FAILED TO LOAD $url\n";
  }
}

// Join the per-page PDFs into one document. The file names are escaped the
// same way as the arguments of every other command built in this script.
execute('pdfunite ' . implode(' ', array_map('escapeshellarg', $pages)) . ' ' . escapeshellarg('full.pdf'));
/**
 * Prints a shell command as a section header, then runs it.
 *
 * The command's output is passed straight through to this script's output.
 * A non-zero exit status is reported instead of being silently dropped.
 *
 * @param string $command
 *   The complete shell command line; it is run as given, so its arguments
 *   must already be escaped by the caller.
 *
 * @return int
 *   The exit status reported for the command (-1 if none was reported).
 */
function execute($command) {
  print_section($command);
  // Pre-set so a call that never reports a status still counts as a failure.
  $status = -1;
  passthru($command, $status);
  if ($status !== 0) {
    echo "COMMAND FAILED (exit status $status): $command\n";
  }
  return $status;
}
/**
 * Writes a message to the output, set off by two newlines before and after.
 *
 * @param string $text
 *   The message to print.
 */
function print_section($text) {
  $padding = "\n\n";
  echo $padding . $text . $padding;
}
/**
 * Fetches an HTML document and exposes it as a SimpleXML tree.
 *
 * @param string $url
 *   Anything file_get_contents() can read (URL or local path).
 *
 * @return SimpleXMLElement|false
 *   The root element of the parsed document, or FALSE when the content could
 *   not be read, was empty, or could not be parsed.
 */
function load($url) {
  $contents = file_get_contents($url);
  if ($contents === false || $contents === '') {
    return false;
  }

  // Real-world HTML is rarely well formed; collect libxml's complaints
  // internally instead of emitting a warning for each one, then restore the
  // previous setting.
  $previous = libxml_use_internal_errors(true);
  // loadHTML() is an instance method; calling it statically throws an Error
  // on PHP 8 and later.
  $document = new DOMDocument();
  $loaded = $document->loadHTML($contents);
  libxml_clear_errors();
  libxml_use_internal_errors($previous);

  if (!$loaded) {
    return false;
  }

  $xml = simplexml_import_dom($document);
  // Keep the documented "object or FALSE" contract even if the import fails.
  return $xml instanceof SimpleXMLElement ? $xml : false;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment