Skip to content

Instantly share code, notes, and snippets.

@stilliard
Created May 20, 2020 11:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save stilliard/b981fcefe70fc7ab2649eb624479cd97 to your computer and use it in GitHub Desktop.
Save stilliard/b981fcefe70fc7ab2649eb624479cd97 to your computer and use it in GitHub Desktop.
Scrape some blog posts and their images
<?php
//
// Quickly download some blog posts
//
// setup:
// mkdir -p images/post-{content,logos} # for images
// mkdir out # for the csv output
// composer init -q
// composer require voku/simple_html_dom
// composer require voku/portable-utf8
// composer require stilliard/csvparser
//
require_once __DIR__ . '/vendor/autoload.php';
use voku\helper\HtmlDomParser;
/**
 * Dump-and-die debugging helper: var_dump()s every argument, then ends the script.
 *
 * @param mixed ...$args Values to dump.
 */
function dd(...$args)
{
    var_dump(...$args);

    // Stop here so nothing after the dump runs.
    exit();
}
/**
 * Print a one-line progress message to stdout in the form "[title] <json>".
 *
 * @param string $title Short label shown inside the square brackets.
 * @param mixed  $data  Optional payload, JSON-encoded after the label; omitted only when null.
 */
function debugLog($title, $data = null)
{
    // Compare against null explicitly so falsy payloads (0, false, '', []) are still printed.
    $payload = $data !== null ? json_encode($data) : '';

    echo "[{$title}] {$payload}\n";
}
/**
 * Read the full contents of a URL (or any other location file_get_contents() accepts).
 *
 * @param string $url Location to read.
 * @return string|false The contents, or false when file_get_contents() fails.
 */
function download($url)
{
    $contents = file_get_contents($url);

    return $contents;
}
/**
 * Download a remote resource and store it relative to this script's directory.
 *
 * Failures are logged and skipped rather than raised, so one broken image does
 * not abort the whole crawl.
 *
 * @param string $from Location to download.
 * @param string $to   Destination path, relative to this script's directory.
 */
function downloadToFile($from, $to)
{
    debugLog('downloading', ['from' => $from, 'to' => $to]);

    $contents = download($from);
    if ($contents === false) {
        // Writing `false` would silently create a zero-byte file; skip it instead.
        debugLog('download failed', $from);
        return;
    }

    if (file_put_contents(__DIR__ . '/' . $to, $contents) === false) {
        debugLog('write failed', $to);
    }
}
/**
 * Convert rows to CSV with CsvParser and write the result relative to this script's directory.
 *
 * @param array  $array Rows to export, handed to the parser's fromArray().
 * @param string $file  Output path, relative to this script's directory.
 */
function writeCsv($array, $file)
{
    $csvParser = new \CsvParser\Parser();
    $document = $csvParser->fromArray($array);
    $target = __DIR__ . '/' . $file;
    $result = $csvParser->toFile($document, $target);

    // Dump whatever toFile() reports so the outcome of the write shows up on the console.
    var_dump($result);
}
/**
 * Parse an HTML string into a queryable DOM using voku's HtmlDomParser.
 *
 * @param string $html Raw markup.
 * @return mixed Whatever HtmlDomParser::str_get_html() produces for the markup.
 */
function parseHtml($html)
{
    $document = HtmlDomParser::str_get_html($html);

    return $document;
}
/**
 * Download a listing page and scrape every post found on it.
 *
 * @param string $url Absolute URL of the listing page.
 * @return array List of post objects as built by findPosts(); empty when the page
 *               could not be downloaded or yields no posts.
 */
function crawlForPosts($url)
{
    debugLog('crawl', $url);

    $html = download($url);
    if ($html === false) {
        // download() passes through file_get_contents()'s false on failure; don't feed that to the parser.
        debugLog('crawl failed', $url);
        return [];
    }

    // Normalise a null result to an empty array so callers can always array_merge() it.
    return findPosts(parseHtml($html)) ?? [];
}
/**
 * Scrape every post listed in a parsed news listing page.
 *
 * For each `.news_box` element this downloads the box's image, fetches the linked
 * post page, downloads every image inside its `.content-main` area and rewrites
 * each of those images' src to a `/thumbnail/600x600/userfiles/...` path.
 *
 * @param object $dom Parsed listing page, as returned by parseHtml().
 * @return array List of objects with `link`, `title`, `date`, `image` and `body`
 *               properties; an empty array when the page lists no posts.
 */
function findPosts($dom)
{
    $posts = $dom->find('.news_box');
    $found = $posts->count();
    if ($found === 0) {
        debugLog('no found posts');
        // Return an empty list (not null) so callers can array_merge() the result safely.
        return [];
    }
    debugLog('found posts', $found);

    $data = [];
    foreach ($posts as $post) {
        $link = $post->findOne('.news_info h3 a');
        debugLog('post', $link->href);

        // Listing image: stored locally under its original base name.
        // NOTE(review): assumes src/href values are relative to CRAWL_ORIGIN — confirm against the site markup.
        $image = $post->findOne('img')->src;
        $imagePath = 'images/post-logos/' . basename($image);
        downloadToFile(CRAWL_ORIGIN . $image, $imagePath);

        // Fetch and parse the full post page.
        $postData = parseHtml(download(CRAWL_ORIGIN . $link->href));
        $postContent = $postData->find('.content-main');

        // Strip everything except digits and hyphens from the date label.
        $postDate = preg_replace('/[^\d\-]/', '', $postContent->findOne('.news_title span')->innertext);

        // Download each inline image and point its src at the local thumbnail path.
        foreach ($postData->find('.content-main img') as $contentImage) {
            $contentImagePath = 'images/post-content/' . basename($contentImage->src);
            downloadToFile(CRAWL_ORIGIN . $contentImage->src, $contentImagePath);
            $contentImage->src = '/thumbnail/600x600/userfiles/' . $contentImagePath;
        }

        // The body is stringified after the src rewrite above.
        // NOTE(review): presumably the rewritten src values appear in it — verify against the DOM library.
        $data[] = (object) [
            'link' => CRAWL_ORIGIN . $link->href,
            'title' => $link->innertext,
            'date' => $postDate,
            'image' => '/userfiles/' . $imagePath,
            'body' => (string) $postContent,
        ];
    }

    return $data;
}
// Base URL of the site being scraped; the scraping functions prepend it to the links they find.
define('CRAWL_ORIGIN', 'http://www.forgemotorsport.asia/');

// Listing pages to crawl, relative to CRAWL_ORIGIN.
$pages = ['news.php', 'news.php?page=2', 'news.php?page=3'];

$data = [];
foreach ($pages as $page) {
    // Guard against a null result (e.g. a page without posts) so array_merge() never receives null.
    $data = array_merge($data, crawlForPosts(CRAWL_ORIGIN . $page) ?? []);
}
var_dump($data);

// Build the two CSV tables: one row per post, plus one content block per post.
$blog = [];
$blog_blocks = [];
$id = 1;
foreach ($data as $post) {
    $blog[] = [
        'id' => $id,
        'blog_title' => $post->title,
        'date_added' => $post->date . ' 00:00:00',
        'related_image' => $post->image,
        'status' => 'live',
        'related_id' => '0',
        'is_private_blog' => '',
        'allow_comments' => '',
        'allow_guest_comments' => '',
        'auto_approve_comments' => '',
        'blog_tags' => '',
        'category' => '0',
        'user_id' => '1',
        'email_when_comments_added' => '',
        'extra_field_content_1' => '',
        'extra_field_content_2' => '',
        'featured' => '0',
        'meta_title' => '',
        'meta_keywords' => '',
        'meta_description' => '',
    ];
    // The block row reuses the post's id for both its own id and the blog_id reference.
    $blog_blocks[] = [
        'id' => $id,
        'blog_id' => $id,
        'type' => 'content-block',
        'content' => $post->body,
        'sort_order' => '1',
    ];
    $id++;
}

writeCsv($blog, 'out/blog.csv');
writeCsv($blog_blocks, 'out/blog_blocks.csv');
debugLog('complete');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment