stilliard/scrape.php

## scrape.php
<?php

//
// Quickly download some blog posts
//
// setup:
// mkdir -p images/post-{content,logos} # for images
// mkdir out # for the csv output
// composer init -q
// composer require voku/simple_html_dom
// composer require voku/portable-utf8
// composer require stilliard/csvparser
//

require_once __DIR__ . '/vendor/autoload.php';

use voku\helper\HtmlDomParser;

/**
 * "Dump and die": print every given value with var_dump(), then stop the script.
 *
 * @param mixed ...$args Values forwarded unchanged to var_dump().
 */
function dd(...$args)
{
    var_dump(...$args);

    // Nothing after this call runs; the script ends here.
    die();
}

/**
 * Print one log line of the form "[title] <json>" to standard output.
 *
 * @param string $title Label shown between the square brackets.
 * @param mixed  $data  Optional payload, JSON-encoded after the label; omitted when null.
 */
function debugLog($title, $data = null)
{
    // Compare against null instead of testing truthiness so that meaningful
    // falsy payloads (0, '', false, []) still show up in the log.
    $payload = $data === null ? '' : json_encode($data);

    echo "[{$title}] {$payload}\n";
}

/**
 * Read the whole resource at the given location in one call.
 *
 * @param string $url Location handed straight to file_get_contents().
 * @return string|false The data read, or false when file_get_contents() fails.
 */
function download($url)
{
    $contents = file_get_contents($url);

    return $contents;
}

/**
 * Download a remote resource and store it in a file relative to this script.
 *
 * @param string $from Location to fetch via download().
 * @param string $to   Destination path, relative to this script's directory.
 */
function downloadToFile($from, $to)
{
    debugLog('downloading', ['from' => $from, 'to' => $to]);

    $contents = download($from);
    if ($contents === false) {
        // A failed fetch would otherwise be coerced to '' and leave an empty
        // file behind, so report it and skip the write instead.
        debugLog('download failed', ['from' => $from]);
        return;
    }

    file_put_contents(__DIR__ . '/' . $to, $contents);
}

/**
 * Convert rows to CSV and write them to a file relative to this script.
 *
 * @param array  $array Rows handed to the CSV parser's fromArray().
 * @param string $file  Output path, relative to this script's directory.
 */
function writeCsv($array, $file)
{
    $target = __DIR__ . '/' . $file;

    $csvParser = new \CsvParser\Parser();
    $written = $csvParser->toFile($csvParser->fromArray($array), $target);

    // Dump whatever toFile() reported so the outcome is visible on the console
    var_dump($written);
}

/**
 * Parse an HTML string into a queryable DOM via HtmlDomParser.
 *
 * @param string $html Markup to parse.
 * @return mixed Whatever HtmlDomParser::str_get_html() returns for the markup.
 */
function parseHtml($html)
{
    $dom = HtmlDomParser::str_get_html($html);

    return $dom;
}

/**
 * Download one listing page and extract the posts found on it.
 *
 * @param string $url Address of the listing page.
 * @return mixed The result of findPosts() for the downloaded page.
 */
function crawlForPosts($url)
{
    debugLog('crawl', $url);

    $html = download($url);

    return findPosts(parseHtml($html));
}

/**
 * Extract every post listed on a parsed news index page.
 *
 * For each ".news_box" element this downloads the listing image, fetches the
 * linked post page, downloads the images inside its ".content-main" area and
 * rewrites their src attributes to local thumbnail paths.
 *
 * @param mixed $dom Parsed index page as returned by parseHtml().
 * @return object[] One object per post with link, title, date, image and body
 *                  properties; an empty array when the page lists no posts.
 */
function findPosts($dom)
{
    $posts = $dom->find('.news_box');
    $found = $posts->count();
    if ($found === 0) {
        debugLog('no found posts');
        // An empty list (rather than null) keeps callers that array_merge()
        // the result working; array_merge() rejects null on PHP 8.
        return [];
    }

    debugLog('found posts', $found);

    $data = [];
    foreach ($posts as $post) {
        $link = $post->findOne('.news_info h3 a');
        debugLog('post', $link->href);

        // Save the listing image locally under its original file name
        $image = $post->findOne('img')->src;
        $imagePath = 'images/post-logos/' . basename($image);
        downloadToFile(CRAWL_ORIGIN . $image, $imagePath);

        // Fetch and parse the full post page
        $postData = parseHtml(download(CRAWL_ORIGIN . $link->href));
        $postContent = $postData->find('.content-main');
        // Keep only digits and hyphens from the title's span text
        $postDate = preg_replace('/[^\d\-]/', '', $postContent->findOne('.news_title span')->innertext);

        // Download each inline image, then point its src at the local thumbnail path
        foreach ($postData->find('.content-main img') as $contentImage) {
            $contentImagePath = 'images/post-content/' . basename($contentImage->src);
            downloadToFile(CRAWL_ORIGIN . $contentImage->src, $contentImagePath);
            $contentImage->src = '/thumbnail/600x600/userfiles/' . $contentImagePath;
        }

        $data[] = (object) [
            'link' => CRAWL_ORIGIN . $link->href,
            'title' => $link->innertext,
            'date' => $postDate,
            'image' => '/userfiles/' . $imagePath,
            // Cast after the src rewrite above; presumably the stored HTML then
            // carries the local image paths (TODO confirm against the DOM library)
            'body' => (string) $postContent,
        ];
    }

    return $data;
}

// Base URL of the site being scraped; relative links and image paths are appended to it
define('CRAWL_ORIGIN', 'http://www.forgemotorsport.asia/');

// Listing pages to crawl, in order
$pages = ['news.php', 'news.php?page=2', 'news.php?page=3'];

$data = [];
foreach ($pages as $page) {
    // Guard against a null result (a page without posts): array_merge()
    // rejects non-array arguments on PHP 8.
    $data = array_merge($data, crawlForPosts(CRAWL_ORIGIN . $page) ?? []);
}

var_dump($data);

// Build the rows for the two CSV exports; ids are sequential starting at 1
$blog = [];
$blog_blocks = [];
$i = 1;
foreach ($data as $post) {
    $blog[] = [
        'id' => $i,
        'blog_title' => $post->title,
        'date_added' => $post->date . ' 00:00:00',
        'related_image' => $post->image,
        'status' => 'live',
        'related_id' => '0',
        'is_private_blog' => '',
        'allow_comments' => '',
        'allow_guest_comments' => '',
        'auto_approve_comments' => '',
        'blog_tags' => '',
        'category' => '0',
        'user_id' => '1',
        'email_when_comments_added' => '',
        'extra_field_content_1' => '',
        'extra_field_content_2' => '',
        'featured' => '0',
        'meta_title' => '',
        'meta_keywords' => '',
        'meta_description' => '',
    ];
    // One content block per post, holding the scraped body HTML
    $blog_blocks[] = [
        'id' => $i,
        'blog_id' => $i,
        'type' => 'content-block',
        'content' => $post->body,
        'sort_order' => '1',
    ];

    $i++;
}

writeCsv($blog, 'out/blog.csv');
writeCsv($blog_blocks, 'out/blog_blocks.csv');

debugLog('complete');
	<?php

	//
	// Quickly download some blog posts
	//
	// setup:
	// mkdir -p images/post-{content,logos} # for images
	// mkdir out # for the csv output
	// composer init -q
	// composer require voku/simple_html_dom
	// composer require voku/portable-utf8
	// composer require stilliard/csvparser
	//

	require_once __DIR__ . '/vendor/autoload.php';

	use voku\helper\HtmlDomParser;

	/**
	 * "Dump and die": print every given value with var_dump(), then stop the script.
	 *
	 * @param mixed ...$args Values forwarded unchanged to var_dump().
	 */
	function dd(...$args)
	{
	    var_dump(...$args);

	    // Nothing after this call runs; the script ends here.
	    die();
	}

	/**
	 * Print one log line of the form "[title] <json>" to standard output.
	 *
	 * @param string $title Label shown between the square brackets.
	 * @param mixed  $data  Optional payload, JSON-encoded after the label; omitted when null.
	 */
	function debugLog($title, $data = null)
	{
	    // Compare against null instead of testing truthiness so that meaningful
	    // falsy payloads (0, '', false, []) still show up in the log.
	    $payload = $data === null ? '' : json_encode($data);

	    echo "[{$title}] {$payload}\n";
	}

	/**
	 * Read the whole resource at the given location in one call.
	 *
	 * @param string $url Location handed straight to file_get_contents().
	 * @return string|false The data read, or false when file_get_contents() fails.
	 */
	function download($url)
	{
	    $contents = file_get_contents($url);

	    return $contents;
	}

	/**
	 * Download a remote resource and store it in a file relative to this script.
	 *
	 * @param string $from Location to fetch via download().
	 * @param string $to   Destination path, relative to this script's directory.
	 */
	function downloadToFile($from, $to)
	{
	    debugLog('downloading', ['from' => $from, 'to' => $to]);

	    $contents = download($from);
	    if ($contents === false) {
	        // A failed fetch would otherwise be coerced to '' and leave an empty
	        // file behind, so report it and skip the write instead.
	        debugLog('download failed', ['from' => $from]);
	        return;
	    }

	    file_put_contents(__DIR__ . '/' . $to, $contents);
	}

	/**
	 * Convert rows to CSV and write them to a file relative to this script.
	 *
	 * @param array  $array Rows handed to the CSV parser's fromArray().
	 * @param string $file  Output path, relative to this script's directory.
	 */
	function writeCsv($array, $file)
	{
	    $target = __DIR__ . '/' . $file;

	    $csvParser = new \CsvParser\Parser();
	    $written = $csvParser->toFile($csvParser->fromArray($array), $target);

	    // Dump whatever toFile() reported so the outcome is visible on the console
	    var_dump($written);
	}

	/**
	 * Parse an HTML string into a queryable DOM via HtmlDomParser.
	 *
	 * @param string $html Markup to parse.
	 * @return mixed Whatever HtmlDomParser::str_get_html() returns for the markup.
	 */
	function parseHtml($html)
	{
	    $dom = HtmlDomParser::str_get_html($html);

	    return $dom;
	}

	/**
	 * Download one listing page and extract the posts found on it.
	 *
	 * @param string $url Address of the listing page.
	 * @return mixed The result of findPosts() for the downloaded page.
	 */
	function crawlForPosts($url)
	{
	    debugLog('crawl', $url);

	    $html = download($url);

	    return findPosts(parseHtml($html));
	}

	/**
	 * Extract every post listed on a parsed news index page.
	 *
	 * For each ".news_box" element this downloads the listing image, fetches the
	 * linked post page, downloads the images inside its ".content-main" area and
	 * rewrites their src attributes to local thumbnail paths.
	 *
	 * @param mixed $dom Parsed index page as returned by parseHtml().
	 * @return object[] One object per post with link, title, date, image and body
	 *                  properties; an empty array when the page lists no posts.
	 */
	function findPosts($dom)
	{
	    $posts = $dom->find('.news_box');
	    $found = $posts->count();
	    if ($found === 0) {
	        debugLog('no found posts');
	        // An empty list (rather than null) keeps callers that array_merge()
	        // the result working; array_merge() rejects null on PHP 8.
	        return [];
	    }

	    debugLog('found posts', $found);

	    $data = [];
	    foreach ($posts as $post) {
	        $link = $post->findOne('.news_info h3 a');
	        debugLog('post', $link->href);

	        // Save the listing image locally under its original file name
	        $image = $post->findOne('img')->src;
	        $imagePath = 'images/post-logos/' . basename($image);
	        downloadToFile(CRAWL_ORIGIN . $image, $imagePath);

	        // Fetch and parse the full post page
	        $postData = parseHtml(download(CRAWL_ORIGIN . $link->href));
	        $postContent = $postData->find('.content-main');
	        // Keep only digits and hyphens from the title's span text
	        $postDate = preg_replace('/[^\d\-]/', '', $postContent->findOne('.news_title span')->innertext);

	        // Download each inline image, then point its src at the local thumbnail path
	        foreach ($postData->find('.content-main img') as $contentImage) {
	            $contentImagePath = 'images/post-content/' . basename($contentImage->src);
	            downloadToFile(CRAWL_ORIGIN . $contentImage->src, $contentImagePath);
	            $contentImage->src = '/thumbnail/600x600/userfiles/' . $contentImagePath;
	        }

	        $data[] = (object) [
	            'link' => CRAWL_ORIGIN . $link->href,
	            'title' => $link->innertext,
	            'date' => $postDate,
	            'image' => '/userfiles/' . $imagePath,
	            // Cast after the src rewrite above; presumably the stored HTML then
	            // carries the local image paths (TODO confirm against the DOM library)
	            'body' => (string) $postContent,
	        ];
	    }

	    return $data;
	}

	// Base URL of the site being scraped; relative links and image paths are appended to it
	define('CRAWL_ORIGIN', 'http://www.forgemotorsport.asia/');

	// Listing pages to crawl, in order
	$pages = ['news.php', 'news.php?page=2', 'news.php?page=3'];

	$data = [];
	foreach ($pages as $page) {
	    // Guard against a null result (a page without posts): array_merge()
	    // rejects non-array arguments on PHP 8.
	    $data = array_merge($data, crawlForPosts(CRAWL_ORIGIN . $page) ?? []);
	}

	var_dump($data);

	// Build the rows for the two CSV exports; ids are sequential starting at 1
	$blog = [];
	$blog_blocks = [];
	$i = 1;
	foreach ($data as $post) {
	    $blog[] = [
	        'id' => $i,
	        'blog_title' => $post->title,
	        'date_added' => $post->date . ' 00:00:00',
	        'related_image' => $post->image,
	        'status' => 'live',
	        'related_id' => '0',
	        'is_private_blog' => '',
	        'allow_comments' => '',
	        'allow_guest_comments' => '',
	        'auto_approve_comments' => '',
	        'blog_tags' => '',
	        'category' => '0',
	        'user_id' => '1',
	        'email_when_comments_added' => '',
	        'extra_field_content_1' => '',
	        'extra_field_content_2' => '',
	        'featured' => '0',
	        'meta_title' => '',
	        'meta_keywords' => '',
	        'meta_description' => '',
	    ];
	    // One content block per post, holding the scraped body HTML
	    $blog_blocks[] = [
	        'id' => $i,
	        'blog_id' => $i,
	        'type' => 'content-block',
	        'content' => $post->body,
	        'sort_order' => '1',
	    ];

	    $i++;
	}

	writeCsv($blog, 'out/blog.csv');
	writeCsv($blog_blocks, 'out/blog_blocks.csv');

	debugLog('complete');