Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save karimkawambwa/caa5fde0a2ea64b380f1f23f1c2f0b01 to your computer and use it in GitHub Desktop.
Save karimkawambwa/caa5fde0a2ea64b380f1f23f1c2f0b01 to your computer and use it in GitHub Desktop.
Coastpress Migration Script
<?php
/**
 * Look up a published post of type "post" by its exact title.
 *
 * @param string $title Title compared against the post_title column.
 * @return mixed Result of $wpdb->get_var() for the lookup query (the ID of the
 *               first matching row when there is one).
 */
function post_exists_by_title($title) {
    global $wpdb;

    // The title is bound through prepare(), never interpolated into the SQL.
    $sql = "SELECT ID FROM {$wpdb->posts} WHERE post_title = %s AND post_type = 'post' AND post_status = 'publish' LIMIT 1";
    $prepared = $wpdb->prepare($sql, $title);
    $matching_id = $wpdb->get_var($prepared);

    return $matching_id;
}
/**
 * Download a remote image into the current uploads folder and use it as the
 * featured image (post thumbnail) of a post.
 *
 * If a file with the same name already exists in the uploads folder it is not
 * downloaded again; the attachment registered for that file is reused instead.
 * The call is a no-op when the post ID or URL is unusable, when the URL does
 * not point at a recognised image type, or when the download/write fails.
 *
 * @param int    $post_id   ID of the post that should receive the thumbnail.
 * @param string $image_url Absolute URL of the image to import.
 * @return void
 */
function set_featured_image($post_id, $image_url) {
    if (empty($image_url) || empty($post_id) || is_wp_error($post_id)) {
        return;
    }

    $upload_dir = wp_upload_dir(); // WordPress upload directory
    if (!empty($upload_dir['error'])) {
        return;
    }

    // Build the file name from the URL path only (no query string). Decode
    // BEFORE taking the basename so an encoded "%2F" cannot smuggle directory
    // separators into the target path, then normalise the name for storage.
    $url_path = parse_url($image_url, PHP_URL_PATH);
    if (!is_string($url_path) || $url_path === '') {
        return;
    }
    $filename = sanitize_file_name(basename(urldecode($url_path)));
    if ($filename === '') {
        return;
    }

    // The URL comes from scraped markup, so only accept image file types.
    $wp_filetype = wp_check_filetype($filename, null);
    if (empty($wp_filetype['type']) || strpos($wp_filetype['type'], 'image/') !== 0) {
        return;
    }

    $file_path = $upload_dir['path'] . '/' . $filename;

    if (file_exists($file_path)) {
        // File exists, find the attachment ID instead of downloading again
        $attach_id = attachment_url_to_postid($upload_dir['url'] . '/' . $filename);
    } else {
        $image_data = file_get_contents($image_url);
        if ($image_data === false || $image_data === '') {
            return; // download failed; do not leave an empty file behind
        }
        if (file_put_contents($file_path, $image_data) === false) {
            return;
        }

        $attachment = array(
            'post_mime_type' => $wp_filetype['type'],
            'post_title' => sanitize_file_name($filename),
            'post_content' => '',
            'post_status' => 'inherit'
        );
        $attach_id = wp_insert_attachment($attachment, $file_path, $post_id);

        if ($attach_id && !is_wp_error($attach_id)) {
            // wp_generate_attachment_metadata() lives in an admin include that
            // is not loaded on every request.
            require_once ABSPATH . 'wp-admin/includes/image.php';
            $metadata = wp_generate_attachment_metadata($attach_id, $file_path);
            wp_update_attachment_metadata($attach_id, $metadata);
        }
    }

    if (empty($attach_id) || is_wp_error($attach_id)) {
        return;
    }

    // Set the found or new attachment as the featured image
    set_post_thumbnail($post_id, $attach_id);
}
/**
 * Download a post page and return the markup of its body container.
 *
 * The body container is the first <div> whose class attribute contains
 * "post-body". The literal string 'No content found' is returned when the URL
 * is empty, the request fails, or no such element is present.
 *
 * @param string|null $url URL of the post page.
 * @return string HTML of the body container, or 'No content found'.
 */
function fetch_full_post_content($url) {
    $fallback = 'No content found';

    if (!$url) {
        return $fallback;
    }

    // Initialize cURL session
    $ch = curl_init();
    if ($ch === false) {
        return $fallback;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MyScraperBot/1.0; +http://example.com/bot)');
    // Bound the request so one unresponsive page cannot stall the whole run.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    // Execute cURL session and close it
    $html = curl_exec($ch);
    curl_close($ch);

    // curl_exec() yields false on failure; loadHTML() must not receive that
    // or an empty document.
    if (!is_string($html) || trim($html) === '') {
        return $fallback;
    }

    // Collect libxml parse warnings internally while loading real-world HTML,
    // then put the previous error mode back.
    $previous_mode = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous_mode);

    $xpath = new DOMXPath($doc);
    $matches = $xpath->query("//div[contains(@class, 'post-body')]");
    $contentNode = $matches ? $matches->item(0) : null;
    if (!$contentNode) {
        return $fallback;
    }

    // Serialise as HTML. XML canonicalisation (C14N) would expand void
    // elements into pairs such as <br></br>, which HTML parsers misread.
    $markup = $doc->saveHTML($contentNode);

    return is_string($markup) ? $markup : $fallback;
}
/**
 * Import every post listed on a blog page and on all following pages.
 *
 * Each element matching div.post-outer is turned into a WordPress post (see
 * coastpress_import_listed_post()). After a page is processed the link titled
 * "More posts" is followed, until no such link exists, a page cannot be
 * fetched, or the link leads to a page that was already processed.
 *
 * @param string $url URL of the first listing page.
 * @return void
 */
function fetch_posts($url) {
    $visited = array();

    // Iterate rather than recurse so long archives do not deepen the call
    // stack, and remember visited URLs so a circular link cannot loop forever.
    while ($url && !isset($visited[$url])) {
        $visited[$url] = true;

        // Initialize cURL session
        $ch = curl_init();
        if ($ch === false) {
            echo "Failed to fetch page: " . esc_html($url) . "<br>";
            return;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MyScraperBot/1.0; +http://example.com/bot)');
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        // Execute cURL session and close it
        $html = curl_exec($ch);
        curl_close($ch);

        // curl_exec() yields false on failure; loadHTML() must not receive
        // that or an empty document.
        if (!is_string($html) || trim($html) === '') {
            echo "Failed to fetch page: " . esc_html($url) . "<br>";
            return;
        }

        $doc = new DOMDocument();
        libxml_use_internal_errors(true); // Suppress libXML errors
        $doc->loadHTML($html);
        libxml_clear_errors();
        $xpath = new DOMXPath($doc);

        foreach ($xpath->query("//div[contains(@class, 'post-outer')]") as $post) {
            coastpress_import_listed_post($xpath, $post);
        }

        // Check for the next page
        $nextPage = $xpath->query("//a[@title='More posts']")->item(0);
        $url = $nextPage ? $nextPage->getAttribute('href') : '';
    }
}

/**
 * Create or update the WordPress post for one listing entry and attach its
 * featured image. Private helper of fetch_posts().
 *
 * Entries without a title link are skipped: there is nothing to identify them
 * by, and giving them all one placeholder title would make them overwrite
 * each other through the title-based lookup.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the listing document.
 * @param DOMNode  $post  The div.post-outer element of the entry.
 * @return void
 */
function coastpress_import_listed_post($xpath, $post) {
    $titleNode = $xpath->query(".//h3[contains(@class, 'post-title')]/a", $post)->item(0);
    if (!$titleNode) {
        return;
    }

    $title = sanitize_text_field($titleNode->nodeValue);
    $postUrl = $titleNode->getAttribute('href');
    if ($title === '' || $postUrl === '') {
        return;
    }

    // Fetch full post content from the post's page
    $fullPostContent = fetch_full_post_content($postUrl);

    // Extract excerpt from snippet-item
    $excerptNode = $xpath->query(".//div[contains(@class, 'snippet-item')]", $post)->item(0);
    $excerpt = $excerptNode ? $excerptNode->nodeValue : '';

    // Publication time: fall back to "now" when the attribute is missing or
    // cannot be parsed (strtotime() returns false in that case).
    $dateNode = $xpath->query(".//time[@class='published']", $post)->item(0);
    $timestamp = $dateNode ? strtotime($dateNode->getAttribute('datetime')) : false;
    if ($timestamp === false) {
        $timestamp = time();
    }
    // Store the GMT value explicitly and derive the site-local value from it,
    // so the result does not depend on PHP's default timezone.
    $post_date_gmt = gmdate('Y-m-d H:i:s', $timestamp);

    $post_data = array(
        'post_title' => $title,
        'post_content' => wp_kses_post($fullPostContent), // Use the full content
        'post_status' => 'publish',
        'post_author' => 1,
        'post_date' => get_date_from_gmt($post_date_gmt),
        'post_date_gmt' => $post_date_gmt,
        'post_category' => array(1),
        'post_excerpt' => sanitize_text_field($excerpt)
    );

    $existing_post_id = post_exists_by_title($post_data['post_title']);
    if ($existing_post_id) {
        // Post exists, update it
        $post_data['ID'] = $existing_post_id; // Set the ID to update the existing post
        if ($fullPostContent === 'No content found') {
            // The content fetch failed; keep what is stored instead of
            // replacing it with the placeholder text.
            unset($post_data['post_content']);
        }
        $post_id = wp_update_post($post_data);
        $action = "Updated existing post: ";
    } else {
        // Post does not exist, insert it
        $post_id = wp_insert_post($post_data);
        $action = "Inserted new post: ";
    }

    if (!$post_id || is_wp_error($post_id)) {
        echo "Failed to save post: " . esc_html($post_data['post_title']) . "<br>";
        return;
    }
    // The title is scraped text, so escape it before printing it as HTML.
    echo $action . esc_html($post_data['post_title']) . " (ID: $post_id)<br>";

    // Extract image URL from JSON-LD script; "image" may be an object with a
    // "url" member or a plain URL string.
    $imageURL = '';
    foreach ($xpath->query(".//script[@type='application/ld+json']", $post) as $scriptNode) {
        $jsonData = json_decode($scriptNode->nodeValue, true);
        if (!is_array($jsonData) || !isset($jsonData['image'])) {
            continue;
        }
        $image = $jsonData['image'];
        if (is_array($image) && isset($image['url']) && is_string($image['url'])) {
            $imageURL = $image['url'];
            break;
        }
        if (is_string($image) && $image !== '') {
            $imageURL = $image;
            break;
        }
    }

    // Set featured image
    set_featured_image($post_id, $imageURL);
}
// Entry point: start the import at the blog's front page. fetch_posts()
// continues on its own to every page reachable through the "More posts" link.
fetch_posts('https://coastregionpressclub.blogspot.com/');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment