Skip to content

Instantly share code, notes, and snippets.

@gubatron
Created December 13, 2012 02:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gubatron/4273601 to your computer and use it in GitHub Desktop.
Save gubatron/4273601 to your computer and use it in GitHub Desktop.
Console PHP Script to process thousands of Wordpress posts. Good examples of how to process posts using the wordpress api without exhausting the interpreter's memory. (turns off the default caching of the api) Examples of updating and adding post metadata (custom fields), setting the featured image of the post (working with galleries), and simpl…
<?php
/**
* Console PHP Script to process thousands of Wordpress posts.
*
* You can use this script as an example on how to:
* > extract urls and put them in the posts metadata.
* > download images referenced by the HTML out on flickr into the post
* > massaging the post HTML
*
* The script makes use of the wordpress api, in order to do this it turns off the
* default php caching that would otherwise eat all your memory as you load posts into memory.
* We make sure no posts are cached so we can process thousands of posts with very little memory.
*
* The wordpress api is one big mess full of bad practices that go against all the good
* principles of computer science and decent programming. Shame on you wordpress team.
*
* Gubatron.
* December 2012.
*/
error_reporting(E_ALL);
define('FIXED_DOUBLE_DOWNLOAD_URL', 1);
define('HAS_NO_DOWNLOAD_URL',2);
define('NO_FIX_NECESSARY',3);
define('IMAGE_FOLDER','/var/www/staging.frostclick.com/wp/wp-content/uploads/2012/12');
function getPostIds($mysqli,$tp) {
$q = "select ID from ". $tp ."posts where post_status='publish' order by post_date desc";
return mysqli_query($mysqli,$q);
}
function processPosts($post_ids) {
include_once('/var/www/staging.frostclick.com/wp/wp-load.php');
include_once('/var/www/staging.frostclick.com/wp/wp-admin/includes/media.php');
include_once('/var/www/staging.frostclick.com/wp/wp-admin/includes/file.php');
include_once('/var/www/staging.frostclick.com/wp/wp-admin/includes/image.php');
//turn off wordpress' default memory caching
//to avoid out of memory issues.
wp_suspend_cache_addition(true);
wp_suspend_cache_invalidation(true);
$numTotal = 0;
$numWithDownload = 0;
$numWithFeaturedImage = 0;
$numOutside = 0;
$postsWithNoDownloadField = array();
$numNoFeaturedImage = 0;
$numPostsEdited = 0;
while ($row = mysqli_fetch_row($post_ids)) {
$post = get_post($row[0],'OBJECT');
if (!$post || skipNonPost($post)) {
continue;
}
$numTotal++;
$custom = fixPost($post,$numWithDownload,$numWithFeaturedImage,$numNoFeaturedImage,$numOutside,$numPostsEdited);
if (!isset($custom['download']) || strlen($custom['download'][0]) === 0) {
echo "[".$custom['download'][0]."]!\n";
$postsWithNoDownloadField[] = "http://staging.frostclick.com/wp/wp-admin/post.php?post=".$post->ID."&action=edit&message=1";
}
}
echo "\n";
echo "Total Posts: $numTotal\n";
echo "Posts with 'download' field: $numWithDownload\n";
echo "Posts with featured image: $numWithFeaturedImage\n";
echo "Posts with Flickr image: $numOutside\n";
echo "Posts with HTML cleaned: $numPostsEdited\n";
echo count($postsWithNoDownloadField) . " posts with no 'download' field:\n";
foreach ($postsWithNoDownloadField as $missingURL) {
echo $missingURL . "\n";
}
}
function processPost($postId) {
include('/var/www/staging.frostclick.com/wp/wp-load.php');
wp_suspend_cache_addition(true);
wp_suspend_cache_invalidation(true);
$post = get_post($postId,'OBJECT');
fixPost($post,$numDownloads=0,$numWithFeaturedImage=0,$numOutside=0,$numNoFeaturedImage=0,$numPostsEdited=0);
}
function fixedDuppedDownloads($post,&$numWithDownload) {
$post_id = $post->ID;
$custom = get_post_custom($post_id);
if ($custom !== NULL && isset($custom['download'])) {
$numWithDownload++;
if (count($custom['download']) > 1) {
//echo "Fixing double download for ($post_id) [".$custom['download'][0] . "] [$post->post_title]\n";
delete_post_meta($post_id,'download');
add_post_meta($post_id,'download',$custom['download'][0]);
return array(FIXED_DOUBLE_DOWNLOAD_URL,get_post_custom($post_id));
} else if (stripos($custom['download'][0],'http')==0) {
return array(NO_FIX_NECESSARY,$custom);
}
}
return array(HAS_NO_DOWNLOAD_URL,-1);
}
function skipNonPost($post) {
return ($post->post_type==='gallery' ||
$post->post_type==='category' ||
$post->post_type==='nav_menu_item'||
$post->post_type==='page');
}
function fixPost($post,
&$numWithDownload,
&$numWithFeaturedImage,
&$numNoFeaturedImage,
&$numOutside,
&$numPostsEdited) {
$custom = fixDownloadURL($post,$numWithDownload);
removeDownloadAlbumTag($post);
fixFeaturedImage($post,$custom,$numWithFeaturedImage,$numOutside);
removeDownloadAlbumTags($post,$numPostsEdited);
return $custom;
} //fixPost
/** Returns the $posts custom meta array, or null if it couldn't extract an URL */
function fixDownloadURL($post,&$numWithDownload) {
$dupResult = fixedDuppedDownloads($post,$numWithDownload);
if ($dupResult[0]==HAS_NO_DOWNLOAD_URL) {
$downloadURL = extractDownloadURL(trim($post->post_content));
if ($downloadURL !== NULL) {
echo "(".$post->ID.") [$downloadURL]\n\n";
add_post_meta($post->ID,'download',$downloadURL);
$numWithDownload++;
return get_post_custom($post->ID);
} else {
echo "could not extract URL from post (".$post->ID."; type:".$post->post_type.")\n\n";
return null;
}
} else {
return $dupResult[1];
}
}
function extractDownloadURL($html) {
if (!$html || strlen($html)===0) {
return NULL;
}
$result = NULL;
$matches = array();
if (preg_match('<a class="torrent"(.*)href="(.*?)"(.*)>',$html,$matches)) {
$result = $matches[2];
} else if (preg_match('<a href="(.*?)"(.*)class="torrent"(.*)>',$html,$matches)) {
$result = $matches[1];
}
return ($result == NULL) ? NULL : trim($result);
} //extractDownloadURL
function fixFeaturedImage($post,$custom,&$numWithFeaturedImage,&$numOutside) {
//check if you already have featured image.
$post_id = $post->ID;
if ($custom !== NULL &&
isset($custom['_thumbnail_id']) &&
isset($custom['_thumbnail_id'][0]) &&
$custom['_thumbnail_id'][0]!=='0' &&
!empty($custom['_thumbnail_id'][0])) {
$numWithFeaturedImage++;
echo "Featured image is ok! (skipping fixFeatureImage)\n ";
return false;
}
$imageURL = extractImageURL($post->post_content);
if ($imageURL !== NULL) {
echo "Found image URL ($post_id): $imageURL\n";
$numWithFeaturedImage++;
//check if we already have an image in the gallery, this will be the featured image.
$children_parameters = array('post_parent' => $post_id,
'numberposts' => 1,
'post_mime_type'=> 'image',
'post_type' => 'attachment');
$gallery = get_children($children_parameters);
if(count($gallery)==0 && strpos($imageURL,'frostclick.com') == false) {
if (alreadyDownloaded($imageURL)) {
echo "\nAlready downloaded the image but got no gallery. (post: $post_id)\n";
echo "Getting attachment ID: \n";
} else if (downloadImageFromOutside($post, $imageURL, $numOutside)) {
$gallery = get_children($children_parameters);
echo "Downloaded image. Do we have a gallery? (".sizeof($gallery).")\n";
} else {
echo "\nERROR: Could not download $imageURL for post $post_id. No gallery.\n\n";
echo "====================================================================\n\n";
$gallery = array();
return false;
}
} else if (count($gallery) > 0) {
echo "Had a gallery already.\n";
}
$id = getAttachmentIDFromRemoteImageURL($imageURL);
$result = false;
if ($id !== 0) {
delete_post_meta($post_id,'_thumbnail_id');
add_post_meta($post_id,'_thumbnail_id', $id,true);
$result = true; //sorry dijkstra for the multiple returns.
} else if (count($gallery) > 0) {
foreach ($gallery as $attachment) {
if ($attachment->ID !== 0) {
delete_post_meta($post_id,'_thumbnail_id');
add_post_meta($post_id,'_thumbnail_id', $attachment->ID,true);
$result = true;
break;
}
}
} else {
echo "\nERROR: post $post_id had no image in the gallery, could not set featured image.\n\n.";
}
return $result;
}
}
function getAttachmentIDFromRemoteImageURL($image_src) {
global $wpdb;
$fileName = getFileNameFromURL($image_src);
$query = "SELECT post_id FROM {$wpdb->postmeta} WHERE meta_value LIKE '%$fileName' limit 1";
echo $query . "\n\n";
$post_id = $wpdb->get_var($query);
return intval($post_id);
}
function getFileNameFromURL($imageURL) {
return substr($imageURL,strrpos($imageURL,'/')+1);
}
function alreadyDownloaded($imageURL) {
return file_exists(IMAGE_FOLDER . '/' . getFileNameFromURL($imageURL));
}
/**
* downloads the imageURL and attaches it to the post.
*/
function downloadImageFromOutside($post, $imageURL, &$numOutside) {
//short circuit
echo "Downloading image from $imageURL...($post->ID : $post->post_title)\n";
$uploadResult = media_sideload_image(trim($imageURL),$post->ID,$post->post_title);
if (gettype($uploadResult)!=="string") {
return false;
}
echo $uploadResult . "\n\n"; //looks like an <img> tag.
return true;
}
function extractImageURL($html) {
if (!$html || strlen($html)===0) {
return NULL;
}
$result = NULL;
$matches = array();
//<img src="http://www.frostclick.com/wp/wp-content/uploads/2012/03/Kylep.jpg" alt="" title="Kylep" width="200" height="200" class="alignleft size-full wp-image-50647" />
if (preg_match('<img(.*)src="(.*?)"(.*)width="200" height="200"(.*)>',$html,$matches)) {
$result = $matches[2];
} else if (preg_match('<img(.*)src="(.*?)"(.*)height="200" width="200"(.*)>',$html,$matches)) {
$result = $matches[2];
} else if (preg_match('<img(.*)src="(.*?)"(.*)>',$html,$matches)) {
$result = $matches[2];
}
return $result;
}
function removeDownloadAlbumTags($post,&$numPostsEdited,$mysqli) {
/**
<a class="torrent" rel="nofollow" target="_blank" href="http://singingintheabbey.bandcamp.com/album/winter-demo-2011" >Download Album</a>
<a class="torrent" rel="nofollow" href="http://singingintheabbey.bandcamp.com/album/winter-demo-2011" target="_blank"><img src="http://www.frostclick.com/wp/wp-content/uploads/2011/04/1046820023-1-200x200.jpg" alt="" title="1046820023-1" width="200" height="200" class="alignleft size-thumbnail wp-image-25479" /></a>
*/
$matches = array();
preg_match_all('(<a.*class="torrent".*>(Download.*|<img .*/>)</a>)',$post->post_content,$matches);
if ($matches > 0) {
//echo "MATCHES:\n";
//var_dump($matches);
$subMatches = $matches[0];
//echo "\nSUB_MATCHES:\n";
//var_dump($subMatches);
//$maxIterations = 0;
foreach ($subMatches as $match) {
$output = str_replace($match,"",$post->post_content);
$post->post_content = $output;
//$maxIterations++;
//if ($maxIterations == 2) {
// break;
//}
}
//echo $post->post_content . "\n\n";
$updateSQL = "update wp2_posts set post_content = ? where ID = $post->ID limit 1";
$statement = $mysqli->prepare($updateSQL);
$statement->bind_param('s',$post->post_content);
if (!$statement->execute()) {
echo "! [$updateSQL]\n";
}
//echo "\n$updateSQL\n";
//var_dump($statement);
$statement->close();
$numPostsEdited++;
echo "Fixed $post->ID\n";
} else {
echo "No matches on post $post->ID\n";
}
}
function main() {
print "Starting post-fixer.php...\n";
//TABLE_PREFIX
$tp="wp2_";
$mysqli = new mysqli("localhost","username","password","wordpress_dbname") or die("No connection to mysql");
if ($mysqli->connect_errno) {
echo "Failed to connect to MySQL: " . $mysqli->connect_error;
exit(1);
}
$post_ids = getPostIds($mysqli,$tp);
processPosts($post_ids);
//processPost(25478);
print "Ended post-fixer.php.\n";
}
main();
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment