Skip to content

Instantly share code, notes, and snippets.

@robzienert
Created January 5, 2010 21:16
Show Gist options
  • Save robzienert/269740 to your computer and use it in GitHub Desktop.
Save robzienert/269740 to your computer and use it in GitHub Desktop.
<?php
// This file was used to filter out tons of unwanted crap from WordPress MU's
// export XML file. Certainly not glamorous, but it worked flawlessly, trimming
// out all of our company pages and spam comments and leaving just our blog.
// This made our migration from WordPress to Drupal substantially easier.
//
// http://purplerockscissors.com
//
// Paths of the WordPress export to read and the trimmed copy to write; both
// live next to this script.
$inFile = dirname(__FILE__) . '/main.wordpress.xml';
$outFile = dirname(__FILE__) . '/main.wordpress.min.xml';

// Only the input has to exist up front. DOMDocument::save() creates the output
// file, so requiring it beforehand would make a first run fail for no reason.
if (!is_file($inFile) || !is_readable($inFile)) {
    throw new Exception('The export file does not exist or is not readable: ' . $inFile);
}

// Load the WordPress export. Whitespace-only text nodes are dropped so that
// formatOutput can re-indent the document cleanly when it is saved.
$dom = new DOMDocument();
$dom->formatOutput = true;
$dom->preserveWhiteSpace = false;
if (!$dom->load($inFile)) {
    throw new Exception('Unable to parse the export file: ' . $inFile);
}

// Register the namespace prefixes used by the WordPress export so they can be
// referenced from XPath expressions.
$xpath = new DOMXPath($dom);
$xpath->registerNamespace('excerpt', 'http://wordpress.org/export/1.0/excerpt/');
$xpath->registerNamespace('content', 'http://purl.org/rss/1.0/modules/content/');
$xpath->registerNamespace('wfw', 'http://wellformedweb.org/CommentAPI/');
$xpath->registerNamespace('dc', 'http://purl.org/dc/elements/1.1/');
$xpath->registerNamespace('wp', 'http://wordpress.org/export/1.0/');

// Nodes are queued here and removed in a separate pass so the document is not
// mutated while the query results are still being walked.
$removeNodes = array();

// Visit every item through its wp:post_type tag.
$result = $xpath->query('//channel/item/wp:post_type');
foreach ($result as $element) {
    $item = $element->parentNode;

    // Anything that is not a "post" (pages, attachments, ...) is dropped whole.
    if ($element->nodeValue !== 'post') {
        $removeNodes[] = $item;
        continue;
    }

    // For posts, look only at this item's own comments. The query is relative
    // to $item; an absolute path here would rescan every comment in the
    // document once per post and queue the same spam comments repeatedly.
    $comments = $xpath->query('wp:comment/wp:comment_approved', $item);
    foreach ($comments as $comment) {
        // Queue the enclosing wp:comment element when it is flagged as spam.
        if ($comment->nodeValue === 'spam') {
            $removeNodes[] = $comment->parentNode;
        }
    }
}

// Remove all queued nodes.
foreach ($removeNodes as $node) {
    $parent = $node->parentNode;
    // No parent means this node has already been detached from the tree
    // (for example, it was queued more than once), so there is nothing to do.
    if (!is_object($parent)) {
        continue;
    }
    $parent->removeChild($node);
}

// Save the filtered document; save() returns false when the write fails.
if ($dom->save($outFile) === false) {
    throw new Exception('Unable to write the filtered export to: ' . $outFile);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment