Created
January 5, 2010 21:16
-
-
Save robzienert/269740 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Filters a WordPressMu export XML file down to just the blog content:
//   * every <item> whose <wp:post_type> is not "post" (pages, attachments, ...)
//     is dropped entirely;
//   * inside the remaining posts, every <wp:comment> whose
//     <wp:comment_approved> is "spam" is dropped.
// Certainly not glamorous, but it made our migration from WordPress to Drupal
// substantially easier.
//
// Reads  main.wordpress.xml     (must exist next to this script)
// Writes main.wordpress.min.xml (created or overwritten next to this script)
//
// Throws Exception when the input is missing/unreadable, cannot be parsed, or
// when the result cannot be written.
//
// http://purplerockscissors.com
//
$inFile  = dirname(__FILE__) . '/main.wordpress.xml';
$outFile = dirname(__FILE__) . '/main.wordpress.min.xml';

// Only the input has to exist up front; the output file is produced by save()
// at the end of the script, so requiring it here would break a fresh run.
if (!is_file($inFile) || !is_readable($inFile)) {
    throw new Exception('The export file does not exist or is not readable: ' . $inFile);
}

// Setup the current DOMDocument from WordPress.
// preserveWhiteSpace has to be set before load() to take effect.
$dom = new DOMDocument();
$dom->formatOutput = true;
$dom->preserveWhiteSpace = false;
if ($dom->load($inFile) === false) {
    throw new Exception('The export file could not be parsed as XML: ' . $inFile);
}

// Setup the XPath namespaces that are in the WordPress document
$xpath = new DOMXPath($dom);
$xpath->registerNamespace('excerpt', 'http://wordpress.org/export/1.0/excerpt/');
$xpath->registerNamespace('content', 'http://purl.org/rss/1.0/modules/content/');
$xpath->registerNamespace('wfw', 'http://wellformedweb.org/CommentAPI/');
$xpath->registerNamespace('dc', 'http://purl.org/dc/elements/1.1/');
$xpath->registerNamespace('wp', 'http://wordpress.org/export/1.0/');

// Nodes are queued first and removed afterwards so the node lists returned by
// the XPath queries are not modified while they are being iterated.
$removeNodes = array();

// Look for all items; classify each one by its wp:post_type tag
$postTypes = $xpath->query('//channel/item/wp:post_type');
foreach ($postTypes as $postType) {
    $item = $postType->parentNode;

    // Anything that is not a "post" is removed as a whole
    if ($postType->nodeValue !== 'post') {
        $removeNodes[] = $item;
        continue;
    }

    // For posts, inspect only the comments of THIS item. The query is relative
    // to $item, so each comment is visited once instead of once per post.
    $approvals = $xpath->query('wp:comment/wp:comment_approved', $item);
    foreach ($approvals as $approval) {
        // If a comment is marked as spam, queue the whole <wp:comment> element
        if ($approval->nodeValue === 'spam') {
            $removeNodes[] = $approval->parentNode;
        }
    }
}

// Remove all queued nodes
foreach ($removeNodes as $node) {
    $parent = $node->parentNode;
    // A node without a parent has already been detached from the document
    if ($parent === null) {
        continue;
    }
    $parent->removeChild($node);
}

// Save the new file; save() returns false when nothing could be written
if ($dom->save($outFile) === false) {
    throw new Exception('The filtered export could not be written to: ' . $outFile);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment