Last active
March 31, 2016 03:00
-
-
Save karlgray/c3ab17615b3c0f712cb4144a4734c25b to your computer and use it in GitHub Desktop.
Quick hack to extract wordpress posts from xml to separate html files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/php -q | |
<?php | |
/* Really nasty, really fast hack to extract all wordpress articles to .html files. | |
* Please test before using. It worked for one particular site and saved time copying and pasting. | |
* There is no guarantee it will work for you. | |
* | |
* Note: this is a PHP CLI script; don't run it from a browser. | |
* | |
* I then used this to convert them to markdown files. | |
* find ./ -iname "*.html" -type f -exec sh -c 'pandoc "${0}" -o "./markdown/$(basename ${0%.html}.md)"' {} \; | |
* | |
*/ | |
/**
 * Turn a post title into a lower-case base name (without extension) that is
 * safe to use as a file name.
 *
 * @param string $title Post title taken from the export.
 * @return string Sanitised base name; may be empty if nothing usable is left.
 */
function wp_title_to_basename($title)
{
    # This section simply removes any part of the title that would make for a bad filename.
    # Edit this section to suit yourself.
    # str_replace applies array entries in order, so this equals eight separate calls.
    $list_prefixes = array("a) ", "b) ", "c) ", "d) ", "e) ", "f) ", "g) ", "h) ");
    $name = str_replace($list_prefixes, "", $title);

    # Replace / with - (for example /categoryname/postname to -categoryname-postname),
    # then spaces with - as well.
    $name = str_replace(array("/", " "), "-", $name);

    # Remove special chars from the filename ("/" is already gone at this point).
    $special_chars = array("(", ")", ",", ";", ":", "'", ".");
    $name = str_replace($special_chars, "", $name);

    # I had a couple of files with --- in them.
    $name = str_replace("---", "-", $name);

    # I like my filenames all lower case.
    return strtolower($name);
}

# Export file to read: the first CLI argument if given, otherwise the original default.
$export_file = isset($argv[1]) ? $argv[1] : "mycms.wordpress.2016-01-27.xml";

$export_xml = file_get_contents($export_file);
if ($export_xml === false) {
    fwrite(STDERR, "cannot read $export_file\n");
    exit(1);
}

$xml = simplexml_load_string($export_xml);
if ($xml === false) {
    fwrite(STDERR, "cannot parse $export_file as XML\n");
    exit(1);
}

foreach ($xml->channel->item as $item) {
    $title = (string) $item->title;
    $basename = wp_title_to_basename($title);

    # An empty base name would produce a hidden file called ".html"; skip it instead.
    if ($basename === "") {
        fwrite(STDERR, "skipping item with unusable title '$title'\n");
        continue;
    }

    # The post body lives in <content:encoded>, so look it up by namespace prefix.
    $content = (string) $item->children("content", true)->encoded;

    # Create and write file. Items whose titles sanitise to the same name overwrite each other.
    $output_file = $basename . ".html";
    echo "writing $output_file\n";
    if (file_put_contents($output_file, $content) === false) {
        fwrite(STDERR, "failed to write $output_file\n");
    }
}
?> | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment