Created
November 3, 2012 08:30
-
-
Save andrewharvey/4006585 to your computer and use it in GitHub Desktop.
WordPress eXtended RSS file to Chronicle converter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
=head1 NAME | |
wordpress-export2chronicle - WordPress eXtended RSS file to Chronicle converter. | |
=cut | |
=head1 SYNOPSIS | |
wordpress-export2chronicle [options] <wordpress_export.xml> </path/to/new/blog_contents> | |
Options: | |
--postname Use the postname as the Chronicle post filename (otherwise will use the post_id). | |
--drafts Include drafts (they will be placed in a separate directory). | |
--categories-as-tags Treat categories as tags. | |
--download-attachments Download attachments. | |
--rewrite-attachments Rewrite all links to attachments as local resources.
For a fully featured conversion, it is recommended to use all the above options. | |
=cut | |
=head1 ABOUT | |
This script will convert a WordPress eXtended RSS file generated by WordPress into a Chronicle blog. | |
Although not yet a rock-solid, fully featured converter, it will certainly suffice for a basic translation
process.
Current support includes: | |
* Blog configuration and metadata conversion | |
* Post content tags and title conversion | |
* Downloading and rewriting of attachments
Future features could include: | |
* Better conversion of the HTML content provided by WordPress (it is currently interpreted as Markdown syntax,
which seems to format the paragraphs mostly correctly, yet can still recognise HTML elements like links)
* Comment conversion | |
* Non standard elements converted to HTML (like the [caption] syntax) | |
=cut | |
=head1 LICENSE | |
This script is licensed CC0 by Andrew Harvey <andrew.harvey4@gmail.com> | |
To the extent possible under law, the person who associated CC0 | |
with this work has waived all copyright and related or neighboring | |
rights to this work. | |
http://creativecommons.org/publicdomain/zero/1.0/ | |
=cut | |
use warnings; | |
use strict; | |
use feature qw(say); | |
use XML::Simple; | |
use Log::Log4perl qw(:easy); | |
use Getopt::Long qw(:config auto_help); | |
use URI; | |
use File::Path qw(make_path); | |
use LWP::UserAgent; | |
# Use Log4perl and set verbosity to INFO. Options are DEBUG, INFO, WARN, ERROR, FATAL.
Log::Log4perl->easy_init($INFO);

# Command-line switches. Each one stays undef unless its flag is given.
my $use_post_name_as_file_name;
my $include_drafts;
my $treat_categories_as_tags;
my $download_attachments;
my $rewrite_attachments;

# GetOptions removes the switches it recognises from @ARGV and returns false
# when it meets one it cannot parse (it has then already printed a warning).
my $options = GetOptions(
    "postname"             => \$use_post_name_as_file_name,
    "drafts"               => \$include_drafts,
    "categories-as-tags"   => \$treat_categories_as_tags,
    "download-attachments" => \$download_attachments,
    "rewrite-attachments"  => \$rewrite_attachments,
);

# Once the options are filtered out, exactly two positional arguments must
# remain. Check this before reading them, and fail with a non-zero status so
# that calling scripts can detect the misuse.
if (!$options || @ARGV != 2) {
    say STDERR "Usage: $0 [--postname] [--drafts] <wordpress_export.xml> </path/to/new/blog_contents>";
    exit 1;
}

my $src_xml = $ARGV[0];
my $dst_dir = $ARGV[1];

# Nothing can be written without the destination directory, so stop here
# instead of failing later with a less helpful message.
if (! -d $dst_dir) {
    die "Can't open $dst_dir. Are you sure it exists?\n";
}

# The HTTP client is only needed (and only created) when downloading.
my $ua;
if ($download_attachments) {
    $ua = LWP::UserAgent->new;
}
# Read in the WordPress export XML. Only <category> is forced into an array;
# every other repeated element becomes an array ref only when it actually
# occurs more than once.
my $wpxml = XMLin($src_xml, ForceArray => ['category'])
    or die "Could not read WordPress export $src_xml\n";
my $channel = $wpxml->{'channel'};

# The channel may hold one <wp:author>, several (an array ref) or none at
# all; take the e-mail address of the first author when there is one.
my $first_author = $channel->{'wp:author'};
$first_author = $first_author->[0] if ref($first_author) eq 'ARRAY';
my $author_email = ref($first_author) eq 'HASH' ? $first_author->{'wp:author_email'} : undef;

# XML::Simple represents an empty element as an empty hash ref. Reduce
# anything that is not a plain defined string to '' so the configuration
# never contains "HASH(0x...)" and no uninitialized warnings are raised.
my ($blog_title, $blog_subtitle, $base_blog_url, $author) =
    map { ( defined($_) && !ref($_) ) ? $_ : '' }
    ( $channel->{'title'}, $channel->{'description'}, $channel->{'wp:base_blog_url'}, $author_email );

# Open the Chronicle configuration file for writing. The values are character
# strings, so encode them as UTF-8 just like the post files.
my $config_path = "$dst_dir/config.txt";
open my $config, '>:encoding(UTF-8)', $config_path
    or die "Can't write $config_path: $!\n";

# write out global Chronicle configuration
say $config "blog_title = " . $blog_title;
say $config "blog_subtitle = " . $blog_subtitle;
say $config "url_prefix = " . $base_blog_url . "/";
say $config "author = " . $author;
say $config "input = publish";

# The content pulled from the WordPress XML export is neither HTML nor
# Markdown, but at least Markdown is the closest and produces okay results.
say $config "format = markdown";
say $config "pattern = *.md";

# Buffered write errors only surface at close, so check it.
close $config or die "Can't finish writing $config_path: $!\n";
say "Chronicle config file written to $dst_dir/config.txt. You may want to tweak it.";

# Make a publish (and optionally a draft) directory within the content
# directory for posts. An already existing directory is fine, which allows
# the script to be run again on the same destination.
for my $status_dir ( "publish", ( $include_drafts ? "draft" : () ) ) {
    my $dir = "$dst_dir/$status_dir";
    next if -d $dir;
    mkdir $dir or die "Can't create $dir: $!\n";
}
# hash of attachment_url's to local resource
my %attachment_rewrites;

# A channel holding a single <item> gives a hash ref rather than an array ref
# (only 'category' is forced into an array), so normalise to a plain list.
my $raw_items = $wpxml->{'channel'}->{'item'};
my @items = ref($raw_items) eq 'ARRAY' ? @{$raw_items}
          : defined $raw_items         ? ($raw_items)
          :                              ();

# XML::Simple represents an empty element as an empty hash ref; turn anything
# that is not a plain defined string into '' before it is printed.
my $as_text = sub {
    my ($value) = @_;
    return ( defined $value && !ref $value ) ? $value : '';
};

# Pass 1: attachments. They are handled before any post is written so that
# link rewriting no longer depends on the order of items in the export.
if ($download_attachments || $rewrite_attachments) {
    ATTACHMENT:
    for my $item (@items) {
        next ATTACHMENT unless $as_text->( $item->{'wp:post_type'} ) eq "attachment";

        my $attachment_url = $as_text->( $item->{'wp:attachment_url'} );
        if ($attachment_url eq '') {
            WARN "Attachment item without an attachment URL, ignoring it";
            next ATTACHMENT;
        }

        # Use URI for parsing the URL. For an absolute path the first path
        # segment is the empty string, so the join below starts with a '/'
        # right after "attachments".
        my @relative_dirs = URI->new($attachment_url)->path_segments;
        my $file_name     = pop @relative_dirs; # take the file_name off the list
        my $local_path    = "attachments" . join('/', @relative_dirs);

        if ($download_attachments) {
            my $target_file = "$dst_dir/$local_path/$file_name";
            make_path("$dst_dir/$local_path");
            say "GET $attachment_url into $target_file";
            my $response = $ua->get($attachment_url, ':content_file' => $target_file);
            if (!$response->is_success) {
                WARN "Failed to download $attachment_url: " . $response->status_line;
            }
        }

        if ($rewrite_attachments) {
            $attachment_rewrites{$attachment_url} = "/$local_path/$file_name";
        }
    }
}

# Pass 2: posts.
POST:
for my $item (@items) {
    # sort out by post type
    next POST unless $as_text->( $item->{'wp:post_type'} ) eq "post";

    # sort out by status
    my $status = $as_text->( $item->{'wp:status'} );
    unless ( $status eq "publish" || ( $include_drafts && $status eq "draft" ) ) {
        INFO "Ignoring post with status: " . $status;
        next POST;
    }

    # Use the post_id as the post file name, unless we specifically want to
    # use the post name (the last path component of the permalink).
    my $post_file_name = $item->{'wp:post_id'};
    if ($use_post_name_as_file_name) {
        my $link = $as_text->( $item->{'link'} );
        if ( $link =~ m{/([^/]+)/$} ) {
            $post_file_name = $1;
        }
        elsif ( $status eq "publish" ) {
            # Just a warning because we fall back to using the post_id; only
            # warn for published posts because otherwise this is expected.
            WARN "Can't extract a post name from the link " . $link
                . ", using the post_id as file name instead";
        }
    }

    # Split the <category> elements into tags and categories by their domain.
    my @tags;
    my @categories;
    for my $category ( @{ $item->{'category'} // [] } ) {
        # A <category> element without attributes arrives as a plain string
        # and carries no domain to classify it by.
        next unless ref($category) eq 'HASH';

        my $domain = $as_text->( $category->{'domain'} );
        my $name   = $as_text->( $category->{'content'} );
        if ( $domain eq "category" ) {
            push @categories, $name if $name ne '';
        }
        elsif ( $domain eq "post_tag" ) {
            push @tags, $name if $name ne '';
        }
        else {
            WARN( "Unknown category domain: " . $domain );
        }
    }
    if ($treat_categories_as_tags) {
        push @tags, @categories;
    }

    my $post_file_path = "$dst_dir/$status/$post_file_name.md";
    if ( -e $post_file_path ) {
        WARN "$post_file_path already exists, not going to overwrite it.";
        next POST;
    }

    # Without a file handle there is nothing to write to, so skip the post.
    my $post_fh;
    if ( !open( $post_fh, '>:encoding(UTF-8)', $post_file_path ) ) {
        ERROR "Can't write $post_file_path: $!";
        next POST;
    }

    print {$post_fh} "Title: " . $as_text->( $item->{'title'} ) . "\n";
    print {$post_fh} "Date: " . $as_text->( $item->{'wp:post_date'} ) . "\n";
    print {$post_fh} "Tags: " . join( ',', @tags ) . "\n";
    print {$post_fh} "\n"; # blank line to separate headers and content

    my $content = $as_text->( $item->{'content:encoded'} );
    if ($rewrite_attachments) {
        for my $attachment_url ( keys %attachment_rewrites ) {
            my $local_url = $attachment_rewrites{$attachment_url};

            # \Q...\E makes the URL match literally ('.' and '?' are regex
            # metacharacters).
            $content =~ s{href="\Q$attachment_url\E"}{href="$local_url"}g;

            # An optional query string is kept. [^"]* cannot run past the
            # closing quote, and // '' covers the case where the optional
            # capture group did not take part in the match.
            $content =~ s{src="\Q$attachment_url\E(?<arg>\?[^"]*)?"}
                         { 'src="' . $local_url . ( $+{arg} // '' ) . '"' }ge;
        }
    }
    print {$post_fh} $content;

    # Buffered write errors only surface at close.
    close $post_fh or ERROR "Can't finish writing $post_file_path: $!";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment