Skip to content

Instantly share code, notes, and snippets.

@andrewharvey
Created November 3, 2012 08:30
Show Gist options
  • Save andrewharvey/4006585 to your computer and use it in GitHub Desktop.
Save andrewharvey/4006585 to your computer and use it in GitHub Desktop.
WordPress eXtended RSS file to Chronicle converter
#!/usr/bin/perl -w
=head1 NAME
wordpress-export2chronicle - WordPress eXtended RSS file to Chronicle converter.
=cut
=head1 SYNOPSIS
wordpress-export2chronicle [options] <wordpress_export.xml> </path/to/new/blog_contents>
Options:
--postname Use the postname as the Chronicle post filename (otherwise will use the post_id).
--drafts Include drafts (they will be placed in a separate directory).
--categories-as-tags Treat categories as tags.
--download-attachments Download attachments.
--rewrite-attachments Rewrite all links to attachments as local resources.
For a fully featured conversion, it is recommended to use all the above options.
=cut
=head1 ABOUT
This script will convert a WordPress eXtended RSS file generated by WordPress into a Chronicle blog.
Although not yet a rock-solid, fully featured converter, it will certainly suffice for a basic translation
process.
Current support includes:
* Blog configuration and metadata conversion
* Post content tags and title conversion
* Downloading and rewriting of attachments
Future features could include:
* Better conversion of the HTML content provided by WordPress (it is currently interpreted as Markdown syntax,
which seems to format the paragraphs mostly correctly, yet can still recognise HTML elements like links)
* Comment conversion
* Non standard elements converted to HTML (like the [caption] syntax)
=cut
=head1 LICENSE
This script is licensed CC0 by Andrew Harvey <andrew.harvey4@gmail.com>
To the extent possible under law, the person who associated CC0
with this work has waived all copyright and related or neighboring
rights to this work.
http://creativecommons.org/publicdomain/zero/1.0/
=cut
use warnings;
use strict;
use feature qw(say);
use XML::Simple;
use Log::Log4perl qw(:easy);
use Getopt::Long qw(:config auto_help);
use URI;
use File::Path qw(make_path);
use LWP::UserAgent;
# Initialise Log4perl at INFO verbosity. Options are DEBUG, INFO, WARN, ERROR, FATAL.
Log::Log4perl->easy_init($INFO);

# Command line switches; each one stays undef unless the option is given.
my $use_post_name_as_file_name;
my $include_drafts;
my $treat_categories_as_tags;
my $download_attachments;
my $rewrite_attachments;

# GetOptions strips the options it recognises out of @ARGV and returns false
# when it meets an unknown or malformed option (it prints its own warning).
my $options = GetOptions(
    "postname"             => \$use_post_name_as_file_name,
    "drafts"               => \$include_drafts,
    "categories-as-tags"   => \$treat_categories_as_tags,
    "download-attachments" => \$download_attachments,
    "rewrite-attachments"  => \$rewrite_attachments,
);

# What is left in @ARGV are the two positional arguments.
my $src_xml = $ARGV[0];
my $dst_dir = $ARGV[1];

# Refuse to run on a bad option or the wrong number of positional arguments.
# Exit non-zero so callers can tell that nothing was converted.
if ( !$options || @ARGV != 2 ) {
    say "Usage: $0 [--postname] [--drafts] <wordpress_export.xml> </path/to/new/blog_contents>";
    exit 1;
}

# The destination must already exist; stop here rather than failing later
# when the configuration file cannot be created inside it.
if ( !-d $dst_dir ) {
    say "Can't open $dst_dir. Are you sure it exists?";
    exit 1;
}

# An HTTP client is only needed when attachments are going to be fetched.
my $ua;
if ($download_attachments) {
    $ua = LWP::UserAgent->new;
}
# Read in the WordPress export XML.
#  * ForceArray on 'item' keeps channel->item an array ref even when the
#    export contains a single item (otherwise XML::Simple returns a hash ref
#    and iterating over the items dies).
#  * SuppressEmpty => '' makes empty elements parse as '' instead of an empty
#    hash ref, which would otherwise be printed as "HASH(0x...)".
my $wpxml = XMLin(
    $src_xml,
    ForceArray    => [ 'category', 'item' ],
    SuppressEmpty => '',
) or die "Can't parse $src_xml\n";

my $channel = $wpxml->{'channel'};

# An export with several <wp:author> elements yields an array ref here;
# use the first author for the blog-wide setting.
my $author = $channel->{'wp:author'};
$author = $author->[0] if ref $author eq 'ARRAY';
my $author_email = ref $author eq 'HASH' ? $author->{'wp:author_email'} : undef;

# Open the Chronicle configuration file for writing. The values come from
# the parsed XML as character strings, so encode them explicitly as UTF-8.
my $config_path = "$dst_dir/config.txt";
open my $config, '>:encoding(UTF-8)', $config_path
    or die "Can't write $config_path: $!\n";

# Write out the global Chronicle configuration.
say {$config} "blog_title = " .    ( $channel->{'title'}            // '' );
say {$config} "blog_subtitle = " . ( $channel->{'description'}      // '' );
say {$config} "url_prefix = " .    ( $channel->{'wp:base_blog_url'} // '' ) . "/";
say {$config} "author = " .        ( $author_email                  // '' );
say {$config} "input = publish";

# The content pulled from the WordPress XML export is neither HTML nor
# Markdown, but at least Markdown is the closest and produces okay results.
say {$config} "format = markdown";
say {$config} "pattern = *.md";
say "Chronicle config file written to $dst_dir/config.txt. You make want to tweak it.";

# Buffered write errors only surface at close, so check it.
close $config or die "Can't finish writing $config_path: $!\n";

# Make a publish (and optionally a draft) directory within the content
# directory for posts. An already existing directory is fine; any other
# failure is fatal because no post could be written afterwards.
my @post_dirs = ("$dst_dir/publish");
push @post_dirs, "$dst_dir/draft" if $include_drafts;
for my $post_dir (@post_dirs) {
    next if -d $post_dir;
    mkdir $post_dir or die "Can't create $post_dir: $!\n";
}
# Map of remote attachment URL => site-local path, used to rewrite links
# inside post content.
my %attachment_rewrites;

# Coerce a value taken from the parsed XML to plain text. By default
# XML::Simple represents an empty element as an empty hash ref and a missing
# one as undef; both are treated as the empty string here.
my $text_of = sub {
    my ($value) = @_;
    return ( defined $value && !ref $value ) ? $value : '';
};

# XML::Simple returns a hash ref instead of an array ref when the export
# holds exactly one <item> (unless ForceArray covers it); normalise to a list.
my $raw_items = $wpxml->{'channel'}->{'item'};
my @items =
      ref $raw_items eq 'ARRAY' ? @{$raw_items}
    : ref $raw_items eq 'HASH'  ? ($raw_items)
    :                             ();

# Pass 1: attachments. They are handled before any post is written so that
# link rewriting no longer depends on an attachment appearing ahead of the
# post that uses it in the export file.
if ( $download_attachments || $rewrite_attachments ) {
    ATTACHMENT:
    for my $item (@items) {
        next ATTACHMENT if $text_of->( $item->{'wp:post_type'} ) ne "attachment";

        my $attachment_url = $text_of->( $item->{'wp:attachment_url'} );
        if ( $attachment_url eq '' ) {
            WARN "Ignoring attachment without a wp:attachment_url";
            next ATTACHMENT;
        }

        # Split the URL path into its directories and the file name.
        my @relative_dirs = URI->new($attachment_url)->path_segments;
        my $file_name     = pop @relative_dirs;
        if ( !defined $file_name || $file_name eq '' ) {
            WARN "Ignoring attachment without a file name: $attachment_url";
            next ATTACHMENT;
        }

        # For an absolute URL path the first segment is the empty string, so
        # the joined directories start with "/" and the result reads
        # "attachments/dir/subdir".
        my $local_path = "attachments" . join( '/', @relative_dirs );

        if ($download_attachments) {
            my $target = "$dst_dir/$local_path/$file_name";
            make_path("$dst_dir/$local_path");
            say "GET $attachment_url into $target";

            # Report failed downloads instead of silently leaving a gap.
            my $response = $ua->get( $attachment_url, ':content_file' => $target );
            if ( !$response->is_success ) {
                WARN "Failed to download $attachment_url: " . $response->status_line;
            }
        }
        if ($rewrite_attachments) {
            $attachment_rewrites{$attachment_url} = "/$local_path/$file_name";
        }
    }
}

# Pass 2: posts. Items of any other type (pages, menu entries, ...) are
# skipped without comment.
POST:
for my $item (@items) {
    next POST if $text_of->( $item->{'wp:post_type'} ) ne "post";

    # Only published posts are converted, plus drafts when asked for.
    my $status = $text_of->( $item->{'wp:status'} );
    my $wanted = $status eq "publish" || ( $include_drafts && $status eq "draft" );
    if ( !$wanted ) {
        INFO "Ingoring post with status:" . $status;
        next POST;
    }

    # Use the post_id as the file name, unless the post name was requested
    # and can be taken from the last path component of the link.
    my $link           = $text_of->( $item->{'link'} );
    my $post_file_name = $item->{'wp:post_id'};
    if ($use_post_name_as_file_name) {
        if ( $link =~ m{/([^/]+)/$} ) {
            $post_file_name = $1;
        }
        elsif ( $status eq "publish" ) {
            # Only warn for published posts; for drafts a link without a
            # post name is expected.
            WARN "Falling back to the post_id as we can't extract a post name from the link " . $link;
        }
    }

    # WordPress exports both categories and tags as <category> elements,
    # told apart by their "domain" attribute.
    my @tags;
    my @categories;
    for my $category ( @{ $item->{'category'} // [] } ) {
        next if ref $category ne 'HASH';
        my $domain = $text_of->( $category->{'domain'} );
        if ( $domain eq "category" ) {
            push @categories, $text_of->( $category->{'content'} );
        }
        elsif ( $domain eq "post_tag" ) {
            push @tags, $text_of->( $category->{'content'} );
        }
        else {
            WARN( "Unknown category domain: " . $domain );
        }
    }
    if ($treat_categories_as_tags) {
        push @tags, @categories;
    }

    my $post_file_path = "$dst_dir/$status/$post_file_name.md";
    if ( -e $post_file_path ) {
        WARN "$post_file_path already exists, not going to overwrite it.";
        next POST;
    }

    my $content = $text_of->( $item->{'content:encoded'} );
    if ($rewrite_attachments) {
        for my $attachment_url ( keys %attachment_rewrites ) {
            my $local = $attachment_rewrites{$attachment_url};

            # \Q...\E matches the URL literally; otherwise '.', '?' and '+'
            # in it would be treated as regex metacharacters.
            $content =~ s/href="\Q$attachment_url\E"/href="${local}"/g;

            # The query string is optional. Making the group's content
            # optional (rather than the group itself) keeps $+{arg} defined,
            # and [^"]* stops at the closing quote of this attribute.
            $content =~ s/src="\Q$attachment_url\E(?<arg>(?:\?[^"]*)?)"/src="${local}$+{arg}"/g;
        }
    }

    # Skip this post if its file cannot be created instead of printing to a
    # handle that was never opened.
    my $post_fh;
    if ( !open( $post_fh, '>:encoding(UTF-8)', $post_file_path ) ) {
        ERROR "Can't write $post_file_path: $!";
        next POST;
    }
    print {$post_fh} "Title: " . $text_of->( $item->{'title'} ) . "\n";
    print {$post_fh} "Date: " . $text_of->( $item->{'wp:post_date'} ) . "\n";
    print {$post_fh} "Tags: " . join( ',', @tags ) . "\n";
    print {$post_fh} "\n";    # blank line to separate headers and content
    print {$post_fh} $content;
    close($post_fh) or ERROR "Can't finish writing $post_file_path: $!";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment