Skip to content

Instantly share code, notes, and snippets.

@dougalcampbell
Created March 1, 2016 17:21
Show Gist options
  • Save dougalcampbell/89d63c0848e0e3965711 to your computer and use it in GitHub Desktop.
Save dougalcampbell/89d63c0848e0e3965711 to your computer and use it in GitHub Desktop.
A perl script to split WordPress WXR export files into multiple, smaller files
#!/usr/bin/perl -w
#
# wxrsplit - Split a WordPress WXR file into multiple output files, each
# with a maximum filesize.
#
# NOTE: Because this tool attempts to keep items intact within each output
# file, it is possible to exceed the specified max filesize. Comments are
# contained within a post item, so a post with many comments could
# conceivably generate a very large item size. There probably is not a
# practical way around this.
#
# @author: Dougal Campbell <dougal@dougal.us>
# @license: MIT / GPL2
use strict;
use Getopt::Long;
## Defaults
my $filename = 'output.wxr';
# Can use 'K' for kilobytes or 'M' for megabytes (not case-sensitive)
my $size = '2M';
my $help = 0;
# If we get this many bytes into the file without finding proof that it's
# a WXR, abort. Since the entire WXR export of a fresh WP 2.5 install,
# including the 'Hello, World' post and comment, is only about 5K, a
# value of 8K should be more than safe:
my $hdrsz = '8192';

## Parse options
my $result = GetOptions('f=s' => \$filename, 's=s' => \$size, 'help' => \$help);
# GetOptions returns false when it met an option it could not parse.
die usage() unless $result;
if ($help) {
    die usage();
}

## Options valid?
checkfile($filename) or warn "File '$filename' not found.\n" and die usage();
my $filesize = parsesize($size) or warn "Filesize '$size' not recognized.\n" and die usage();

# Three-argument open so that the filename is never interpreted as a mode.
# The bareword handle WXR is kept because getheader() reads from it.
open(WXR, '<', $filename) or die "Could not open '$filename' for reading: $!\n";
my $header = getheader() or die "Could not parse header. Is this a WXR file?\n";
my $headersize = length($header);

## Output file naming: insert "-N" in front of the extension, whatever it
## is ("export.xml" -> "export-1.xml"), or append it when there is none.
## This guarantees an output name never equals the input name.
my ($outbase, $outext) = $filename =~ m{\A(.+?)(\.[^./\\]+)?\z}s;
$outbase = $filename unless defined $outbase;
$outext  = ''        unless defined $outext;

# Closing markup appended to every output file so that each one is a
# complete document. Replaced by the input's own trailer once it is seen.
my $closing = "</channel>\n</rss>\n";
my $itemend = '</item>';

# input record separator:
$/ = '<item>';

## Find the first item
seek WXR, $headersize, 0;

my $chunk = '';
my $i = 1;
while (my $record = <WXR>) {
    # Strip the trailing '<item>' separator.
    chomp $record;
    ## first chunk is probably just whitespace between the channel info
    ## and the start of the first item. Skip it, if so:
    next if index($record, $itemend) < 0;

    if (eof(WXR)) {
        # Last record: whatever follows the final </item> is the document
        # trailer, not part of the item. Split it off.
        my $cut = rindex($record, $itemend) + length($itemend);
        my $trailer = substr($record, $cut);
        $record = substr($record, 0, $cut) . "\n";
        $trailer =~ s/\A\s+//;
        $closing = $trailer if $trailer ne '';
    }

    my $item = "<item>\n" . $record;
    my $projected = $headersize + length($chunk) + length($item) + length($closing);

    # Flush only a non-empty chunk: a single oversized item still has to
    # go somewhere, and a header-only file is useless.
    if ($chunk ne '' and $projected >= $filesize) {
        writechunk($header . $chunk . $closing, "${outbase}-${i}${outext}");
        ++$i;
        # Start the next chunk empty; the current item is appended below.
        $chunk = '';
    }
    $chunk .= $item;
}
close WXR;

## Write final chunk.
if ($chunk ne '') {
    writechunk($header . $chunk . $closing, "${outbase}-${i}${outext}");
}
else {
    warn "No items found in '$filename'; nothing written.\n";
}
print "Done.\n";
######
# usage()
#
# Emits the usage summary through warn() and returns a lone newline, so
# that callers can write `die usage();` and terminate without die()
# appending a "at FILE line N" suffix.
sub usage {
    my @help = (
        q{Usage: wxrsplit [opts]},
        q{Options:},
        q{-f filename (defaults to 'output.wxr')},
        q{-s SIZE (defaults to 2M)},
        q{Split a WXR file into multiple pieces, keeping each piece below a given},
        q{size.},
    );
    my $message = join("\n", @help) . "\n";
    warn $message;
    # suppress line number reporting from die()
    return "\n";
}
# checkfile($path)
#
# Returns $path unchanged when it names a plain file (-f), otherwise
# returns undef.
sub checkfile {
    my ($path) = @_;
    return (-f $path) ? $path : undef;
}
# parsesize($size)
#
# Converts a size specification into a byte count.
#
#   $size - a positive decimal integer, optionally followed by a single
#           unit letter: 'K' (x 1024) or 'M' (x 1024 * 1024), in either
#           case. No unit means plain bytes.
#
# Returns the size in bytes, or nothing (undef in scalar context) when the
# specification is malformed, uses an unknown unit, or evaluates to zero.
sub parsesize {
    my $size = shift;
    return unless defined $size;

    my %multiplier_for = (
        ''  => 1,
        'k' => 1024,
        'm' => 1024 * 1024,
    );

    # Only trust the captures when the match itself succeeded: $1/$2 are
    # dynamically scoped, so after a failed match they would still hold
    # whatever the caller's last successful match captured.
    my ($num, $unit) = $size =~ m/\A(\d+)([kKmM]?)\z/
        or return;

    # A size of zero is not a usable limit.
    $num += 0;
    return unless $num;

    return $num * $multiplier_for{ lc $unit };
}
# getheader()
#
# Reads the start of the already-open WXR filehandle and returns everything
# that precedes the first <item> element (the XML prolog and channel
# information).
#
# The WXR namespace declaration must appear within the first $hdrsz bytes;
# otherwise the file is not considered a WXR export. The first <item> may
# lie beyond that point (exports with many categories, tags or authors have
# a long preamble), so reading continues in $hdrsz-sized blocks until the
# marker is found or the file ends. Note that a file containing no <item>
# at all is therefore read completely before being rejected.
#
# The filehandle is rewound to the start before returning. Returns nothing
# (undef in scalar context) when the read fails, the file is not a WXR, no
# <item> is found, or the text before it is empty.
sub getheader {
    my $marker = '<item>';
    my $buffer = '';

    my $got = read(WXR, $buffer, $hdrsz);
    # undef means a read error, 0 means an empty file.
    return unless $got;

    ## Is this really a WXR file?
    my $iswxr = $buffer =~ m|xmlns:wp="http://wordpress[.]org/export/\d+[.]\d+/"|;

    my $cut = $iswxr ? index($buffer, $marker) : -1;
    while ($iswxr and $cut < 0) {
        # Resume the search a few bytes back, in case the marker straddles
        # the boundary between two reads.
        my $resume = length($buffer) - (length($marker) - 1);
        $resume = 0 if $resume < 0;

        # Append the next block to what has been read so far.
        my $more = read(WXR, $buffer, $hdrsz, length($buffer));
        last unless $more;
        $cut = index($buffer, $marker, $resume);
    }

    ## rewind file
    seek WXR, 0, 0;

    return if $cut <= 0;
    return substr($buffer, 0, $cut);
}
# writechunk($text, $outfile)
#
# Writes $text to $outfile, replacing any existing file of that name.
#
# Dies with a message naming the file and the system error if the file
# cannot be opened, written or closed: a silently missing chunk would mean
# lost posts. Returns 1 on success.
sub writechunk {
    my ($text, $outfile) = @_;

    # Three-argument open: the name is taken literally, never parsed for
    # a mode or stripped of whitespace.
    open(my $out, '>', $outfile)
        or die "Could not open '$outfile' for writing: $!\n";
    print {$out} $text
        or die "Could not write to '$outfile': $!\n";
    # Buffered write errors (e.g. disk full) only surface at close.
    close($out)
        or die "Could not close '$outfile': $!\n";

    return 1;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment