dougalcampbell/wxrsplit.pl

## wxrsplit.pl
#!/usr/bin/perl -w
#
# wxrsplit - Split a WordPress WXR file into multiple output files, each
#   with a maximum filesize.
#
# NOTE: Because this tool attempts to keep items intact within each output
#   file, it is possible to exceed the specified max filesize. Comments are
#   contained within a post item, so a post with many comments could
#   conceivably generate a very large item size. There probably is not a
#   practical way around this.
#
# @author: Dougal Campbell <dougal@dougal.us>
# @license: MIT / GPL2

use strict;
use Getopt::Long;

## Defaults
my $filename = 'output.wxr';
# Can use 'K' for kilobytes or 'M' for megabytes (not case-sensitive)
my $size = '2M';

my $help = 0;

# If we get this many bytes into the file without finding proof that it's
# a WXR, abort. Since the entire WXR export of a fresh WP 2.5 install,
# including the 'Hello, World' post and comment, is only about 5K, a
# value of 8K should be more than safe:
my $hdrsz = '8192';

## Parse options. GetOptions() reports anything it cannot parse on STDERR
## and returns false; stop there instead of running with half-parsed options.
my $result = GetOptions('f=s' => \$filename, 's=s' => \$size, 'help' => \$help);
$result or die usage();

if ($help) {
	die usage();
}

## Options valid?
checkfile($filename) or warn "File '$filename' not found.\n" and die usage();
my $filesize = parsesize($size) or warn "Filesize '$size' not recognized.\n" and die usage();

# Three-argument open, so the filename is never parsed for a mode or pipe.
# The handle stays a bareword because getheader() reads from WXR directly.
open(WXR, '<', $filename) or die "Could not open '$filename' for reading: $!\n";

my $header = getheader() or die "Could not parse header. Is this a WXR file?\n";

my $headersize = length($header);

# input record separator: every read returns the text up to (and including)
# the next '<item>' tag, i.e. one item per record.
$/ = '<item>';

## Find the first item
seek WXR, $headersize, 0;

# Name of output file number N. Input names containing '.wxr' get '-N'
# inserted in front of it; any other name gets '-N.wxr' appended so that an
# output file can never have the same name as the input file.
my $chunkname = sub {
	my $n = shift;
	my $name = $filename;
	$name =~ s/\.wxr/-$n.wxr/ or $name .= "-$n.wxr";
	return $name;
};

my $chunk = '';
my $chunksize = 0;
my $i = 1;
my $file = '';

# NOTE(review): every output file starts with the channel header, but only
# the text of the last item carries whatever follows the final </item> in
# the input. Confirm the importer accepts the earlier files as they are.
while (<WXR>) {
	# Strip the trailing '<item>' separator from the record.
	chomp;

	## first chunk is probably just whitespace between the channel info
	## and the start of the first item. Skip it, if so:
	next unless m|</item>|s;

	my $item = "<item>\n" . $_;
	my $itemsize = length($item);

	# Flush the pending items when adding this one would reach the limit.
	# Nothing is flushed while the chunk is still empty, so one oversized
	# item never produces a file that contains only the header.
	if ($chunksize > 0 && ($headersize + $chunksize + $itemsize) >= $filesize) {
		$file = $chunkname->($i);
		writechunk($header . $chunk, $file);

		# Start the next chunk empty; the current item is appended below
		# exactly once.
		$chunk = '';
		++$i;
	}

	$chunk .= $item;
	$chunksize = length($chunk);

}

## Write final chunk. The name is derived from the current counter so it
## cannot collide with a file that was flushed inside the loop.
if ($chunk) {
	$file = $chunkname->($i);
	writechunk($header . $chunk, $file);
}

close(WXR);

print "Done.\n";

######
sub usage {
	# Print the help text on STDERR and hand back a bare newline, so that
	# "die usage()" ends the program without a file/line suffix.
	my $text = <<USAGE;

Usage: wxrsplit [opts]
  Options:
    -f filename (defaults to 'output.wxr')
    -s SIZE (defaults to 2M)

Split a WXR file into multiple pieces, keeping each piece below a given
size.
USAGE

	warn $text;

	return "\n";
}

sub checkfile {
	# Returns the given path when it names an existing plain file,
	# undef otherwise.
	my ($path) = @_;

	return undef unless -f $path;

	return $path;
}

sub parsesize {
	# Convert a size such as '512', '10k' or '2M' into a number of bytes.
	#
	# Accepts a run of digits with an optional one-character unit: 'k' for
	# kilobytes or 'm' for megabytes, in either case.
	# Returns the byte count, or undef when the text is not a size, uses an
	# unknown unit, or is zero.
	my $size = shift;

	my %mult_for = (
		''  => 1,
		'k' => 1024,
		'm' => 1024 * 1024,
	);

	return undef unless defined $size;

	# Take the captures from the match itself. After a failed match $1/$2
	# still hold whatever the last successful match (possibly in the caller)
	# left behind, so they must not be read unconditionally.
	my ($num, $unit) = $size =~ m/^(\d+)(.?)$/
		or return undef;

	# A size of zero is not usable.
	$num += 0;
	return undef unless $num;

	$unit = lc($unit);
	return undef unless exists $mult_for{$unit};

	return $num * $mult_for{$unit};
}

sub getheader {
	# Return everything in the WXR filehandle that precedes the first
	# '<item>' tag, or undef when the input does not look like a WXR file or
	# contains no item. On success the handle is rewound to the start.
	#
	# Reads the file-level $hdrsz and the global WXR filehandle.
	my $marker = '<item>';
	my $header = '';

	# read() gives undef on error and 0 at end of file; neither can be a WXR.
	my $bytes = read(WXR, $header, $hdrsz);
	return undef unless $bytes;

	## Is this really a WXR file? The namespace declaration has to show up
	## within the first $hdrsz bytes.
	return undef
		unless $header =~ m|xmlns:wp="http://wordpress[.]org/export/\d+[.]\d+/"|s;

	# The channel information in front of the first item can be longer than
	# $hdrsz, so keep appending blocks until the marker shows up or the file
	# ends.
	my $pos = index($header, $marker);
	while ($pos < 0) {
		# Resume the search a little before the old end of the buffer, in
		# case the marker straddles two reads.
		my $resume = length($header) - (length($marker) - 1);
		$resume = 0 if $resume < 0;

		my $more = read(WXR, $header, $hdrsz, length($header));
		last unless $more;

		$pos = index($header, $marker, $resume);
	}

	## rewind file
	seek WXR, 0, 0;

	return undef if $pos < 0;

	# As before, an empty (false) header counts as "not found".
	my $prefix = substr($header, 0, $pos);
	return $prefix ? $prefix : undef;
}

sub writechunk {
	# Write $text to $outfile, replacing any existing file of that name.
	# Dies with the system error when the file cannot be opened, written or
	# closed, so a failed write is never reported as success.
	my ($text, $outfile) = @_;

	# Three-argument open with a lexical handle: the name is taken literally
	# and the handle cannot clash with any other global filehandle.
	open(my $out, '>', $outfile)
		or die "Could not open '$outfile' for writing: $!\n";

	print {$out} $text
		or die "Could not write to '$outfile': $!\n";

	# Buffered write errors only surface when the handle is closed.
	close($out)
		or die "Could not close '$outfile': $!\n";

	return 1;
}
	#!/usr/bin/perl -w
	#
	# wxrsplit - Split a WordPress WXR file into multiple output files, each
	# with a maximum filesize.
	#
	# NOTE: Because this tool attempts to keep items intact within each output
	# file, it is possible to exceed the specified max filesize. Comments are
	# contained within a post item, so a post with many comments could
	# conceivably generate a very large item size. There probably is not a
	# practical way around this.
	#
	# @author: Dougal Campbell <dougal@dougal.us>
	# @license: MIT / GPL2

	use strict;
	use Getopt::Long;

	## Defaults
	my $filename = 'output.wxr';
	# Can use 'K' for kilobytes or 'M' for megabytes (not case-sensitive)
	my $size = '2M';

	my $help = 0;

	# If we get this many bytes into the file without finding proof that it's
	# a WXR, abort. Since the entire WXR export of a fresh WP 2.5 install,
	# including the 'Hello, World' post and comment, is only about 5K, a
	# value of 8K should be more than safe:
	my $hdrsz = '8192';

	## Parse options. GetOptions() reports anything it cannot parse on STDERR
	## and returns false; stop there instead of running with half-parsed options.
	my $result = GetOptions('f=s' => \$filename, 's=s' => \$size, 'help' => \$help);
	$result or die usage();

	if ($help) {
		die usage();
	}

	## Options valid?
	checkfile($filename) or warn "File '$filename' not found.\n" and die usage();
	my $filesize = parsesize($size) or warn "Filesize '$size' not recognized.\n" and die usage();

	# Three-argument open, so the filename is never parsed for a mode or pipe.
	# The handle stays a bareword because getheader() reads from WXR directly.
	open(WXR, '<', $filename) or die "Could not open '$filename' for reading: $!\n";

	my $header = getheader() or die "Could not parse header. Is this a WXR file?\n";

	my $headersize = length($header);

	# input record separator: every read returns the text up to (and including)
	# the next '<item>' tag, i.e. one item per record.
	$/ = '<item>';

	## Find the first item
	seek WXR, $headersize, 0;

	# Name of output file number N. Input names containing '.wxr' get '-N'
	# inserted in front of it; any other name gets '-N.wxr' appended so that an
	# output file can never have the same name as the input file.
	my $chunkname = sub {
		my $n = shift;
		my $name = $filename;
		$name =~ s/\.wxr/-$n.wxr/ or $name .= "-$n.wxr";
		return $name;
	};

	my $chunk = '';
	my $chunksize = 0;
	my $i = 1;
	my $file = '';

	# NOTE(review): every output file starts with the channel header, but only
	# the text of the last item carries whatever follows the final </item> in
	# the input. Confirm the importer accepts the earlier files as they are.
	while (<WXR>) {
		# Strip the trailing '<item>' separator from the record.
		chomp;

		## first chunk is probably just whitespace between the channel info
		## and the start of the first item. Skip it, if so:
		next unless m|</item>|s;

		my $item = "<item>\n" . $_;
		my $itemsize = length($item);

		# Flush the pending items when adding this one would reach the limit.
		# Nothing is flushed while the chunk is still empty, so one oversized
		# item never produces a file that contains only the header.
		if ($chunksize > 0 && ($headersize + $chunksize + $itemsize) >= $filesize) {
			$file = $chunkname->($i);
			writechunk($header . $chunk, $file);

			# Start the next chunk empty; the current item is appended below
			# exactly once.
			$chunk = '';
			++$i;
		}

		$chunk .= $item;
		$chunksize = length($chunk);

	}

	## Write final chunk. The name is derived from the current counter so it
	## cannot collide with a file that was flushed inside the loop.
	if ($chunk) {
		$file = $chunkname->($i);
		writechunk($header . $chunk, $file);
	}

	close(WXR);

	print "Done.\n";

	######
	sub usage {
		# Print the help text on STDERR and hand back a bare newline, so that
		# "die usage()" ends the program without a file/line suffix.
		#
		# The heredoc body and its terminator sit at column 0 on purpose: a
		# plain <<USAGE heredoc only ends at an unindented terminator, and
		# the text is emitted exactly as laid out here.
		my $text = <<USAGE;

Usage: wxrsplit [opts]
  Options:
    -f filename (defaults to 'output.wxr')
    -s SIZE (defaults to 2M)

Split a WXR file into multiple pieces, keeping each piece below a given
size.
USAGE

		warn $text;

		return "\n";
	}

	sub checkfile {
		# Returns the given path when it names an existing plain file,
		# undef otherwise.
		my ($path) = @_;

		return undef unless -f $path;

		return $path;
	}

	sub parsesize {
		# Convert a size such as '512', '10k' or '2M' into a number of bytes.
		#
		# Accepts a run of digits with an optional one-character unit: 'k'
		# for kilobytes or 'm' for megabytes, in either case.
		# Returns the byte count, or undef when the text is not a size, uses
		# an unknown unit, or is zero.
		my $size = shift;

		my %mult_for = (
			''  => 1,
			'k' => 1024,
			'm' => 1024 * 1024,
		);

		return undef unless defined $size;

		# Take the captures from the match itself. After a failed match
		# $1/$2 still hold whatever the last successful match (possibly in
		# the caller) left behind, so they must not be read unconditionally.
		my ($num, $unit) = $size =~ m/^(\d+)(.?)$/
			or return undef;

		# A size of zero is not usable.
		$num += 0;
		return undef unless $num;

		$unit = lc($unit);
		return undef unless exists $mult_for{$unit};

		return $num * $mult_for{$unit};
	}

	sub getheader {
		# Return everything in the WXR filehandle that precedes the first
		# '<item>' tag, or undef when the input does not look like a WXR file
		# or contains no item. On success the handle is rewound to the start.
		#
		# Reads the file-level $hdrsz and the global WXR filehandle.
		my $marker = '<item>';
		my $header = '';

		# read() gives undef on error and 0 at end of file; neither can be a
		# WXR.
		my $bytes = read(WXR, $header, $hdrsz);
		return undef unless $bytes;

		## Is this really a WXR file? The namespace declaration has to show
		## up within the first $hdrsz bytes.
		return undef
			unless $header =~ m|xmlns:wp="http://wordpress[.]org/export/\d+[.]\d+/"|s;

		# The channel information in front of the first item can be longer
		# than $hdrsz, so keep appending blocks until the marker shows up or
		# the file ends.
		my $pos = index($header, $marker);
		while ($pos < 0) {
			# Resume the search a little before the old end of the buffer,
			# in case the marker straddles two reads.
			my $resume = length($header) - (length($marker) - 1);
			$resume = 0 if $resume < 0;

			my $more = read(WXR, $header, $hdrsz, length($header));
			last unless $more;

			$pos = index($header, $marker, $resume);
		}

		## rewind file
		seek WXR, 0, 0;

		return undef if $pos < 0;

		# As before, an empty (false) header counts as "not found".
		my $prefix = substr($header, 0, $pos);
		return $prefix ? $prefix : undef;
	}

	sub writechunk {
		# Write $text to $outfile, replacing any existing file of that name.
		# Dies with the system error when the file cannot be opened, written
		# or closed, so a failed write is never reported as success.
		my ($text, $outfile) = @_;

		# Three-argument open with a lexical handle: the name is taken
		# literally and the handle cannot clash with any other global
		# filehandle.
		open(my $out, '>', $outfile)
			or die "Could not open '$outfile' for writing: $!\n";

		print {$out} $text
			or die "Could not write to '$outfile': $!\n";

		# Buffered write errors only surface when the handle is closed.
		close($out)
			or die "Could not close '$outfile': $!\n";

		return 1;
	}