Created
March 1, 2016 17:21
-
-
Save dougalcampbell/89d63c0848e0e3965711 to your computer and use it in GitHub Desktop.
A perl script to split WordPress WXR export files into multiple, smaller files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
#
# wxrsplit - Split a WordPress WXR file into multiple output files, each
# with a maximum filesize.
#
# NOTE: Because this tool attempts to keep items intact within each output
# file, it is possible to exceed the specified max filesize. Comments are
# contained within a post item, so a post with many comments could
# conceivably generate a very large item size. There probably is not a
# practical way around this.
#
# @author: Dougal Campbell <dougal@dougal.us>
# @license: MIT / GPL2
use strict;
use warnings;
use Getopt::Long;

## Defaults
my $filename = 'output.wxr';

# Can use 'K' for kilobytes or 'M' for megabytes (not case-sensitive)
my $size = '2M';
my $help = 0;

# If we get this many bytes into the file without finding proof that it's
# a WXR, abort. Since the entire WXR export of a fresh WP 2.5 install,
# including the 'Hello, World' post and comment, is only about 5K, a
# value of 8K should be more than safe:
my $hdrsz = '8192';

## Parse options (unknown or malformed options are reported, then usage shown)
GetOptions('f=s' => \$filename, 's=s' => \$size, 'help' => \$help)
    or die usage();
die usage() if $help;

## Options valid?
unless (checkfile($filename)) {
    warn "File '$filename' not found.\n";
    die usage();
}
my $filesize = parsesize($size);
unless ($filesize) {
    warn "Filesize '$size' not recognized.\n";
    die usage();
}

# Three-argument open, so the file name can never be read as a mode or pipe.
# The handle stays the bareword WXR because getheader() reads from it.
open WXR, '<', $filename
    or die "Could not open '$filename' for reading: $!\n";

my $header = getheader()
    or die "Could not parse header. Is this a WXR file?\n";
my $headersize = length($header);

# Output files are named by inserting "-N" before the input's extension
# (or appending it when there is none). The name therefore always differs
# from the input, whatever the extension is, and the input is never
# overwritten while it is still being read.
my ($stem, $ext) = $filename =~ m{\A(.*?)(\.[^./\\]+)?\z}s;
$ext = '' unless defined $ext;
my $outname = sub { return "$stem-$_[0]$ext" };

# Whatever follows the last </item> (normally "</channel></rss>") has to
# close every output file, not only the last one. Look for it in the tail
# of the input before splitting starts.
my $closetag = '</item>';
my $footer   = '';
{
    my $insize = -s $filename;
    my $tailsz = $insize < $hdrsz ? $insize : $hdrsz;
    my $tail   = '';
    seek WXR, -$tailsz, 2;
    read WXR, $tail, $tailsz;
    my $pos = rindex($tail, $closetag);
    $footer = substr($tail, $pos + length $closetag) if $pos >= 0;
    $footer =~ s/\A\s+//;
}

# input record separator:
$/ = '<item>';

## Find the first item
seek WXR, $headersize, 0;

my $chunk   = '';    # items collected for the output file being built
my $trailer = '';    # text found after </item> in the most recent record
my $i       = 1;     # sequence number of the output file being built

while (my $record = <WXR>) {
    chomp $record;

    ## first chunk is probably just whitespace between the channel info
    ## and the start of the first item. Skip it, if so:
    my $end = rindex($record, $closetag);
    next if $end < 0;
    $end += length $closetag;

    # Keep the item itself; what trails it is inter-item whitespace, or the
    # document footer in the case of the very last record.
    $trailer = substr($record, $end);
    $trailer =~ s/\A\s+//;
    my $item = "<item>\n" . substr($record, 0, $end) . "\n";

    # Flush BEFORE adding an item that would push the file over the limit.
    # Never flush an empty chunk: a single oversized item still gets a file
    # of its own instead of producing a header-only file.
    my $projected = $headersize + length($chunk) + length($item) + length($footer);
    if (length($chunk) and $projected >= $filesize) {
        writechunk($header . $chunk . $footer, $outname->($i));
        ++$i;
        $chunk = '';
    }
    $chunk .= $item;
}
close WXR;

## Write final chunk, closed by the footer that actually ended the input.
if (length $chunk) {
    my $closing = length($trailer) ? $trailer : $footer;
    writechunk($header . $chunk . $closing, $outname->($i));
}

print "Done.\n";
###### | |
# Print the usage summary on STDERR (through warn) and hand back a bare
# newline, so that callers can write `die usage();` and terminate without
# Perl appending a "at FILE line N" suffix.
sub usage {
    my $summary = <<'USAGE';
Usage: wxrsplit [opts]
Options:
-f filename (defaults to 'output.wxr')
-s SIZE (defaults to 2M)
Split a WXR file into multiple pieces, keeping each piece below a given
size.
USAGE
    warn $summary;

    # suppress line number reporting from die()
    return "\n";
}
# Return the given path when it names an existing plain file (-f),
# undef otherwise.
sub checkfile {
    my ($path) = @_;
    return -f $path ? $path : undef;
}
# Convert a human-readable size into a number of bytes.
#
# Accepts a positive integer optionally followed by a single unit letter:
# 'K' (x 1024) or 'M' (x 1024 * 1024), in either case. A bare number is
# taken as bytes.
#
# Returns the size in bytes, or undef when the input is undefined, is not
# in that format, uses an unknown unit, or amounts to zero.
sub parsesize {
    my $size = shift;
    return undef unless defined $size;

    # Take the captures from the match result itself. After a FAILED match
    # $1/$2 still hold whatever the caller's last successful match left
    # behind, so reading them unconditionally could turn garbage input into
    # a bogus size.
    my ($num, $unit) = $size =~ m/^(\d+)([km]?)$/i
        or return undef;

    # A size of zero is not usable as a split threshold.
    return undef unless $num > 0;

    my %bytes_per = (
        ''  => 1,
        'k' => 1024,
        'm' => 1024 * 1024,
    );
    return $num * $bytes_per{ lc $unit };
}
# Read the channel header of the WXR file open on the global WXR handle.
#
# The first $hdrsz bytes must contain the WordPress export namespace
# declaration; if they do not, the file is not treated as a WXR and undef
# is returned. Once the namespace has been seen, reading continues in
# $hdrsz-sized steps until the first <item> element shows up, so a channel
# header longer than $hdrsz (many categories, tags or authors) is still
# accepted.
#
# Returns everything that precedes the first <item>, or undef when the
# file is not a WXR, contains no <item>, or has nothing in front of it.
# When the namespace check passes, the handle is rewound to the start of
# the file before returning.
sub getheader {
    my $marker = '<item>';
    my $header = '';

    read(WXR, $header, $hdrsz);

    ## Is this really a WXR file?
    return undef
        unless $header =~ m|xmlns:wp="http://wordpress[.]org/export/\d+[.]\d+/"|s;

    ## Position of the first <item> element, reading further if necessary:
    my $pos = index($header, $marker);
    while ($pos < 0) {
        my $seen = length $header;
        my $more = read(WXR, $header, $hdrsz, $seen);
        last unless $more;    # end of file (0) or read error (undef)

        # Resume a little before the old end of the buffer so that a
        # marker straddling two reads is still found.
        my $from = $seen - length($marker) + 1;
        $from = 0 if $from < 0;
        $pos = index($header, $marker, $from);
    }

    ## rewind file
    seek WXR, 0, 0;

    # No <item> at all, or nothing in front of it: no usable header.
    return undef if $pos <= 0;
    return substr($header, 0, $pos);
}
# Write $text to $outfile, replacing any previous content of that file.
#
# Dies with a message naming the file when it cannot be opened, written or
# closed, so that a failed write can never be mistaken for success.
# Returns 1 on success.
sub writechunk {
    my ($text, $outfile) = @_;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and no global handle is left behind.
    open my $out, '>', $outfile
        or die "Could not open '$outfile' for writing: $!\n";
    print {$out} $text
        or die "Could not write to '$outfile': $!\n";

    # Buffered write errors (e.g. a full disk) only surface at close.
    close $out
        or die "Could not finish writing '$outfile': $!\n";

    return 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment