Last active
August 29, 2015 14:23
-
-
Save bobular/de9da005ced2a5b60a01 to your computer and use it in GitHub Desktop.
Solr 5.x's bin/post fails on massive JSON files, so this script splits them into smaller chunks on the fly and posts each chunk separately.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# -*- mode: cperl -*-
#
# usage: bin/post-huge-json <opts> huge-file.json
#
# <opts> might be "-c ninjadata"
#
# Splits a large file containing a single JSON array of JSON objects
# into chunks of 100000 objects each, posting every chunk to Solr via
# bin/post <opts>.
#
# Assumes the comma separating objects sits at the end of its own line
# with no preceding alphanumeric characters.

use strict;
use warnings;
use FindBin;

my $file = pop @ARGV
    or die "usage: $0 <bin/post options> huge-file.json\n";
my @opts     = @ARGV;
my $tmpfile  = "tmp_chunk_$$.json";
my $count    = 0;
my $chunksize = 100000;    # not configurable: for simplicity all CLI options pass straight through to bin/post

# three-arg open with lexical handles; the two-arg form would let a
# crafted filename inject an open mode
open(my $chunk_fh, '>', $tmpfile) or die "can't open '$tmpfile' for writing: $!\n";
open(my $in_fh,    '<', $file)    or die "can't open '$file' for reading: $!\n";

while (<$in_fh>) {
    # a line of only non-word characters ending in ',' marks an object
    # boundary; count boundaries and cut a chunk every $chunksize-th one
    if (/^\W*,$/ && ++$count % $chunksize == 0) {
        # finish this chunk: turn the trailing comma into the array's
        # closing bracket
        s/,$/]/;
        print {$chunk_fh} $_;
        close($chunk_fh) or die "error writing '$tmpfile': $!\n";
        post(@opts, $tmpfile);
        # start the next chunk with a fresh opening bracket
        open($chunk_fh, '>', $tmpfile) or die "can't open '$tmpfile' for writing: $!\n";
        print {$chunk_fh} "[\n";
    }
    else {
        print {$chunk_fh} $_;
    }
}
close($in_fh);

# flush and close before posting - otherwise buffered data of the final
# chunk may not be on disk when bin/post reads the file (the source
# file's own ']' already terminates this last chunk's array)
close($chunk_fh) or die "error writing '$tmpfile': $!\n";
post(@opts, $tmpfile);
unlink $tmpfile or warn "couldn't remove '$tmpfile': $!\n";
# Run the bin/post script that lives next to this one, passing the
# given arguments through verbatim.
#
# Uses list-form system so arguments are NOT re-parsed by a shell:
# option values or filenames containing spaces or shell metacharacters
# are passed intact instead of being split or interpolated.  Dies if
# bin/post cannot be started or exits non-zero.
sub post {
    my @args = @_;
    system("$FindBin::Bin/post", @args) == 0
        or die "bin/post @args failed (status $?)\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment