Last active
August 29, 2015 14:23
-
-
Save bobular/de9da005ced2a5b60a01 to your computer and use it in GitHub Desktop.
Solr 5.x's bin/post fails on massive JSON files, so this script splits them into smaller chunks on the fly and posts each chunk separately.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl
# -*- mode: cperl -*-
#
# usage: bin/post-huge-json <opts> huge-file.json
#
# <opts> might be "-c ninjadata"
#
# Splits a large file containing a single JSON array of JSON objects
# into chunks of 100000 objects each, posting every chunk to Solr via
# bin/post <opts>.
#
# Assumes the comma separating objects sits at the end of its own line
# with no preceding alphanumeric characters.

use strict;
use warnings;
use FindBin;

my $file = pop @ARGV
    or die "usage: $0 <bin/post options> huge-file.json\n";
my @opts     = @ARGV;
my $tmpfile  = "tmp_chunk_$$.json";
my $count    = 0;
my $chunksize = 100000;    # not configurable: for simplicity all CLI options pass straight through to bin/post

# three-arg open with lexical handles; the two-arg form would let a
# crafted filename inject an open mode
open(my $chunk_fh, '>', $tmpfile) or die "can't open '$tmpfile' for writing: $!\n";
open(my $in_fh,    '<', $file)    or die "can't open '$file' for reading: $!\n";

while (<$in_fh>) {
    # a line of only non-word characters ending in ',' marks an object
    # boundary; count boundaries and cut a chunk every $chunksize-th one
    if (/^\W*,$/ && ++$count % $chunksize == 0) {
        # finish this chunk: turn the trailing comma into the array's
        # closing bracket
        s/,$/]/;
        print {$chunk_fh} $_;
        close($chunk_fh) or die "error writing '$tmpfile': $!\n";
        post(@opts, $tmpfile);
        # start the next chunk with a fresh opening bracket
        open($chunk_fh, '>', $tmpfile) or die "can't open '$tmpfile' for writing: $!\n";
        print {$chunk_fh} "[\n";
    }
    else {
        print {$chunk_fh} $_;
    }
}
close($in_fh);

# flush and close before posting - otherwise buffered data of the final
# chunk may not be on disk when bin/post reads the file (the source
# file's own ']' already terminates this last chunk's array)
close($chunk_fh) or die "error writing '$tmpfile': $!\n";
post(@opts, $tmpfile);
unlink $tmpfile or warn "couldn't remove '$tmpfile': $!\n";
# Run the bin/post script that lives next to this one, passing the
# given arguments through verbatim.
#
# Uses list-form system so arguments are NOT re-parsed by a shell:
# option values or filenames containing spaces or shell metacharacters
# are passed intact instead of being split or interpolated.  Dies if
# bin/post cannot be started or exits non-zero.
sub post {
    my @args = @_;
    system("$FindBin::Bin/post", @args) == 0
        or die "bin/post @args failed (status $?)\n";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment