Skip to content

Instantly share code, notes, and snippets.

@yokawasa
Created June 8, 2015 23:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yokawasa/f1cb68cd168f50dbf873 to your computer and use it in GitHub Desktop.
Save yokawasa/f1cb68cd168f50dbf873 to your computer and use it in GitHub Desktop.
Generating JSON Data for Azure Search from Wikipedia Database Dump File
#!/usr/bin/perl -w
use strict;
use XML::Twig;
use Getopt::Std;
use Encode;
use JSON;
# Number of items written to each output JSON file.
my $UPLOAD_THRESHOLD = 100;

# Command line: -c <inputxml> -o <outputpath>
my %opts = ();
getopts('c:o:', \%opts) or die "Wrong Options!\n";
my $inputfile  = $opts{'c'};
my $outputpath = $opts{'o'};

# Test definedness before content: an omitted option is undef, and comparing
# undef with eq emits an "uninitialized value" warning under -w.
usage() unless defined($inputfile)  && length($inputfile)
            && defined($outputpath) && length($outputpath);

# $tmpcounter  - items collected in the batch currently being filled
# $itemcounter - running total of items seen; used as each item's itemid
# $filecounter - index used in the name of the next output file
my ($tmpcounter, $itemcounter, $filecounter) = (0, 0, 0);

# Batch of items waiting to be written out.
my $itemsarr = [];

# Direct method call instead of the indirect-object form "new XML::Twig(...)".
my $twig = XML::Twig->new(
    twig_handlers => { doc => \&doc },
);
$twig->parsefile($inputfile);

# Write out the final, partially filled batch (if any).
if ($tmpcounter) {
    flush2json($itemsarr, "items-$filecounter.json");
}
# XML::Twig handler invoked for each parsed <doc> element.
#
# Builds one item (itemid, title, abstract, url) from the element's children,
# appends it to the pending batch in $itemsarr, and writes the batch to
# "items-<N>.json" via flush2json() each time $UPLOAD_THRESHOLD items have
# accumulated.
#
# Parameters:
#   $twig - the XML::Twig object driving the parse
#   $doc  - the element matched by the 'doc' handler
sub doc {
    my ($twig, $doc) = @_;

    # first_child_text() yields '' when the child is absent, so a <doc>
    # lacking one of the fields does not abort the whole run with
    # "Can't call method text on an undefined value".
    my $item = {
        itemid   => "$itemcounter",
        title    => utf8string($doc->first_child_text('title')),
        abstract => utf8string($doc->first_child_text('abstract')),
        # Encoded like the other text fields so that a non-ASCII URL does not
        # mix character data with the already-encoded byte strings.
        url      => utf8string($doc->first_child_text('url')),
    };

    # Explicit dereference: push on a bare array reference was an
    # experimental feature and is a fatal error since Perl 5.24.
    push @{$itemsarr}, $item;
    $tmpcounter++;
    $itemcounter++;

    # A full batch: write it out and start a new one.
    if ($tmpcounter % $UPLOAD_THRESHOLD == 0) {
        flush2json($itemsarr, "items-$filecounter.json");
        $tmpcounter = 0;
        $filecounter++;
        $itemsarr = [];
    }

    # Discard the parts of the tree that have already been handled.
    $twig->purge;
}
# Serialize a batch of items to a JSON file under $outputpath.
#
# The items are wrapped in a hash under the key "value" before encoding.
#
# Parameters:
#   $iarr - array reference holding the item hashes to write
#   $f    - file name, relative to $outputpath, to create or overwrite
#
# Dies if the file cannot be opened, written, or closed.
sub flush2json {
    my ($iarr, $f) = @_;

    my $outarr = { value => $iarr };
    my $js     = JSON->new->encode($outarr);

    my $path = "$outputpath/$f";

    # Three-argument open with a lexical handle: the path can never be
    # misread as a mode, and the handle cannot clash with a global name.
    open(my $out, '>', $path) or die "can't open: $path: $!\n";
    print {$out} $js or die "can't write: $path: $!\n";

    # Buffered write errors only surface at close, so check it too.
    close($out) or die "can't close: $path: $!\n";
    return;
}
# Print the command-line synopsis to STDERR and terminate with exit status 1.
sub usage {
    my $synopsis = "Usage: $0 -c <inputxml> -o <outputpath>\n";
    print {*STDERR} $synopsis;
    exit 1;
}
# Return the argument as UTF-8 encoded bytes.
#
# A string carrying Perl's internal UTF-8 flag is encoded with
# Encode::encode('utf-8', ...); any other value is returned unchanged.
#
# Parameters:
#   the string to convert
# Returns:
#   the encoded byte string, or the original value when it is not flagged
sub utf8string {
    my ($text) = @_;
    return utf8::is_utf8($text) ? encode('utf-8', $text) : $text;
}
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment