Match authors against VIAF using Catmandu and Linked Data Fragments
#!/usr/bin/env perl | |
# | |
# Match authors against VIAF | |
# | |
# License: http://dev.perl.org/licenses/artistic.html | |
# | |
# Author: Patrick Hochstenbach <Patrick.Hochstenbach@UGent.be> | |
# | |
# Apr 2015 | |
$|++; | |
use Catmandu::Sane; | |
use Catmandu; | |
use Catmandu::Fix::Inline::marc_map qw(:all); | |
use RDF::LDF; | |
use Data::Dumper; | |
use Cache::LRU; | |
use Getopt::Long; | |
Catmandu->load(':up'); | |
my $store = 'rug01'; | |
my $type = 'ALEPHSEQ'; | |
my $fix = undef; | |
GetOptions("fix=s" => \$fix , "store=s" => \$store , "type=s" => \$type); | |
my $query = shift; | |
unless ($query) { | |
print STDERR <<EOF; | |
usage: $0 [--fix fix] [--type USMARC|JSON|XML|RAW|ALEPHSEQ] file | |
usage: $0 [--fix fix] [--store store] query | |
usage: $0 [--fix fix] [--store store] all | |
EOF | |
exit(1); | |
} | |
$query = undef if $query eq 'all'; | |
my $viaf_endpoint = 'http://data.linkeddatafragments.org/viaf'; | |
my $client = RDF::LDF->new(url => $viaf_endpoint); | |
my $cache = Cache::LRU->new(size => 10000); | |
my $iterator; | |
if (-r $query) { | |
$iterator = Catmandu->importer('MARC',file => $query, type => $type); | |
} | |
else { | |
$iterator = Catmandu->store($store)->bag->searcher(query => $query); | |
} | |
binmode(STDOUT,':encoding(UTF-8)'); | |
&do_import($fix,$query); | |
sub do_import { | |
my $fix = shift; | |
my $query = shift; | |
my $fixer; | |
$fixer = Catmandu->fixer($fix) if defined $fix; | |
my $n = $iterator->each(sub { | |
my $item = shift; | |
my $id = $item->{_id}; | |
my $record = $item->{record}; | |
my $marc; | |
my $found = 0; | |
for my $field (@$record) { | |
my ($tag,$ind1,$ind2,@data) = @$field; | |
next unless ($tag eq '100' || $tag eq '700'); | |
my $doc = { record => [['100',$ind1,$ind2,@data]] }; | |
my @aut = flat(marc_map($doc,'100ad' , -split => 1 , -pluck => 1)); | |
$marc .= &alephseq($id,@$field); | |
if (@aut == 0) { | |
$marc .= "\n"; | |
next; | |
} | |
my ($name,$date) = @aut; | |
$name =~ s/,$//; | |
if (defined $date && $date =~/^(\d{4}-(\d{4})?)$/) { | |
$date = $1; | |
} | |
else { | |
$marc .= "\n"; | |
next; | |
} | |
my @res = &get_viaf_id($name,$date); | |
if (@res == 1) { | |
my $uri = pop @res; | |
$marc .= "\t$uri\n"; | |
$found = 1; | |
} | |
else { | |
my $num = int(@res); | |
# more than one hit | |
$marc .= "\n"; | |
} | |
} | |
print $marc if $found; | |
}); | |
print STDERR "Processed $n rug01 records\n"; | |
} | |
sub alephseq { | |
my ($id,$tag,$ind1,$ind2,@data) = @_; | |
my $str = "$id $tag$ind1$ind2 L "; | |
for (my $i = 0 ; $i < @data ; $i += 2) { | |
if ($data[$i] eq '_') { | |
$str .= $data[$i+1]; | |
} | |
else { | |
$str .= "\$\$" . $data[$i] . $data[$i+1]; | |
} | |
} | |
$str; | |
} | |
sub get_viaf_id { | |
my ($name,$date) = @_; | |
my $key = "\"$name, $date\""; | |
if (defined(my $value = $cache->get($key))) { | |
return @$value; | |
} | |
else { | |
my $value = &ldf_query($key); | |
$cache->set($key => $value); | |
return @$value; | |
} | |
} | |
sub ldf_query { | |
my $object = shift; | |
my $it = $client->get_statements(undef, 'http://schema.org/alternateName', $object); | |
my @res = (); | |
while (my $st = $it->()) { | |
push @res, $st->subject->uri; | |
} | |
return \@res; | |
} | |
sub flat(@) { | |
return map { ref eq 'ARRAY' ? @$_ : $_ } @_; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment