Skip to content

Instantly share code, notes, and snippets.

@gonter
Last active August 9, 2021 05:26
Show Gist options
  • Save gonter/b6525395d3328ff32e6427d7cdbb406b to your computer and use it in GitHub Desktop.
Save gonter/b6525395d3328ff32e6427d7cdbb406b to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use Util::JSON;
use Data::Dumper;
$Data::Dumper::Indent= 1;
use FileHandle;
binmode(STDOUT, ':utf8');
my $fnm= shift(@ARGV);
my $fnm_out= 'test.xml';
my $record_type="Authority";
local *FO_xml;
if (defined ($fnm_out))
{
open (FO_xml, '>:utf8', $fnm_out) or die;
}
proc_marcbin($fnm);
exit(0);
sub proc_marcbin
{
my $fnm= shift;
my $marcbin= Util::JSON::read_text($fnm);
my @records= split(/\x1d/, $marcbin); # group separator
my $rec_num= 0;
if (defined ($fnm_out))
{
print FO_xml <<EOX;
<?xml version="1.0" encoding="UTF-8"?><collection xmlns="http://www.loc.gov/MARC21/slim">
EOX
}
foreach my $record (@records)
{
$rec_num++;
# print "record $rec_num\n";
my $record_xml= "<record type=\"$record_type\">\n";
# hex_dump ($record);
my @fields= split(/\x1e/, $record); # record separator
my $field_num= 0;
my @tags= ();
my ($tag, $x1, $x2, $x3);
FIELD: foreach my $field (@fields)
{
if ($field_num++ == 0)
{
if ($field =~ m#(........................)(.*)#)
{
my ($leader, $trailer)= ($1, $2);
print __LINE__, " leader=[$leader] trailer=[$trailer]\n";
$trailer =~ s#(...)#push(@tags, $1); ''#ge;
print __LINE__, " leader=[$leader] trailer=[$trailer]\n";
print __LINE__, " tags=[", join(' ', @tags), "]\n";
print __LINE__, " tag_count=[", scalar(@tags), "]\n";
$record_xml .= " <leader>$leader</leader>\n";
}
next FIELD;
}
($tag, $x1, $x2, $x3)= splice(@tags, 0, 4);
if ($tag+0 <= 9)
{
# hex_dump ($field);
# print __LINE__, " controlfield: tag=[$tag] x1=[$x1] x2=[$x2] x3=[$x3]\n";
$record_xml .= " <controlfield tag=\"$tag\" x1=\"$x1\" x2=\"$x2\" x3=\"$x3\">" . $field . "</controlfield>\n";
next FIELD;
}
# print "field $field_num\n";
my @items= split(/\x1f/, $field); # unit separator
my $item_num= 0;
ITEM: foreach my $item (@items)
{
$item_num++;
if ($item_num == 1)
{
if ($item =~ m#^(.)(.)$#)
{
my ($ind1, $ind2)= ($1, $2);
# print "tag=[$tag] ind1=[$ind1] ind2=[$ind2] x1=[$x1] x2=[$x2] x3=[$x3]\n";
$record_xml .= " <datafield tag=\"$tag\" ind1=\"$ind1\" ind2=\"$ind2\" x1=\"$x1\" x2=\"$x2\" x3=\"$x3\">\n";
next ITEM;
}
}
# print __LINE__, " item=[$item]\n";
if ($item=~ m#(.)(.*)#)
{
my ($code, $text)= ($1, $2);
$record_xml .= " <subfield code=\"$code\">$text</subfield>\n";
}
}
$record_xml .= " </datafield>\n";
}
print __LINE__, " remaining tags=[", join(' ', @tags), "]\n";
$record_xml .= "</record>\n";
if (defined ($fnm_out))
{
print FO_xml $record_xml;
}
else
{
print "record as xml: [\n", $record_xml, "]\n";
}
}
if (defined ($fnm_out))
{
print FO_xml <<EOX;
</collection>
EOX
}
}
# ----------------------------------------------------------------------------
sub hex_dump
{
my $data= shift;
local *FX= shift || *STDOUT;
my $off= 0;
my ($i, $c, $v);
my $run= 1;
DATA: while ($run)
{
my $char= '';
my $hex= '';
my $offx= sprintf ('%08X', $off);
for ($i= 0; $i < 16; $i++)
{
$c= substr ($data, $off+$i, 1);
if ($i == 8)
{
$hex .= ' ';
}
if ($c ne '')
{
# $data= substr ($data, 1);
$v= unpack ('C', $c);
$c= '.' if ($v < 0x20 || $v >= 0x7F);
$char .= $c;
$hex .= sprintf (' %02X', $v);
}
else
{
$char .= ' ';
$hex .= ' ';
$run= 0;
}
}
print FX "$offx $hex |$char|\n";
$off += 0x10;
}
}

What's the advantage of MARC in binary format vs. xml format, except that it's hard to read? Ok, space can be an issue, e.g. when processing GND dumps that can be found on dnb.de.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment