package MARC::File::HUNMARC; | |
=head1 NAME | |
MARC::File::HUNMARC - HUNMARC-specific file handling | |
=cut | |
use strict; | |
use integer; | |
use vars qw( $ERROR ); | |
use MARC::File::Encode qw( marc_to_utf8 ); | |
use MARC::File; | |
use vars qw( @ISA ); @ISA = qw( MARC::File ); | |
use MARC::Record qw( LEADER_LEN ); | |
use MARC::Field; | |
use constant SUBFIELD_INDICATOR => "\x1F"; | |
use constant END_OF_FIELD => "\n"; #"\x1E"; | |
use constant END_OF_RECORD => "\n\n"; #"\x1D"; | |
use constant DIRECTORY_ENTRY_LEN => 12; | |
=head1 SYNOPSIS | |
use MARC::File::HUNMARC; | |
my $file = MARC::File::HUNMARC->in( $filename ); | |
while ( my $marc = $file->next() ) { | |
# Do something | |
} | |
$file->close(); | |
undef $file; | |
=head1 EXPORT | |
None. | |
=head1 METHODS | |
=cut | |
sub _next { | |
my $self = shift; | |
my $fh = $self->{fh}; | |
my $reclen; | |
return if eof($fh); | |
my $usmarc; | |
my $line; | |
while ($line = <$fh>) | |
{ | |
$line =~ s/[\r\n]+$//; | |
last if (!$line); | |
$usmarc .= $line.END_OF_FIELD; | |
} | |
# remove illegal garbage that sometimes occurs between records | |
$usmarc =~ s/^[ \x00\x1a]+//; | |
return $usmarc; | |
} | |
=head2 decode( $string [, \&filter_func ] ) | |
Constructor for handling data from a HUNMARC file. This function takes care of | |
all the tag directory parsing & mangling. | |
Any warnings or coercions can be checked in the C<warnings()> function. | |
The C<$filter_func> is an optional reference to a user-supplied function | |
that determines on a tag-by-tag basis if you want the tag passed to it | |
to be put into the MARC record. The function is passed the tag number | |
and the raw tag data, and must return a boolean. The return of a true | |
value tells MARC::File::HUNMARC::decode that the tag should get put into | |
the resulting MARC record. | |
For example, if you only want title and subject tags in your MARC record, | |
try this: | |
sub filter { | |
my ($tagno,$tagdata) = @_; | |
return ($tagno == 245) || ($tagno >= 600 && $tagno <= 699); | |
} | |
my $marc = MARC::File::HUNMARC->decode( $string, \&filter ); | |
Why would you want to do such a thing? The big reason is that creating | |
fields is processor-intensive, and if your program is doing read-only | |
data analysis and needs to be as fast as possible, you can save time by | |
not creating fields that you'll be ignoring anyway. | |
Another possible use is if you're only interested in printing certain | |
tags from the record, then you can filter them when you read from disc | |
and not have to delete unwanted tags yourself. | |
=cut | |
sub decode { | |
my $text; | |
my $location = ''; | |
## decode can be called in a variety of ways | |
## $object->decode( $string ) | |
## MARC::File::HUNMARC->decode( $string ) | |
## MARC::File::HUNMARC::decode( $string ) | |
## this bit of code covers all three | |
my $self = shift; | |
if ( ref($self) =~ /^MARC::File/ ) { | |
$location = 'in record '.$self->{recnum}; | |
$text = shift; | |
} else { | |
$location = 'in record 1'; | |
$text = $self=~/MARC::File/ ? shift : $self; | |
} | |
my $filter_func = shift; | |
# ok this the empty shell we will fill | |
my $marc = MARC::Record->new(); | |
# HUNMARC specific | |
$text = substr($text, 4) if ($text =~ /^000 \d{5}/); | |
# Check for an all-numeric record length | |
($text =~ /^(\d{5})/) | |
or return $marc->_warn( "Record length \"", substr( $text, 0, 5 ), "\" is not numeric $location" ); | |
$marc->leader( substr( $text, 0, LEADER_LEN ) ); | |
# go through all the fields | |
my @lines = split END_OF_FIELD, $text; | |
shift @lines; | |
foreach my $line (@lines) | |
{ | |
# HUNMARC specific | |
if ($line =~ m/^(00\d) (.*)$/) | |
{ | |
my $field = MARC::Field->new($1, $2); | |
$field->{_data} = $2; # strange but necessary | |
$marc->append_fields($field); | |
next; | |
} | |
$line =~ m/^(\d\d\d) (.)(.) .(.*)$/; | |
my ($tagno, $ind1, $ind2, $tagdata) = ($1, $2, $3, $4); | |
# if utf8 the we encode the string as utf8 | |
if ( $marc->encoding() eq 'UTF-8' ) { | |
$tagdata = marc_to_utf8( $tagdata ); | |
} | |
if ( $filter_func ) { | |
next unless $filter_func->( $tagno, $tagdata ); | |
} | |
if ( MARC::Field->is_controlfield_tag($tagno) ) { | |
$marc->append_fields( MARC::Field->new( $tagno, $tagdata ) ); | |
} else { | |
my @subfields = split( SUBFIELD_INDICATOR, $tagdata ); | |
# Split the subfield data into subfield name and data pairs | |
my @subfield_data; | |
for ( @subfields ) { | |
if ( length > 0 ) { | |
push( @subfield_data, substr($_,0,1),substr($_,1) ); | |
} else { | |
$marc->_warn( "Entirely empty subfield found in tag $tagno" ); | |
} | |
} | |
if ( !@subfield_data ) { | |
$marc->_warn( "no subfield data found $location for tag $tagno" ); | |
next; | |
} | |
my $field = MARC::Field->new($tagno, $ind1, $ind2, @subfield_data ); | |
if ( $field->warnings() ) { | |
$marc->_warn( $field->warnings() ); | |
} | |
$marc->append_fields( $field ); | |
} | |
} # looping through all the fields | |
return $marc; | |
} | |
=head2 update_leader() | |
If any changes get made to the MARC record, the first 5 bytes of the | |
leader (the length) will be invalid. This function updates the | |
leader with the correct length of the record as it would be if | |
written out to a file. | |
=cut | |
sub update_leader() { | |
my $self = shift; | |
my (undef,undef,$reclen,$baseaddress) = $self->_build_tag_directory(); | |
$self->_set_leader_lengths( $reclen, $baseaddress ); | |
} | |
=head2 _build_tag_directory() | |
Function for internal use only: Builds the tag directory that gets | |
put in front of the data in a MARC record. | |
Returns two array references, and two lengths: The tag directory, and the data fields themselves, | |
the length of all data (including the Leader that we expect will be added), | |
and the size of the Leader and tag directory. | |
=cut | |
sub _build_tag_directory { | |
my $marc = shift; | |
$marc = shift if (ref($marc)||$marc) =~ /^MARC::File/; | |
die "Wanted a MARC::Record but got a ", ref($marc) unless ref($marc) eq "MARC::Record"; | |
my @fields; | |
my @directory; | |
my $dataend = 0; | |
for my $field ( $marc->fields() ) { | |
# Dump data into proper format | |
my $str = $field->as_usmarc; | |
push( @fields, $str ); | |
# Create directory entry | |
my $len = bytes::length( $str ); | |
my $direntry = sprintf( "%03s%04d%05d", $field->tag, $len, $dataend ); | |
push( @directory, $direntry ); | |
$dataend += $len; | |
} | |
my $baseaddress = | |
LEADER_LEN + # better be 24 | |
( @directory * DIRECTORY_ENTRY_LEN ) + | |
# all the directory entries | |
1; # end-of-field marker | |
my $total = | |
$baseaddress + # stuff before first field | |
$dataend + # Length of the fields | |
1; # End-of-record marker | |
return (\@fields, \@directory, $total, $baseaddress); | |
} | |
=head2 encode() | |
Returns a string of characters suitable for writing out to a HUNMARC file, | |
including the leader, directory and all the fields. | |
=cut | |
sub encode() { | |
my $marc = shift; | |
$marc = shift if (ref($marc)||$marc) =~ /^MARC::File/; | |
my ($fields,$directory,$reclen,$baseaddress) = _build_tag_directory($marc); | |
$marc->set_leader_lengths( $reclen, $baseaddress ); | |
# Glomp it all together | |
return join("",$marc->leader, @$directory, END_OF_FIELD, @$fields, END_OF_RECORD); | |
} | |
1; | |
__END__ | |
=head1 RELATED MODULES | |
L<MARC::Record> | |
=head1 TODO | |
Make some sort of autodispatch so that you don't have to explicitly | |
specify the MARC::File::X subclass, sort of like how DBI knows to | |
use DBD::Oracle or DBD::Mysql. | |
Create a toggle-able option to check inside the field data for | |
end of field characters. Presumably it would be good to have | |
it turned on all the time, but it's nice to be able to opt out | |
if you don't want to take the performance hit. | |
=head1 LICENSE | |
This code may be distributed under the same terms as Perl itself. | |
Please note that these modules are not products of or supported by the | |
employers of the various contributors to the code. | |
=head1 AUTHOR | |
After the original USMARC.pm by Andy Lester, C<< <andy@petdance.com> >> | |
adapted to HUNMARC by David Takacs, C<< <takdavid@gmail.com> >> | |
=cut |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment